Detecting filesystem changes with Python watchdog and sending the file to SQL Server - sql-server

import os
import sys
import time

import pandas
import sqlalchemy
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer
class EventHandler(FileSystemEventHandler):
    """Watchdog handler that echoes every filesystem event to stdout."""

    def on_any_event(self, event):
        """Print a marker, the event type and the source path, then a blank line.

        Args:
            event: Event object handed over by the observer; only its
                ``event_type`` and ``src_path`` attributes are read.
        """
        print("EVENT")
        for attribute in ("event_type", "src_path"):
            print(getattr(event, attribute))
        print()
if __name__ == "__main__":
    # Directory to watch (placeholder value, to be replaced with a real path).
    path = 'input path here'
    event_handler = EventHandler()
    observer = Observer()
    observer.schedule(event_handler, path, recursive=True)
    print("Monitoring started")
    observer.start()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        # Ctrl+C ends the monitoring phase; the upload below runs afterwards.
        pass
    finally:
        # Always shut the observer thread down, whatever ended the loop.
        observer.stop()
        observer.join()

    # NOTE: the upload runs once, and only after monitoring has been
    # interrupted. Uploading on every change would have to be triggered from
    # EventHandler instead.
    # The file imports the modules as `sqlalchemy` and `pandas`, so the
    # functions are reached through those names.
    engine = sqlalchemy.create_engine('Database information')
    try:
        for file_name in os.listdir('.'):
            # splitext copes with names that contain no dot or several dots,
            # where a plain split('.') could not be unpacked into two values.
            file_basename, extension = os.path.splitext(file_name)
            if extension != '.xlsx':
                continue
            df = pandas.read_excel(os.path.abspath(file_name))
            # One table per workbook, named after the file; replaced if present.
            df.to_sql(file_basename, con=engine, if_exists='replace')
    finally:
        # Release the pooled database connections.
        engine.dispose()
So the first part, up to where observer.join() ends, I am okay with. But the next part, where it starts with engine = create_engine(""), is where I am having trouble: that part is supposed to send the file to the SQL Server, but the code is not doing that. I have researched the internet but haven't found anything. Any help is appreciated.

Related

Apache Flink - WordCount - NoResult - PyFlink

I have developed a Word Count program using PyFlink. The program is not throwing any error, yet it is not providing the desired output. According to the code, the program should create a new text file, but no file is generated at the time of execution. Kindly help; my code is attached below.
from flink.plan.Constants import WriteMode
from flink.plan.Environment import get_environment
from flink.functions.FlatMapFunction import FlatMapFunction
from flink.functions.GroupReduceFunction import GroupReduceFunction
from pyflink import datastream
from pyflink.common import WatermarkStrategy, Encoder, Types
from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode
from pyflink.datastream.connectors import (FileSource, StreamFormat, FileSink, OutputFileConfig, RollingPolicy)
class Tokenizer(FlatMapFunction):
    """Flat-map function that emits a ``(word, 1)`` pair per comma-separated token."""

    def flat_map(self, value, collector):
        """Split one input record on commas and collect the longer tokens.

        The base-class initializer is no longer called here: it already runs
        once when the instance is constructed, and re-running it for every
        record is not part of the flat-map contract.

        Args:
            value: Input record; it is lower-cased and split on ``","``.
            collector: Receives a ``(word, 1)`` tuple through ``collect`` for
                every token longer than one character.
        """
        for word in value.lower().split(","):
            if len(word) > 1:
                collector.collect((word, 1))
if __name__ == '__main__':
    env = get_environment()
    env.set_parallelism(2)
    data = env.read_text("h.txt")
    tokenized = data.flat_map(Tokenizer())
    # Group on the word (field 0) and add up the counts (field 1).
    count = tokenized.group_by(0).sum(1)
    count.write_text("D:/Cyber Security/Apache Flink")
    # The calls above only describe the job; nothing is read or written until
    # the environment is told to execute it.
    # NOTE(review): `local=True` follows the legacy batch API that
    # get_environment() belongs to -- confirm against the installed Flink version.
    env.execute(local=True)
Try using env.execute("Word Count Example...") at the end of the program. It kicks off your execution.

How does flink recognize hiveConfDir when running in yarn cluster

I have the following code to test Flink and Hive integration. I submit the application via flink run -m yarn-cluster ..... The hiveConfDir is a local directory that resides on the machine from which I submit the application. How is Flink able to read this local directory when the main class is running in the cluster (yarn-cluster)? Thanks!
package org.example.app
import org.apache.flink.streaming.api.scala._
import org.apache.flink.table.api.bridge.scala._
import org.apache.flink.table.catalog.hive.HiveCatalog
import org.apache.flink.types.Row
object FlinkBatchHiveTableIntegrationTest {

  /** Registers a Hive catalog, queries `testdb.t1`, and prints the schema and the rows. */
  def main(args: Array[String]): Unit = {
    val streamEnv = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)
    val tableEnv = StreamTableEnvironment.create(streamEnv)

    val catalogName = "myHiveCatalog"
    // Hive configuration directory handed to the HiveCatalog constructor.
    val hiveCatalog = new HiveCatalog(catalogName, "default", "/apache-hive-2.3.7-bin/conf")
    tableEnv.registerCatalog(catalogName, hiveCatalog)
    tableEnv.useCatalog(catalogName)

    // The literal is kept flush-left on purpose so the query text is unchanged.
    val query =
      """
select * from testdb.t1
""".stripMargin(' ')
    val result = tableEnv.sqlQuery(query)
    result.printSchema()
    result.toAppendStream[Row].print()
    streamEnv.execute("FlinkHiveIntegrationTest")
  }
}
It looks like I found the answer. The application is submitted with flink run -m yarn-cluster. This way, the main method of the application runs on the client side, where Hive is installed, so the Hive conf dir can be read.

Problem to properly connect external realm database file in android kotlin project

I want to connect an external realm database to my Android project. Realm is already set up in build.gradle. I copied test database file: "realmdata.realm" into "raw" folder in "res".
Running the project gives me the error:
Caused by: io.realm.exceptions.RealmFileException: Could not resolve the path to the asset file: realmdata.realm Kind: ACCESS_ERROR.
...
d.androidrealmtestapp.MainActivity.onCreate(MainActivity.kt:40)
...
which corresponds to code line:
realm = Realm.getInstance(c)
No matter if I change filename or position in "res" directory the output is the same. After printing RealmConfiguration the output is: "realmFileName : default.realm" Why "default.realm" since I gave the asset file name: "realmdata.realm"? What am I doing wrong? So my question is how to properly connect an external realm file to the project? I am a beginner in kotlin and realm.
import android.support.v7.app.AppCompatActivity
import android.os.Bundle
import android.support.v7.widget.LinearLayoutManager
import android.support.v7.widget.RecyclerView
import io.realm.Realm
import io.realm.RealmConfiguration
import io.realm.annotations.RealmModule
class MainActivity : AppCompatActivity() {
    private lateinit var mainRecycler: RecyclerView
    lateinit var text: String
    private lateinit var realm: Realm

    /**
     * Opens the bundled, read-only Realm file and sets up the main recycler view.
     */
    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
        setContentView(R.layout.activity_main)
        println("--------------------------------------------- ")
        print(application.assets.toString())
        Realm.init(this)
        // NOTE(review): assetFile() is expected to look the name up among the
        // app's assets (the "assets" folder), not under res/raw -- confirm the
        // file's location in the project.
        val c = RealmConfiguration.Builder()
            .assetFile("realmdata.realm")
            .modules(MyModule())
            .readOnly()
            .build()
        println("--------------------------------------------- ")
        println(" c configuration builder file:")
        println(c)
        println("--------------------------------------------- ")
        Realm.setDefaultConfiguration(c)
        realm = Realm.getInstance(c)
        // The configuration is read-only, so no write transaction is opened
        // around this diagnostic print.
        print ("realm ...")
        mainRecycler = findViewById(R.id.main_recycler)
        mainRecycler.layoutManager = LinearLayoutManager(this)
        mainRecycler.adapter = MainAdapter()
    }
}

// Realm module listing the model classes that the bundled file contains.
@RealmModule(classes = arrayOf(RealmModel::class))
private class MyModule {}
I copied test database file: "realmdata.realm" into "raw" folder in
"res"
You need to copy your database to assets folder
To create the assets folder, follow this.

Improve RESTful response time

I'm implementing a RESTful Service in Java EE with TomEE+(Apache CXF) which is using a database(specifically postgres). Now I've noticed that the longest time in my functions is spent in the getConnection() call for the database.
my code looks like this:
import java.sql.Connection;
import java.sql.SQLException;
import javax.annotation.Resource;
import javax.sql.DataSource;
import javax.ws.rs.Consumes;
import javax.ws.rs.Path;
import javax.ws.rs.GET;
import javax.ws.rs.Produces;
import javax.ws.rs.core.MediaType;
#Path("/test")
public class TestResource {
/* external DB resource
* configured in the resources.xml as "testDB". Either match the
* name or use the name parameter of the resource annotation.
*/
#Resource private DataSource testDB;
#Path("/hello")
#GET
#Produces("text/plain")
public String test() throws SQLException
{
Connection conn = testDB.getConnection(); //majority of time is spent in here
/*
do something.(e.g. PreparedStatement ps = conn.prepareStatement(...)
*/
conn.close();
return "world";
}
}
The database is defined in a resources.xml like so:
<?xml version="1.0" encoding="UTF-8"?>
<resources>
<Resource id="testDB" type="javax.sql.DataSource">
accessToUnderlyingConnectionAllowed = false
connectionProperties =
defaultAutoCommit = true
defaultReadOnly =
definition =
ignoreDefaultValues = false
initialSize = 0
jdbcDriver = org.postgresql.Driver
jdbcUrl = jdbc:postgresql://localhost/testdb
jtaManaged = true
maxActive = 100
maxIdle = 20
maxOpenPreparedStatements = 0
maxWaitTime = 100 millisecond
minEvictableIdleTime = 30 minutes
minIdle = 0
numTestsPerEvictionRun = 3
password = password
passwordCipher = PlainText
poolPreparedStatements = false
serviceId =
testOnBorrow = true
testOnReturn = false
testWhileIdle = false
timeBetweenEvictionRuns = -1 millisecond
userName = user
validationQuery = SELECT 1;
removeAbandoned = true
removeAbandonedTimeout = 60
logAbandoned = true
</Resource>
</resources>
So how could I reduce the time it takes to get a database connection? I'm already using the connection pooling mechanism. The only solution that comes to my mind is to make the resource class a singleton and get the connection once, but that seems counter-intuitive when many requests need to be worked on.
I would normally expect the interaction with the database to be the slowest part of the process. I would be interested in knowing how much overhead the database call is adding? How is that in relation to making the database call via a SQL tool outside of the Java code.
Also - I'm not sure if the example is just for brevity, but I would try to layer your solution so that the initial layer handles routing and validation, while the next layer handles any business logic and interacts with a database (DAO) layer.

Avoid opening browser on remote server during selenium call

I have written a Selenium application using WebDriver, and I wish to run it on a remote server. When I do that by logging into the server via PuTTY (along with Xming), Selenium opens the browser on the server and loads the pages through the external display. However, this takes much more time than it would if I were able to open the browser on my localhost only (and not on the server). Is it possible to do that, or is opening the browser on the server the only option (which is painfully slow)? Kindly tell me if I am missing something as well.
Thanks in advance.
Try using Selenium Grid, instead of Putty, to run your Selenium application on a remote server. The Selenium website has an excellent Quick Start guide for using the Selenium Grid: http://code.google.com/p/selenium/wiki/Grid2.
You can run Selenium with a"headless" driver, HtmlUnitDriver, that does not actually open a browser:
http://code.google.com/p/selenium/wiki/HtmlUnitDriver
Note: HtmlUnitDriver will accept an argument, so that it can emulate a specific driver.
@Lori
I implemented the code but it still tries opening it from putty so takes a lot of time to get the work done. The code is as follows: 'code'
import sys
from scrapy.spider import BaseSpider
from scrapy.http import FormRequest
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
from scrapy.http import Request
from selenium import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
class DmozSpider(BaseSpider):
    """Spider that logs in through a remote Selenium browser and saves search hits.

    While the class body runs, the company name is read from the first line of
    ``companyFilename`` and the search URL from the first line of that
    company's ``query.txt``.
    """
    name = "linkedin_crawler"
    #defence news
    # Both names are declared global, so the assignments below bind
    # module-level variables that parse() and page_parse() read later.
    global company
    global query
    companyFilename = '<filename>'
    with open(companyFilename, "r") as f:
        company = f.readline().strip()
    queryFilename = '/var/www/Symantec/recon/recon/' +company+ '/Spider/LinkedIn/query.txt'
    with open(queryFilename) as f:
        query = f.readline().strip()
    start_urls = ['https://www.linkedin.com/uas/login']

    def __init__(self):
        """Initialise the base spider and connect to the remote WebDriver hub."""
        BaseSpider.__init__(self)
        capabilities = webdriver.DesiredCapabilities()
        self.selenium = webdriver.Remote(command_executor = 'http://localhost:5000/wd/hub', desired_capabilities = capabilities.FIREFOX)

    def __del__(self):
        """Quit the browser session, if one was ever created."""
        # __init__ may have failed before the driver was assigned.
        driver = getattr(self, "selenium", None)
        if driver is not None:
            driver.quit()

    def parse(self, response):
        """Fill in and submit the login form, then request the search URL.

        Args:
            response: Response for the login page; only its ``url`` is used.

        Returns:
            A ``Request`` for ``query`` whose callback is ``page_parse``.
        """
        sel = self.selenium
        sel.get(response.url)
        elem1 = sel.find_element_by_name("session_key")
        elem2 = sel.find_element_by_name("session_password")
        elem1.send_keys("myemailid")
        elem2.send_keys("mypassword")
        elem2.send_keys(Keys.RETURN)
        return Request(query, callback=self.page_parse)

    def page_parse(self, response):
        """Walk up to ten result pages and append each hit's name and URL to a file.

        Args:
            response: Unused; the pages are loaded through Selenium instead.
        """
        sel = self.selenium
        sel.get(query)
        for _ in xrange(10):
            #for i in xrange(5):
            # Output path; it is empty in the posted code, so open() fails
            # until a real path is filled in.
            nameFilename = ''
            #print hxs
            # XPath attribute tests are written with "@".
            nlist = sel.find_elements_by_xpath('//ol[@class="search-results"]/li/div/h3/a')
            with open(nameFilename, "a") as fh:
                for link in nlist:
                    url = link.get_attribute("href").encode('utf-8')
                    name = link.text.encode('utf-8')
                    # One record per line: name, the "<next>" separator, URL.
                    fh.write(name)
                    fh.write("<next>")
                    fh.write(url)
                    fh.write('\n')
            next_links = sel.find_elements_by_xpath('//a[@class="page-link"]')
            if not next_links:
                # No pagination link found: there is nothing further to click.
                break
            next_links[0].click()
            # Give the next results page time to load.
            time.sleep(5)
To run this script on the server, I am using PuTTY to fire the command. But then it again uses Xming to open the browser, which makes the process slow again. So, how can I run the script without opening the browser on my local machine via Xming, so that this does not become the bottleneck? Thanks

Resources