I have these CSV files that I have to upload to a remote database, and I've been using pyodbc with Python's csv library to do it. For some reason it's insanely slow (about 30 seconds per 100 rows), and some of the CSV files I have to upload have over 30k rows. I've tried using pandas as well, but there was no change in speed.
This is more or less my code; unnecessary parts have been omitted.
if len(sys.argv) == 1:
    print("This program needs an input state")
    exit()

state_code = str(sys.argv[1])
f = open(state_code + ".csv", "r")
reader = csv.reader(f, delimiter=',')

cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER=' + server + ';DATABASE=' + database + ';UID=' + username + ';PWD=' + password)

insert_query = '''INSERT INTO table (Zipcode, Pers_Property_Coverage, Deductible,
                  Liability, Average_Rate,
                  Highest_Rate, Lowest_Rate, CREATE_DATE, Active_Flag)
                  VALUES (?,?,?,?,?,?,?,?,?)'''

for row in reader:
    zipcode = row[0]
    if len(zipcode) == 4:
        zipcode = "0" + zipcode

    ppc = row[1][1:]
    ppc = ppc.replace(',', '')
    deductible = row[2][1:]
    deductible = deductible.replace(',', '')
    liability = row[3][1:]
    liability = liability.replace(',', '')
    average_rate = row[4][1:]
    average_rate = average_rate.replace(',', '')
    highest_rate = row[5][1:]
    highest_rate = highest_rate.replace(',', '')
    lowest_rate = row[6][1:]
    lowest_rate = lowest_rate.replace(',', '')

    ctr = ctr + 1
    if ctr % 100 == 0:
        print("Time Elapsed = ", round(time.time() - start_time), " seconds")

    values = (zipcode, ppc, deductible, liability, average_rate, highest_rate, lowest_rate, date, "Y")
    print("Inserting " + zipcode, ppc, deductible, liability, average_rate, highest_rate, lowest_rate, date, "Y")

    cursor.execute(insert_query, values)
    cnxn.commit()
Updating your code to use pyodbc executemany with option fast_executemany=True could be an easy way to save time:
https://github.com/mkleehammer/pyodbc/wiki/Cursor#executemanysql-params-with-fast_executemanytrue
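As a rough sketch of that change, reusing names from the question (the connection string, state_code, insert_query, and date are the values already shown above), the per-row execute/commit pair becomes a single batched call:

import csv
import pyodbc

cnxn = pyodbc.connect(connection_string)   # same DRIVER/SERVER/DATABASE/UID/PWD string as above
cursor = cnxn.cursor()
cursor.fast_executemany = True             # pyodbc sends the parameters as bulk arrays

rows = []
with open(state_code + ".csv", "r") as f:
    for row in csv.reader(f):
        zipcode = row[0].zfill(5)                                  # same zero-padding as the original loop
        cleaned = [col[1:].replace(',', '') for col in row[1:7]]   # same slice-and-strip cleanup as in the question
        rows.append((zipcode, *cleaned, date, "Y"))

cursor.executemany(insert_query, rows)     # one batched round trip instead of one per row
cnxn.commit()                              # commit once at the end

Committing once at the end instead of once per row is itself a noticeable speedup, independent of fast_executemany.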
Exploring bulk insert from your file could be another option, although it would most likely not use pyodbc or python:
https://learn.microsoft.com/en-us/sql/t-sql/statements/bulk-insert-transact-sql?view=sql-server-ver15
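That said, if the CSV can be placed somewhere the SQL Server instance itself can read (a local path on the server or a UNC share), the BULK INSERT statement can still be issued from Python over the existing pyodbc connection. This is only a sketch with a hypothetical server-side path, and note that it loads the file as-is, so the cleanup done in the Python loop (stripping the leading character and commas, padding zip codes) would have to happen beforehand or in a staging table:

# The path must be readable by the SQL Server service account, not by the Python client.
bulk_sql = r"""
    BULK INSERT table
    FROM 'C:\csv_drop\NY.csv'    -- hypothetical path on the database server
    WITH (FORMAT = 'CSV',        -- FORMAT = 'CSV' needs SQL Server 2017 or later
          FIRSTROW = 1,
          FIELDTERMINATOR = ',',
          ROWTERMINATOR = '\n');
"""
cursor.execute(bulk_sql)
cnxn.commit()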
Stack Overflow supports table markdown. For example, to display a table like this:
|N_NATIONKEY|N_NAME|N_REGIONKEY|
|---:|:---|---:|
|0|ALGERIA|0|
|1|ARGENTINA|1|
|2|BRAZIL|1|
|3|CANADA|1|
|4|EGYPT|4|
You can write code like this:
|N_NATIONKEY|N_NAME|N_REGIONKEY|
|---:|:---|---:|
|0|ALGERIA|0|
|1|ARGENTINA|1|
|2|BRAZIL|1|
|3|CANADA|1|
|4|EGYPT|4|
It would save a lot of time to generate the Stack Overflow table markdown automatically when running Snowflake queries.
The following stored procedure accepts either a query string or a query ID (it auto-detects which one it is) and returns the result set as Stack Overflow table markdown. It automatically aligns numbers and dates to the right and strings, arrays, and objects to the left; other types default to centered. It supports any query you can pass to it. It may be a good idea to use $$ to terminate the string passed into the procedure in case the SQL contains single quotes. You can create the procedure and test it using this script:
create or replace procedure MARKDOWN("queryOrQueryId" string)
returns string
language javascript
execute as caller
as
$$
    const MAX_ROWS = 50;   // Set the maximum row count to fetch. Tables in markdown larger than this become hard to read.
    var [rs, i, c, row, props] = [null, 0, 0, 0, {}];
    if (!queryOrQueryId || queryOrQueryId == 0) {
        queryOrQueryId = `select * from table(result_scan(last_query_id())) limit ${MAX_ROWS}`;
    }
    queryOrQueryId = queryOrQueryId.trim();
    if (isUUID(queryOrQueryId)) {
        rs = snowflake.execute({sqlText: `select * from table(result_scan('${queryOrQueryId}')) limit ${MAX_ROWS}`});
    } else {
        rs = snowflake.execute({sqlText: `${queryOrQueryId}`});
    }
    props.columnCount = rs.getColumnCount();
    for (i = 1; i <= props.columnCount; i++) {
        props["col" + i + "Name"] = rs.getColumnName(i);
        props["col" + i + "Type"] = rs.getColumnType(i);
    }
    var table = getHeader(props);
    while (rs.next()) {
        row = "|";
        for (c = 1; c <= props.columnCount; c++) {
            row += escapeMarkup(rs.getColumnValueAsString(c)) + "|";
        }
        table += "\n" + row;
    }
    return table;
    //------ End main function. Start of helper functions.
    function escapeMarkup(s) {
        s = s.replace(/\\/g, "\\\\");
        s = s.replaceAll('|', '\\|');
        s = s.replace(/\s+/g, " ");
        return s;
    }
    function getHeader(props) {
        s = "|";
        for (var i = 1; i <= props.columnCount; i++) {
            s += props["col" + i + "Name"] + "|";
        }
        s += "\n";
        for (var i = 1; i <= props.columnCount; i++) {
            switch (props["col" + i + "Type"]) {
                case 'number':
                    s += '|---:';
                    break;
                case 'string':
                    s += '|:---';
                    break;
                case 'date':
                    s += '|---:';
                    break;
                case 'json':
                    s += '|:---';
                    break;
                default:
                    s += '|:---:';
            }
        }
        return s + "|";
    }
    function isUUID(str) {
        const regexExp = /^[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12}$/gi;
        return regexExp.test(str);
    }
$$;
-- Usage type 1, a simple query:
call markdown($$ select * from SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.NATION limit 5 $$);

-- Usage type 2, a query ID:
select * from SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.NATION limit 5;
set quid = (select last_query_id());
call markdown($quid);
Edit: Based on Fieldy's helpful feedback, I modified the procedure code to allow passing null, 0, or a blank string '' as the parameter; this uses the last query ID and is a helpful shortcut. I also added a constant to the code that limits the output to a set number of rows. The limit is applied when using query IDs (or when sending null, '', or 0, which uses the last query ID). It is not applied when the input parameter is the text of a query to run, to avoid syntax errors if the query already has its own limit.
Greg Pavlik's JavaScript stored procedure solution made me wonder whether this would be any easier with the new Python language support in stored procedures, which is currently a public-preview feature.
The Snowpark Python API can return a result as a pandas DataFrame, and pandas can render a DataFrame as Markdown via the tabulate package. Here's the stored procedure.
CREATE OR REPLACE PROCEDURE markdown_table(query_id VARCHAR)
RETURNS VARCHAR
LANGUAGE PYTHON
RUNTIME_VERSION = '3.8'
PACKAGES = ('snowflake-snowpark-python', 'pandas', 'tabulate', 'regex')
HANDLER = 'markdown_table'
EXECUTE AS CALLER
AS $$
import pandas as pd
import tabulate
import regex

def markdown_table(session, queryOrQueryId=None):
    # Validate UUID
    if queryOrQueryId is None:
        pandas_result = session.sql("""Select * from table(result_scan(last_query_id()))""").to_pandas()
    elif bool(regex.match("^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", queryOrQueryId)):
        pandas_result = session.sql(f"""select * from table(result_scan('{queryOrQueryId}'))""").to_pandas()
    else:
        pandas_result = session.sql(queryOrQueryId).to_pandas()
    return pandas_result.to_markdown()
$$;
Which you can use as follows:
-- Usage type 1, use the result from the query run immediately preceding the stored procedure call
select * from SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.NATION limit 5;
call markdown_table(NULL);
-- Usage type 2, pass in a query_id
select * from SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.NATION limit 5;
set quid = (select last_query_id());
select $quid;
call markdown_table($quid);
-- Usage type 3, provide a query string to the stored procedure call
call markdown_table('select * from SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.NATION limit 5');
The table markdown can also be written in a simpler form, without the outer pipes or alignment colons:
N_NATIONKEY|N_NAME|N_REGIONKEY
--|--|--
0|ALGERIA|0
1|ARGENTINA|1
2|BRAZIL|1
3|CANADA|1
4|EGYPT|4
which still renders as a table, so it can be a simpler solution:
|N_NATIONKEY|N_NAME|N_REGIONKEY|
|---|---|---|
|0|ALGERIA|0|
|1|ARGENTINA|1|
|2|BRAZIL|1|
|3|CANADA|1|
|4|EGYPT|4|
I grab the result table, open it in Notepad++, replace each tab (\t) with a pipe (|), and then insert the header separator line by hand. I sometimes replace empty null results with the text null so the results make more sense. The form with the leading and trailing pipes gets around the need for that.
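That manual find-and-replace could also be scripted. A minimal sketch, assuming the result grid has been copied as tab-separated text into a hypothetical file results.tsv:

# Convert tab-separated query output into Stack Overflow table markdown.
with open("results.tsv") as f:                       # hypothetical file holding the copied grid
    lines = [line.rstrip("\n").split("\t") for line in f]

header, rows = lines[0], lines[1:]
out = ["|" + "|".join(header) + "|",
       "|" + "|".join("---" for _ in header) + "|"]  # header separator line
out += ["|" + "|".join(cell if cell else "null" for cell in row) + "|" for row in rows]
print("\n".join(out))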
DBeaver IDE supports "data export as markdown" and "advanced copy as markdown" out-of-the-box:
Output:
|R_REGIONKEY|R_NAME|R_COMMENT|
|-----------|------|---------|
|0|AFRICA|lar deposits. blithely final packages cajole. regular waters are final requests. regular accounts are according to |
|1|AMERICA|hs use ironic, even requests. s|
|2|ASIA|ges. thinly even pinto beans ca|
|3|EUROPE|ly final courts cajole furiously final excuse|
|4|MIDDLE EAST|uickly special accounts cajole carefully blithely close requests. carefully final asymptotes haggle furiousl|
It is rendered as:
|R_REGIONKEY|R_NAME|R_COMMENT|
|---|---|---|
|0|AFRICA|lar deposits. blithely final packages cajole. regular waters are final requests. regular accounts are according to |
|1|AMERICA|hs use ironic, even requests. s|
|2|ASIA|ges. thinly even pinto beans ca|
|3|EUROPE|ly final courts cajole furiously final excuse|
|4|MIDDLE EAST|uickly special accounts cajole carefully blithely close requests. carefully final asymptotes haggle furiousl|
The code below connects to an MS SQL database using pymssql. I have a CSV file and I am trying to push all of its rows into a single table in that database, but I get a KeyError when I execute the code after opening the CSV file.
import csv
import pymssql

conn = pymssql.connect(host="host name",
                       database="dbname",
                       user="username",
                       password="password")
cursor = conn.cursor()

if(conn):
    print("True")
else:
    print("False")

with open('path to csv file', 'r') as f:
    reader = csv.reader(f)
    columns = next(reader)
    query = "INSERT INTO Marketing({'URL', 'Domain_name', 'Downloadables', 'Text_without_javascript', 'Downloadable_Link'}) VALUES ({%s,%s,%s,%s,%s})"
    query = query.format(','.join('[' + x + ']' for x in columns), ','.join('?' * len(columns)))
    cursor = conn.cursor()
    for data in reader:
        cursor.execute(query, tuple(data))
    cursor.commit()
The below is the error that I get:
KeyError: "'URL', 'Domain_name', 'Downloadables', 'Text_without_javascript', 'Downloadable_Link'"
Using pandas to_sql:
import pandas as pd
from sqlalchemy import create_engine

file_path = "path to csv"
engine = create_engine("mssql://user:password#host/database")
df = pd.read_csv(file_path, encoding='latin')
df.to_sql(name='Marketing', con=engine, if_exists='append')
Output:
InterfaceError: (pyodbc.InterfaceError) ('IM002', '[IM002] [Microsoft][ODBC Driver Manager] Data source name not found and no default driver specified (0) (SQLDriverConnect)')
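For what it's worth, that InterfaceError usually means SQLAlchemy fell back to pyodbc without being told which ODBC driver to use. A connection URL that names an installed driver explicitly (hypothetical credentials shown) is the usual fix for that particular error:

from sqlalchemy import create_engine
import pandas as pd

# 'user', 'password', 'host', and 'database' are placeholders; the driver name must
# match an ODBC driver actually installed on the machine.
engine = create_engine(
    "mssql+pyodbc://user:password@host/database?driver=ODBC+Driver+17+for+SQL+Server"
)
df = pd.read_csv("path to csv", encoding="latin")
df.to_sql(name="Marketing", con=engine, if_exists="append", index=False)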
I tried everything, from converting the parameters being passed into a tuple to passing them as-is, but nothing helped. (The KeyError itself comes from calling .format() on a query string that already contains literal { } braces, so Python treats the quoted column list as a format field name.) Below is the code that finally fixed the issue for me:
with open('path to csv file', 'r') as f:
    for row in f:
        reader = csv.reader(f)
        # print(reader)
        columns = next(reader)
        # print(columns)
        cursor = conn.cursor()
        for data in reader:
            # print(data)
            data = tuple(data)
            # print(data)
            query = ("INSERT INTO Marketing(URL, Domain_name, Downloadables, Text_without_javascript, Downloadable_Link) VALUES (%s,%s,%s,%s,%s)")
            parameters = data
            # query = query.format(','.join('?' * len(columns)))
            cursor.execute(query, parameters)
    conn.commit()
Note: the code for connecting to the database remains the same as in the question.
I was tasked with developing a tool that will accept a few parameters and then query 2 databases based on a list of tables.
There are 3 possible database options, a connection to Netezza, a connection to Oracle, or a connection to a DB2 Mainframe. In theory they will pass me the type of connection, hostname, port, database name, username, and password.
The query will take a table from the list, query both databases and compare the data in the table across the 2 DBs.
For the connection to Netezza I am using pyodbc, for the connection to Oracle I am using cx_Oracle, and for the connection to DB2 I am using ibm_db.
At the moment I am able to make a connection to each, and I can return the column metadata of the table in each DB as well as a result set from each.
There are a few things I am trying to accomplish.
If a column is of a numeric data type (e.g. decimal, integer) I want to sum all the values for that column in the table; if it is of any other data type (e.g. string, date) I want to do a count().
I would like to do this for the table in both DBs, then compare the column counts/totals and display the comparison in Excel.
Finally, I would like to do a column-by-column comparison of every row in the table in both DBs. If there are any differences in the field values for a row, the entire row will be displayed in an Excel spreadsheet.
What I am wondering is whether there are any packages in Python that I can use to perform these table-like operations.
Please see the code below for what I have so far.
import pyodbc
import ibm_db
import cx_Oracle
import collections


class DatabaseConnection(object):
    def __init__(self, connection_type, hostname_or_ip, port, database_or_sid, username, password):
        self.port = port
        self.connection_type = connection_type
        self.hostname_or_ip = hostname_or_ip
        self.database_or_sid = database_or_sid
        self.username = username
        self.password = password
        self.dsn = "GEMPROD"
        self.connection_string = ""
        self.conn = ""

    def __enter__(self):
        if self.connection_type == "Netezza":
            self.connection_string = "DRIVER={NetezzaSQL};SERVER=" + self.hostname_or_ip + ";PORT=" + self.port + \
                                     ";DATABASE=" + self.database_or_sid + ";UID=" + self.username + ";PWD=" + self.password
            self.conn = pyodbc.connect(self.connection_string)
            return self.conn
        elif self.connection_type == "Oracle":
            dsn_tns = cx_Oracle.makedsn(self.hostname_or_ip, self.port, self.database_or_sid)
            self.conn = cx_Oracle.connect(user=self.username, password=self.password, dsn=dsn_tns)
            return self.conn
        elif self.connection_type == "DB2":
            self.connection_string = "Database=" + self.database_or_sid + ";HOSTNAME=" + self.hostname_or_ip + \
                                     ";PORT=" + self.port + ";PROTOCOL=TCPIP;UID=" + self.username + ";PWD=" + \
                                     self.password + ";"
            # self.conn = ibm_db.connect(self.connection_string, "", "")
            self.conn = ibm_db.connect('DSN=' + self.dsn, self.username, self.password)
            return self.conn
        pass

    def __exit__(self, type, value, traceback):
        if self.connection_type == "Netezza":
            self.conn.close()
        elif self.connection_type == "DB2":
            ibm_db.close(self.conn)
        elif self.connection_type == "Oracle":
            self.conn.close()
        pass

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, self.dsn)

    def query(self, query, params):
        pass


# database_column_metadata = collections.namedtuple('DatabaseColumnMetadata', 'index column_name data_type')
# database_field = collections.namedtuple('', '')

table_list = ['BNR_CIF_25DAY_RPT', table2]
sort_column = None

with DatabaseConnection('Netezza', ip, port, database, username, pwd) as connection_one:
    print('Netezza Query:')
    for table in table_list:
        cursor = connection_one.cursor()
        netezza_rows = cursor.execute("SELECT * FROM BNR_CIF_25DAY_RPT LIMIT 1")
        column_list = netezza_rows.description
        sort_column = str(column_list[0][0])
        netezza_query = "SELECT * FROM BNR_CIF_25DAY_RPT ORDER BY " + sort_column + " ASC LIMIT 10"
        netezza_rows = cursor.execute(netezza_query)
        print(column_list)

        netezza_column_list = []
        for idx, column in enumerate(column_list):
            column_name, data_type, *rest = column
            netezza_column_list.append((idx, column_name, data_type))

        for row in netezza_rows:
            print(row, end='\n')
        for tup in netezza_column_list:
            print(tup, end='\n')
        print('Netezza row count:', str(netezza_rows.rowcount) + '\n')
        cursor.close()

with DatabaseConnection('Oracle', hostname, port, SID, username, pwd) as connection_two:
    print('Oracle Query:')
    for table in table_list:
        try:
            cursor = connection_two.cursor()
            oracle_rows = cursor.execute("SELECT * FROM BNR_CIF_25DAY_RPT WHERE ROWNUM <= 1")
            column_list = oracle_rows.description
            sort_column = column_list[0][0]
            oracle_query = "SELECT * FROM (SELECT * FROM BNR_CIF_25DAY_RPT ORDER BY " + sort_column + " ASC) WHERE ROWNUM <= 10"
            oracle_rows = cursor.execute(oracle_query)
            print(column_list)

            oracle_column_list = []
            for idx, column in enumerate(column_list):
                column_name, data_type, *rest = column
                oracle_column_list.append((idx, column_name, data_type))

            for row in oracle_rows:
                print(row, end='\n')
            for tup in oracle_column_list:
                print(tup, end='\n')
            print('Oracle row count:', str(oracle_rows.rowcount) + '\n')
        except cx_Oracle.DatabaseError as e:
            print(str(e))
        finally:
            cursor.close()
Apologies for anything that didn't make sense and for the poor code; I am new to Python and the program is still in its infancy.
This is not exactly a Python-based solution, but we used to do this in our shop to compare Netezza and Oracle using IBM Fluid Query.
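If you do want to stay in Python, the per-column sum/count comparison described in the question maps fairly naturally onto pandas. This is only a rough sketch under the assumption that both result sets fit in memory; df_netezza and df_oracle are hypothetical DataFrames built from the two cursors (e.g. pd.DataFrame(cursor.fetchall(), columns=[d[0] for d in cursor.description])):

import pandas as pd

def column_summary(df):
    """Sum numeric columns, count everything else, one value per column."""
    summary = {}
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            summary[col] = df[col].sum()
        else:
            summary[col] = df[col].count()
    return pd.Series(summary)

# Column-level comparison of totals/counts between the two databases.
comparison = pd.DataFrame({
    "netezza": column_summary(df_netezza),
    "oracle": column_summary(df_oracle),
})
comparison["match"] = comparison["netezza"] == comparison["oracle"]

# Row-level differences: rows that do not appear in both frames
# (assumes matching column names/order and no duplicate rows within each frame).
diffs = pd.concat([df_netezza, df_oracle]).drop_duplicates(keep=False)

# Both results can be written to Excel for review (requires openpyxl or xlsxwriter).
with pd.ExcelWriter("comparison.xlsx") as writer:
    comparison.to_excel(writer, sheet_name="column_summary")
    diffs.to_excel(writer, sheet_name="row_differences")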
Quite new to coding, I am stuck with this code. It prints out the counts in the command terminal but doesn't create the database file in SQLite. The SQLite version I am using is 3.10.1.
import sqlite3

conn = sqlite3.connect('mydb.sqlite')
cur = conn.cursor()

cur.execute('''
DROP TABLE IF EXISTS Counts''')

cur.execute('''CREATE TABLE Counts (org TEXT, count INTEGER)''')

fname = raw_input('Enter file name: ')
if ( len(fname) < 1 ) : fname = 'abc.txt'
fh = open(fname)
for line in fh:
    if not (line.startswith('From: ')) : continue
    pieces = line.split()
    org = pieces[1].split('#')[1]
    print org
    cur.execute('''SELECT count FROM Counts WHERE org = ? ''', (org, ))
    row = cur.fetchone()
    if row is None:
        cur.execute('''INSERT INTO Counts (org, count) VALUES ( ?, 1 )''', ( org, ) )
    else :
        cur.execute('''UPDATE Counts SET count=count+1 WHERE org = ?''',
                    (org, ))
    # This statement commits outstanding changes to disk each
    # time through the loop - the program can be made faster
    # by moving the commit so it runs only after the loop completes
    conn.commit()

sqlstr = '''SELECT org, count FROM Counts ORDER BY count DESC LIMIT 10'''

print "Counts:"
for row in cur.execute(sqlstr) :
    print str(row[0]), row[1]

cur.close()
I have a large SQLite database (~134 GB) that has multiple tables, each with 14 columns, about 330 million records, and 4 indexes. The only operation used on the database is SELECT *, as I need all the columns (no inserts or updates). When I query the database, the response time is slow when the result set is big (it takes 160 seconds to get ~18,000 records).
I have improved the use of indexes multiple times and this is the fastest response time I have got.
I am running the database as the back-end database for a web application on a server with 32 GB of RAM.
Is there a way to use RAM (or anything else) to speed up the query process?
Here is the code that performs the query.
async.each(proteins, function (item, callback) {
    PI[item] = [];  // Stores interaction proteins for all query proteins
    PS[item] = [];  // Stores scores for all interaction proteins
    PIS[item] = []; // Stores interaction sites for all interaction proteins
    var sites = {}; // a temporary holder for interaction sites

    var query_string = 'SELECT * FROM ' + organism + PIPE_output_table +
        ' WHERE ' + score_type + ' > ' + cutoff['range'] + ' AND (protein_A = "' + item + '" OR protein_B = "' + item + '") ORDER BY PIPE_score DESC';

    db.each(query_string, function (err, row) {
        if (row.protein_A == item) {
            PI[item].push(row.protein_B);
            // add 1 to interaction sites to represent sites starting from 1 not from 0
            sites['S1AS'] = row.site1_A_start + 1;
            sites['S1AE'] = row.site1_A_end + 1;
            sites['S1BS'] = row.site1_B_start + 1;
            sites['S1BE'] = row.site1_B_end + 1;
            sites['S2AS'] = row.site2_A_start + 1;
            sites['S2AE'] = row.site2_A_end + 1;
            sites['S2BS'] = row.site2_B_start + 1;
            sites['S2BE'] = row.site2_B_end + 1;
            sites['S3AS'] = row.site3_A_start + 1;
            sites['S3AE'] = row.site3_A_end + 1;
            sites['S3BS'] = row.site3_B_start + 1;
            sites['S3BE'] = row.site3_B_end + 1;
            PIS[item].push(sites);
            sites = {};
        }
    }
The query you posted uses no variables.
Because no values are ever bound into it, it will always return the same thing, and you then have to re-filter the rows in JavaScript, fetching far more rows than you need.
Here's why...
If I'm understanding this query correctly, you have WHERE Score > [Score]. I've never encountered this syntax before, so I looked it up.
[keyword] A keyword enclosed in square brackets is an identifier. This is not standard SQL. This quoting mechanism is used by MS Access and SQL Server and is included in SQLite for compatibility.
An identifier is something like a column or table name, not a variable.
This means that this...
SELECT * FROM [TABLE]
WHERE Score > [Score] AND
(protein_A = [Protein] OR protein_B = [Protein])
ORDER BY [Score] DESC;
Is the same as this...
SELECT * FROM `TABLE`
WHERE Score > Score AND
(protein_A = Protein OR protein_B = Protein)
ORDER BY Score DESC;
You never pass any variables to the query. It will always return the same thing.
This can be seen here when you run it.
db.each(query_string, function (err, row) {
Since you're comparing each column to itself (or to something very like itself), the WHERE clause isn't filtering on your actual search values at all. That's why you have to filter all the rows again in JavaScript, and it is one of the reasons why your query is so slow.
if (row.protein_A == item) {
BUT! WHERE Score > [Score] can never be true: a value is never greater than itself, and when Score is NULL the comparison evaluates to NULL, which a WHERE clause also rejects. Three-valued logic is weird.
So, taken literally, that filter should not match any rows at all; whatever rows you are getting back, the query is not selecting what you intend, and you end up pulling far more data out of the database than you need and re-filtering it in JavaScript.
Your query should incorporate variables (I'm assuming you're using node-sqlite3) and pass in their values when you execute the query.
var query = " \
    SELECT * FROM `TABLE` \
    WHERE Score > $score AND \
          (protein_A = $protein OR protein_B = $protein) \
    ORDER BY Score DESC; \
";
var stmt = db.prepare(query);
stmt.each({$score: score, $protein: protein}, function (err, row) {
    PI[item].push(row.protein_B);
    ...
});