I have a huge dataset in SQL Server and I want to connect to it from Python, then use PySpark to run queries.
I've seen the JDBC driver, but I can't find a way to make it work: I managed it with pyodbc, but not with Spark.
Any help would be appreciated.
Please use the following to connect to Microsoft SQL:
def connect_to_sql(
    spark, jdbc_hostname, jdbc_port, database, data_table, username, password
):
    # Microsoft's JDBC driver expects the database as a ;databaseName= property
    jdbc_url = "jdbc:sqlserver://{0}:{1};databaseName={2}".format(
        jdbc_hostname, jdbc_port, database
    )
    connection_details = {
        "user": username,
        "password": password,
        "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
    }
    df = spark.read.jdbc(url=jdbc_url, table=data_table, properties=connection_details)
    return df
spark is a SparkSession object, and the rest are pretty clear.
You can also pass pushdown queries to read.jdbc, as in the sketch below.
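For illustration, a minimal sketch of a pushdown query: the query is wrapped in parentheses and aliased so Spark treats it as a subquery. The table and column names here are hypothetical, not from the original post.

# Only the rows/columns selected by the subquery are fetched from SQL Server.
pushdown_query = "(SELECT col1, col2 FROM dbo.my_table WHERE col1 > 100) AS my_subquery"
df = spark.read.jdbc(url=jdbc_url, table=pushdown_query, properties=connection_details)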
I used pissall's function (connect_to_sql), but I modified it a little.
from pyspark.sql import SparkSession


def connect_to_sql(
    spark, jdbc_hostname, jdbc_port, database, data_table, username, password
):
    jdbc_url = "jdbc:mysql://{0}:{1}/{2}".format(jdbc_hostname, jdbc_port, database)
    connection_details = {
        "user": username,
        "password": password,
        "driver": "com.mysql.jdbc.Driver",
    }
    df = spark.read.jdbc(url=jdbc_url, table=data_table, properties=connection_details)
    return df


if __name__ == '__main__':
    spark = SparkSession \
        .builder \
        .appName('test') \
        .master('local[*]') \
        .enableHiveSupport() \
        .config("spark.driver.extraClassPath", <path to mysql-connector-java-5.1.49-bin.jar>) \
        .getOrCreate()

    df = connect_to_sql(spark, 'localhost', <port>, <database_name>, <table_name>, <user>, <password>)
Or you can use the SparkSession.read method:
df = spark.read.format("jdbc") \
    .option("url", "jdbc:mysql://localhost/<database_name>") \
    .option("driver", "com.mysql.jdbc.Driver") \
    .option("dbtable", <table_name>) \
    .option("user", <user>) \
    .option("password", <password>) \
    .load()
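Since the original question mentions a huge dataset, it may also help to parallelise the read with Spark's JDBC partitioning options (partitionColumn, lowerBound, upperBound, numPartitions). A hedged sketch only; the column name "id" and its bounds are assumptions about your table, and the placeholders follow the same convention as above.

# Partitioned JDBC read: Spark issues one query per partition range in parallel.
df = spark.read.format("jdbc") \
    .option("url", "jdbc:mysql://localhost/<database_name>") \
    .option("driver", "com.mysql.jdbc.Driver") \
    .option("dbtable", "<table_name>") \
    .option("user", "<user>") \
    .option("password", "<password>") \
    .option("partitionColumn", "id") \
    .option("lowerBound", "1") \
    .option("upperBound", "1000000") \
    .option("numPartitions", "8") \
    .load()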
I currently use pyodbc to read the data in as a pandas dataframe, then I convert it to a dask dataframe. Is there any way to read it in as a dask dataframe directly?
Here's the code I'm currently using:
import pyodbc
import pandas as pd
import numpy as np
from dask.dataframe import from_pandas


def conn_sql_server(file_path):
    # Connect to SQL Server
    conn = pyodbc.connect('Driver={SQL Server Native Client 11.0};'
                          'Server=Server1;'
                          'Database=Database1;'
                          'Trusted_Connection=yes;')
    # run the query and output the result to df
    query = open(file_path, 'r')
    df = pd.read_sql_query(query.read(), conn, chunksize=10**4)
    chunks = []
    for chunk in df:
        chunks.append(chunk)
    df_comb = pd.concat(chunks)
    query.close()
    return df_comb


# load in as pandas dataframe
data = conn_sql_server('.\input\data pull.sql')

# Convert to dask dataframe
dd = from_pandas(data, npartitions=3)
I tried to use dd.read_sql_query with pyodbc package or sqlalchemy package. Both returned an AttributeError: module 'dask.dataframe' has no attribute 'read_sql_query'
(1) pyodbc:
import pyodbc
import dask.dataframe as dd


def conn_sql_server(file_path):
    # Connect to SQL Server
    conn = pyodbc.connect('Driver={SQL Server Native Client 11.0};'
                          'Server=Server1;'
                          'Database=Database1;'
                          'Trusted_Connection=yes;')
    # run the query and output the result to df
    query = open(file_path, 'r')
    df = dd.read_sql_query(query.read(), conn)
    query.close()
    return df


data = conn_sql_server('.\input\data pull.sql')
AttributeError: module 'dask.dataframe' has no attribute 'read_sql_query'
(2) sqlalchemy:
import dask.dataframe as dd
from sqlalchemy import create_engine

Server = 'Server1'
Database = 'Database1'
Driver = 'SQL Server Native Client 11.0'
uri = f'mssql://{Server}/{Database}?driver={Driver}'

query = open('.\input\data pull.sql', 'r')
dd.read_sql_query(query.read(), uri)
As Nick suggested, I've upgraded dask to the latest version using python -m pip install dask distributed --upgrade. I also checked all of the functions listed in the dask.dataframe module using the following script, and found that there is only read_sql_table, no read_sql_query:
from inspect import getmembers, isfunction
import dask.dataframe as dd
getmembers(dd, isfunction)
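For reference, a minimal sketch of what falling back to read_sql_table could look like, assuming a SQLAlchemy mssql+pyodbc URI built from the same ODBC connection string and a table with an indexable column. The table name "my_table" and index column "id" are placeholders, not from the original post.

import dask.dataframe as dd
from urllib.parse import quote_plus

# Build a SQLAlchemy-style URI from the ODBC connection string used above.
odbc_str = ('Driver={SQL Server Native Client 11.0};'
            'Server=Server1;'
            'Database=Database1;'
            'Trusted_Connection=yes;')
uri = 'mssql+pyodbc:///?odbc_connect=' + quote_plus(odbc_str)

# read_sql_table reads a whole table (not an arbitrary query file);
# "my_table" and "id" are assumptions for illustration only.
ddf = dd.read_sql_table('my_table', uri, index_col='id', npartitions=3)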
I'm trying to connect to a database on an on-prem SQL Server, but I'm getting an error that I don't quite understand. Apparently, when I run this in Databricks, it can't find the driver I'm specifying.
Try it like this.
from pyspark.sql.session import SparkSession
spark = SparkSession \
    .builder \
    .config("spark.driver.extraClassPath", "mssql-jdbc-6.4.0.jre8.jar") \
    .appName("Python Spark Data Source Example") \
    .getOrCreate()

dbDF = spark.read.format("jdbc") \
    .option("url", "jdbc:sqlserver://your_server_name.database.windows.net:1433;databaseName=your_database_name") \
    .option("dbtable", "dbo.ml_securitymastersample") \
    .option("user", "your_user_name") \
    .option("nullValue", ["", "N.A.", "NULL"]) \
    .option("password", "your_password").load()
# display the dimensions of a Spark DataFrame
def shape(df):
    print("Shape: ", df.count(), ",", len(df.columns))


# count missing (null) values across all columns
from pyspark.sql.functions import col, count, when

dbDF.select([count(when(col(c).isNull(), c)).alias(c) for c in dbDF.columns]).show()
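For example, a usage sketch of the helper above on the frame that was just loaded:

shape(dbDF)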
Does that work for you?
We just switched away from Scala and moved over to Python. I've got a dataframe that I need to push into SQL Server. I did this multiple times before, using the Scala code below.
var bulkCopyMetadata = new BulkCopyMetadata
bulkCopyMetadata.addColumnMetadata(1, "Title", java.sql.Types.NVARCHAR, 128, 0)
bulkCopyMetadata.addColumnMetadata(2, "FirstName", java.sql.Types.NVARCHAR, 50, 0)
bulkCopyMetadata.addColumnMetadata(3, "LastName", java.sql.Types.NVARCHAR, 50, 0)
val bulkCopyConfig = Config(Map(
  "url" -> "mysqlserver.database.windows.net",
  "databaseName" -> "MyDatabase",
  "user" -> "username",
  "password" -> "*********",
  "dbTable" -> "dbo.Clients",
  "bulkCopyBatchSize" -> "2500",
  "bulkCopyTableLock" -> "true",
  "bulkCopyTimeout" -> "600"
))
df.bulkCopyToSqlDB(bulkCopyConfig, bulkCopyMetadata)
That's documented here.
https://learn.microsoft.com/en-us/azure/sql-database/sql-database-spark-connector
I'm looking for an equivalent Python script to do the same job. I searched for the same, but didn't come across anything. Does someone here have something that would do the job? Thanks.
Please refer to the official PySpark document JDBC To Other Databases to write a PySpark dataframe directly to SQL Server via the MS SQL Server JDBC driver.
Here is the sample code.
spark_jdbcDF.write \
    .format("jdbc") \
    .option("url", "jdbc:sqlserver://yourserver.database.windows.net:1433") \
    .option("dbtable", "<your table name>") \
    .option("user", "username") \
    .option("password", "password") \
    .save()
Or
jdbcUrl = "jdbc:mysql://{0}:{1}/{2}".format(jdbcHostname, jdbcPort, jdbcDatabase)
connectionProperties = {
    "user": jdbcUsername,
    "password": jdbcPassword,
    "driver": "com.mysql.jdbc.Driver"
}

# DataFrameWriter.jdbc() performs the write itself, so no extra .save() call is needed
spark_jdbcDF.write \
    .jdbc(url=jdbcUrl, table="<your table name>",
          properties=connectionProperties)
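Since the Scala config above sets bulkCopyBatchSize, a rough equivalent with the plain JDBC writer is the batchsize option. This is a hedged sketch only: it batches ordinary INSERTs rather than using the bulk copy API, and the server/table placeholders follow the sample above.

# "batchsize" controls how many rows are sent per JDBC batch during the write.
spark_jdbcDF.write \
    .format("jdbc") \
    .option("url", "jdbc:sqlserver://yourserver.database.windows.net:1433") \
    .option("dbtable", "<your table name>") \
    .option("user", "username") \
    .option("password", "password") \
    .option("batchsize", "2500") \
    .mode("append") \
    .save()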
Hope it helps.
Here is the complete PySpark code to write a Spark DataFrame to a SQL Server database, including where to input the database name and schema name:
df.write \
.format("jdbc")\
.option("url", "jdbc:sqlserver://<servername>:1433;databaseName=<databasename>")\
.option("dbtable", "[<optional_schema_name>].<table_name>")\
.option("user", "<user_name>")\
.option("password", "<password>")\
.save()
I have a SQL Server with databases whose data I want to alter using pandas. I know how to get the data into a DataFrame using pyodbc, but I have no clue how to get that DataFrame back into my SQL Server.
I have tried to create an engine with sqlalchemy and use the to_sql command, but I can't get that to work because my engine is never able to connect correctly to my database.
import pyodbc
import pandas

server = "server"
db = "db"
conn = pyodbc.connect('DRIVER={SQL Server};SERVER='+server+';DATABASE='+db+';Trusted_Connection=yes')
cursor = conn.cursor()
cursor.execute("SELECT * FROM <table_name>")  # a query has to be executed before fetching
df = cursor.fetchall()
data = pandas.DataFrame(df)
conn.commit()
You can use pandas.DataFrame.to_sql to insert your dataframe into SQL Server. Databases supported by SQLAlchemy are supported by this method.
Here is an example of how you can achieve this:
from sqlalchemy import create_engine, event
from urllib.parse import quote_plus
import logging
import sys
import numpy as np
from datetime import datetime, timedelta

# setup logging
logging.basicConfig(stream=sys.stdout,
                    filemode='a',
                    format='%(asctime)s.%(msecs)3d %(levelname)s:%(name)s: %(message)s',
                    datefmt='%m-%d-%Y %H:%M:%S',
                    level=logging.DEBUG)
logger = logging.getLogger(__name__)  # get the name of the module


def write_to_db(df, database_name, table_name):
    """
    Creates a sqlalchemy engine and writes the dataframe to the database.
    """
    # replace infinity by nan
    df = df.replace([np.inf, -np.inf], np.nan)

    user_name = 'USERNAME'
    pwd = 'PASSWORD'
    db_addr = '10.00.000.10'
    chunk_size = 40

    conn = "DRIVER={SQL Server};SERVER="+db_addr+";DATABASE="+database_name+";UID="+user_name+";PWD="+pwd+""
    quoted = quote_plus(conn)
    new_con = 'mssql+pyodbc:///?odbc_connect={}'.format(quoted)

    # create sqlalchemy engine
    engine = create_engine(new_con)

    # Write to DB
    logger.info("Writing to database ...")
    st = datetime.now()  # start time
    # WARNING!! -- overwrites the table using if_exists='replace'
    df.to_sql(table_name, engine, if_exists='replace', index=False, chunksize=chunk_size)
    logger.info("Database updated...")
    logger.info("Data written to '{}' database into '{}' table ...".format(database_name, table_name))
    logger.info("Time taken to write to DB: {}".format((datetime.now() - st).total_seconds()))
Calling this method should write your dataframe to the database; note that it will replace the table if a table with the same name already exists in the database.
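A hypothetical call, with the database and table names invented purely for illustration:

# df is an existing pandas DataFrame
write_to_db(df, database_name='Database1', table_name='my_table')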
I am trying to write a pandas DataFrame to an Amazon Redshift DB. Given below is the code I am using.
from sqlalchemy import create_engine
import psycopg2
import io

# credentials and host are placeholders; note the '@' between password and host
engine = create_engine('postgresql+psycopg2://username:password@host:port/database')
conn = engine.raw_connection()
cur = conn.cursor()

output = io.StringIO()
report.to_csv(output, sep='\t', header=False, index=False)
output.seek(0)
contents = output.getvalue()
cur.copy_from(output, table_name, null="")
conn.commit()
However, on running the above code, I get the error below:
NameError: name 'database' is not defined
Could anyone help? Thanks.
I used the code below to copy the DataFrame to an Amazon Redshift table:
from sqlalchemy import create_engine

conn = create_engine('postgresql://user:pass@host:port/database')
df.to_sql('table', conn, index=False, if_exists='replace')
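If the write is slow for a large frame, batching the inserts may help. A hedged variant of the same call; the chunksize value is just an example:

# method='multi' sends multiple rows per INSERT statement instead of one at a time.
df.to_sql('table', conn, index=False, if_exists='replace',
          method='multi', chunksize=1000)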