Benchmarking retrieval in a Cloudant database

I want to retrieve a set of keys from a Cloudant database and have tried a few ways of doing it. However, custom_result appears lightning fast compared to the other methods. Can someone explain why?
from cloudant import cloudant
import json
import time
from cloudant.result import Result, ResultByKey

with open('credentials.json') as f:
    cred = json.load(f)

with cloudant(str(cred['credentials']['username']), str(cred['credentials']['password']), url=str(cred['credentials']['url'])) as client:
    my_database = client['my_database']

    # Using POST
    payload = {"keys": ["012", "345"]}
    end_point = '{0}/{1}'.format(client.server_url, 'my_database/_all_docs')
    params = {'include_docs': 'true'}
    start = time.time()
    response = client.r_session.post(end_point, data=json.dumps(payload), params=params)
    end = time.time()
    print(end - start)

    # Using custom_result
    start = time.time()
    result = my_database.custom_result(include_docs=True, keys=["012", "345"])
    end = time.time()
    print(end - start)

    # Using all_docs
    start = time.time()
    result = my_database.all_docs(include_docs=True, keys=["012", "345"])
    end = time.time()
    print(end - start)

    # Using iteration
    keys = ["012", "345"]
    start = time.time()
    result = []
    result_collection = Result(my_database.all_docs, include_docs=True)
    for i in range(len(keys)):
        result.append(result_collection[ResultByKey(i)])
    end = time.time()
    print(end - start)
My output is as follows:
0.426064968109
4.10079956055e-05
0.409541845322
0.819295167923
Can someone tell me why custom_result performs so fast?

database.custom_result() only creates the cloudant.result.Result instance; no results are actually returned from Cloudant until you access them. You can test it this way; end - start will then be the whole time it takes to get the response from Cloudant.
start = time.time()
with database.custom_result(include_docs=True) as result:
    data = result[:]
end = time.time()
print(end - start)
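
For a fair comparison in the original benchmark, the keys lookup would need to be materialised inside the timed block too; a minimal sketch reusing the same my_database and keys from the question:

start = time.time()
with my_database.custom_result(include_docs=True, keys=["012", "345"]) as result:
    data = result[:]    # this is what actually sends the request to Cloudant
end = time.time()
print(end - start)      # now comparable to the POST and all_docs timings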

Related

How to let the child process of one function finish and then run the second function?

Here I am simply calling a third-party API through multiprocessing to get stock prices. I call this function multiple times because I want the stock data in several timeframes (5 min, 10 min, 30 min). But when I run it, it does not wait for the previous call to finish and instead moves on to the last one. How do I run each function in order?
import pickle
import pandas as pd
import datetime
import multiprocessing
import time
import subprocess, os

def historical_data(timeframe):
    global prices

    def split_dict_equally(input_dict, chunks=2):
        "Splits dict by keys. Returns a list of dictionaries."
        # prep with empty dicts
        return_list = [dict() for idx in range(chunks)]
        idx = 0
        for k, v in input_dict.items():
            return_list[idx][k] = v
            if idx < chunks - 1:  # indexes start at 0
                idx += 1
            else:
                idx = 0
        return return_list

    with open('zerodha_login.pkl', 'rb') as file:
        # Call load method to deserialize
        login_credentials = pickle.load(file)
    with open('zerodha_instruments.pkl', 'rb') as file:
        # Call load method to deserialize
        inst_dict = pickle.load(file)

    csv = pd.read_csv('D:\\Business\\Website\\Trendlines\\FO Stocks.csv')
    csv['Stocks'] = csv['Stocks'].str.replace(' ', '')
    fo_stocks = csv['Stocks'].to_list()
    inst = pd.DataFrame(inst_dict)
    filtered_inst = inst.copy()
    filtered_inst = inst[(inst['segment'] == 'NSE') & (inst['name'] != '') & (inst['tick_size'] == 0.05)]
    filtered_inst = filtered_inst[filtered_inst['tradingsymbol'].isin(fo_stocks)]
    tickers_dict = dict(zip(filtered_inst['instrument_token'], filtered_inst['tradingsymbol']))
    tickers_dict = dict(zip(filtered_inst['instrument_token'], filtered_inst['tradingsymbol']))
    number_process = 16
    tickers_dict_list = split_dict_equally(tickers_dict, number_process)

    def prices(stock):
        print('inside_function', os.getpid())
        for x, y in stock.items():
            print('inside_stock_loop')
            while True:
                try:
                    print('Timeframe::', timeframe, y)
                    data = login_credentials['kite'].historical_data(instrument_token=x, from_date=today_date - datetime.timedelta(days=1000), interval=str(timeframe), to_date=today_date)
                    df = pd.DataFrame(data)
                    g = [e for e in df.columns if 'Un' not in e]
                    df = df[g]
                    df['date'] = df['date'].astype(str)
                    df['date'] = df['date'].str.split('+')
                    df['Date'] = df['date'].str[0]
                    df = df[['Date', 'open', 'high', 'low', 'close', 'volume']]
                    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d %H:%M:%S')
                    df['Time'] = df['Date'].dt.time
                    df['Date'] = df['Date'].dt.date
                    df.rename(columns={'open': 'Open', 'high': 'High', 'low': 'Low', 'close': 'Close', 'volume': 'Volume'}, inplace=True)
                    df.to_csv('D:\\Business\\Website\\Trendlines\\4th Cut\\Historical data\\' + str(timeframe) + '\\' + str(y) + '.csv')
                    break
                except:
                    print('Issue ::', y)
                    pass

    new_list = []
    if __name__ == '__main__':
        for process in tickers_dict_list:
            p = multiprocessing.Process(target=prices, args=(process,))
            p.start()
            new_list.append(p)
        for p in new_list:
            print('joining_', p)
            p.join()

historical_data('5minute')
historical_data('10minute')
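
For comparison, a minimal sketch of one common way to make each timeframe finish before the next starts: keep the worker at module level, start and join every process inside a helper, and call that helper once per timeframe under the __main__ guard. prices_worker and the chunks below are hypothetical stand-ins for the asker's prices() and tickers_dict_list:

import multiprocessing
import time

def prices_worker(chunk, timeframe):
    # stand-in for the real prices(); pretend to download each ticker
    for token, symbol in chunk.items():
        time.sleep(0.1)
        print(timeframe, symbol, 'done')

def run_timeframe(timeframe, chunks):
    procs = [multiprocessing.Process(target=prices_worker, args=(c, timeframe)) for c in chunks]
    for p in procs:
        p.start()
    for p in procs:
        p.join()    # block until every worker for this timeframe has exited

if __name__ == '__main__':
    chunks = [{1: 'INFY'}, {2: 'TCS'}]    # hypothetical split of tickers_dict
    run_timeframe('5minute', chunks)      # completes fully...
    run_timeframe('10minute', chunks)     # ...before this call starts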

Custom Locust User for SageMaker Endpoint Keeps running after time limit is reached

I have been trying to build a SagemakerUser from the base User class in the Locust library. The issue is that when I use it with a timed shape test, the load test keeps going even after the test ends (you can see the message "Shape test stopping"). Below is the script I have written to this end. My question is: how is this behaviour explained?
import pandas as pd
from locust import HttpUser, User, task, TaskSet, events, LoadTestShape
from sagemaker.serializers import JSONSerializer
from sagemaker.session import Session
import sagemaker
import time
import sys
import math
import pdb

df = "some df to load samples from"
endpoint = "sage maker end point name"

class SagemakerClient(sagemaker.predictor.Predictor):
    def predictEx(self, data):
        start_time = time.time()
        start_perf_counter = time.perf_counter()
        name = 'predictEx'
        try:
            result = self.predict(data)
        except:
            total_time = int((time.perf_counter() - start_perf_counter) * 1000)
            events.request_failure.fire(request_type="sagemaker", name=name, response_time=total_time, exception=sys.exc_info(), response_length=0)
        else:
            total_time = int((time.perf_counter() - start_perf_counter) * 1000)
            events.request_success.fire(request_type="sagemaker", name=name, response_time=total_time, response_length=sys.getsizeof(result))

class SagemakerLocust(User):
    abstract = True

    def __init__(self, *args, **kwargs):
        super(SagemakerLocust, self).__init__(*args, **kwargs)
        self.client = SagemakerClient(
            sagemaker_session=Session(),
            endpoint_name="sagemaker-test",
            serializer=JSONSerializer())

class APIUser(SagemakerLocust):
    @task
    def call(self):
        request = df.text.sample(1, weights=df.length).iloc[0]
        self.client.predictEx(request)

class StepLoadShape(LoadTestShape):
    """
    A step load shape

    Keyword arguments:
        step_time -- Time between steps
        step_load -- User increase amount at each step
        spawn_rate -- Users to stop/start per second at every step
        time_limit -- Time limit in seconds
    """
    step_time = 30    # 3600
    step_load = 1
    spawn_rate = 1
    time_limit = 2    # 3600*6
    # pdb.set_trace()

    def tick(self):
        run_time = self.get_run_time()
        if run_time > self.time_limit:
            return None
        current_step = math.floor(run_time / self.step_time) + 1
        return (current_step * self.step_load, self.spawn_rate)
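
As a side note, newer Locust releases (2.x) replace the separate request_success / request_failure hooks with a single request event; if the script runs against such a version, the reporting part of predictEx would look roughly like this (a sketch, assuming Locust 2.x):

from locust import events
import sys
import time

def fire_result(name, start_perf_counter, result=None, exc=None):
    # report one request through the unified Locust 2.x event
    total_time = int((time.perf_counter() - start_perf_counter) * 1000)
    events.request.fire(
        request_type="sagemaker",
        name=name,
        response_time=total_time,
        response_length=sys.getsizeof(result) if result is not None else 0,
        exception=exc,    # None marks the request as a success
        context={},
    )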

How to use cursors in Google App Engine for fetching records from a db database in batches?

I am trying to fetch data from a db database in batches and copy it to an ndb database, using a cursor. My code does this successfully for the first batch but does not fetch any further records. I did not find much information on cursors; please help me out here.
Here is my code snippet:

def post(self):
    a = 0
    chunk_size = 2
    next_cursor = self.request.get("cursor")
    query = db.GqlQuery("select * from BooksPost")
    while a == 0:
        if next_cursor:
            query.with_cursor(start_cursor=next_cursor)
        else:
            a = 1
        results = query.fetch(chunk_size)
        for result in results:
            nbook1 = result.bookname
            nauthor1 = result.authorname
            nbook1 = nBooksPost(nbookname=nbook1, nauthorname=nauthor1)
            nbook1.put()
        next_cursor = self.request.get("cursor")
Basically, how do I set the next cursor to iterate over?
def post(self):
    chunk_size = 10
    has_more_results = True
    query = db.GqlQuery("select * from Post")
    cursor = self.request.get('cursor', None)
    # cursor = query.cursor()
    if cursor:
        query.with_cursor(cursor)
    while has_more_results == True:
        results = query.fetch(chunk_size)
        new_cursor = query.cursor()
        print("count: %d, results %d" % (query.count(), len(results)))
        if query.count(1) == 1:
            has_more_results = True
        else:
            has_more_results = False
        for result in results:
            # do this
            pass
        query.with_cursor(new_cursor)
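
For reference, the usual pattern with the old db API is to feed query.cursor() from one fetch back into with_cursor() for the next fetch, stopping when a batch comes back empty. A minimal sketch reusing the BooksPost / nBooksPost names from the question:

from google.appengine.ext import db

def copy_in_batches(chunk_size=100):
    query = db.GqlQuery("select * from BooksPost")
    cursor = None
    while True:
        if cursor:
            query.with_cursor(start_cursor=cursor)
        results = query.fetch(chunk_size)
        if not results:
            break    # no more records
        for result in results:
            nBooksPost(nbookname=result.bookname,
                       nauthorname=result.authorname).put()
        cursor = query.cursor()    # remember where this batch ended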

Asyncio Run Loop Errors in Python 3.7

I am trying to use the asyncio package to execute concurrent calls from one SQL Server to another in order to extract data. I'm hitting an issue at the myLoop.run_until_complete(cors) line, where it tells me that the event loop is already running. I will admit that I am new to this package and may be overlooking something simple.
import pyodbc
import sqlalchemy
import pandas
import asyncio
import time

async def getEngine(startString):
    sourceList = str.split(startString, '=')
    server = str.split(sourceList[1], ';')[0]
    database = str.split(sourceList[2], ';')[0]
    user = str.split(sourceList[3], ';')[0]
    password = str.split(sourceList[4], ';')[0]
    returnEngine = sqlalchemy.create_engine("mssql+pyodbc://" + user + ":" + password + "@" + server + "/" + database + "?driver=SQL+Server+Native+Client+11.0")
    return returnEngine

async def getConnString(startString):
    sourceList = str.split(startString, '=')
    server = str.split(sourceList[1], ';')[0]
    database = str.split(sourceList[2], ';')[0]
    user = str.split(sourceList[3], ';')[0]
    password = str.split(sourceList[4], ';')[0]
    return "Driver={SQL Server Native Client 11.0};Server=" + server + ";Database=" + database + ";Uid=" + user + ";Pwd=" + password + ";"

async def executePackage(source, destination, query, sourceTable, destTable, lastmodifiedDate, basedOnStation):
    sourceConnString = getConnString(source)
    destEngine = getEngine(destination)
    sourceConn = pyodbc.connect(sourceConnString)
    newQuery = str.replace(query, 'dateTest', str(lastmodifiedDate))
    df = pandas.read_sql(newQuery, sourceConn)
    print('Started ' + sourceTable + '->' + destTable)
    tic = time.perf_counter()
    await df.to_sql(destTable, destEngine, index=False, if_exists="append")
    toc = time.perf_counter()
    secondsToFinish = toc - tic
    print('Finished ' + sourceTable + '->' + destTable + ' in ' + str(secondsToFinish) + ' seconds')

async def main():
    connString = "Driver={SQL Server Native Client 11.0};Server=myServer;Trusted_Connection=yes;"
    myConn = pyodbc.connect(connString)
    cursor = myConn.cursor()
    df = pandas.read_sql('exec mySql_stored_proc', myConn)
    if len(df.index) > 0:
        tasks = [executePackage(df.iloc[i, 10], df.iloc[i, 11], df.iloc[i, 7], df.iloc[i, 8], df.iloc[i, 9], df.iloc[i, 5], df.iloc[i, 17]) for i in range(len(df))]
        myLoop = asyncio.get_event_loop()
        cors = asyncio.wait(tasks)
        myLoop.run_until_complete(cors)

if __name__ == "__main__":
    asyncio.run(main())
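
The error itself comes from calling myLoop.run_until_complete() inside main(), which asyncio.run(main()) is already driving; inside a running coroutine the tasks are awaited instead. A minimal sketch of that pattern, with run_one as a hypothetical stand-in for executePackage (note that pandas.read_sql and pyodbc are blocking calls, so on top of this they would need threads or run_in_executor to actually overlap):

import asyncio

async def run_one(name):
    # stand-in for executePackage(); real extract/load work would go here
    await asyncio.sleep(0.1)
    print('finished', name)

async def main():
    tasks = [run_one('package-' + str(i)) for i in range(3)]
    await asyncio.gather(*tasks)    # replaces myLoop.run_until_complete(cors)

if __name__ == "__main__":
    asyncio.run(main())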

Pandas, how to reset? - Shape of passed values is (1,1), indices imply (3,1)

I'm currently writing some code and am using pandas to export all of the data into csv files. My program runs multiple iterations until it has gone through all of the necessary files. Pandas rewrites one file each iteration, but when it moves on to the next file I need it to reset all of the data (I think).
Structure is roughly:
while loop > a few variables are named > program runs > dataframe = pandas.DataFrame(averagepercentagelist, index=namelist, columns=header)
This part works with no problem for one file. When moving on to the next file, all of the arrays I use are reset, and I think this is why pandas gives the error: Shape of passed values is (1,1), indices imply (3,1).
Please let me know if I need to explain it better.
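
For context, that error is just pandas reporting that the data and the index passed to the DataFrame have different lengths; a minimal reproduction with made-up values (the exact message wording varies between pandas versions):

import pandas as pd

# one data row but a three-item index -> the lengths disagree
pd.DataFrame(["12"], index=["Name0", "Name1", "Name2"], columns=["Average Percentage"])
# ValueError: Shape of passed values is (1, 1), indices imply (3, 1)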
EDIT:
while True:
    try:
        averagepercentagelist = []
        namelist = []
        columns = []
        for row in database:
            averagepercentagelist = ["12", "23"]
            namelist = ["Name0", "Name1"]
            columns = ["Average percentage"]
            dataframe = (pandas.DataFrame(averagepercentagelist, index=namelist, columns=header))
    except Exception as e:
        print(e)
        break
SNIPPET:
dataframe= (pandas.DataFrame(averagepercentagelist,index=namelist,columns=header))
currentcalculatedatafrane = 'averages' + currentcalculate
dataframeexportpath = os.path.join(ROOT_PATH,'Averages',currentcalculatedatafrane)
dataframe.to_csv(dataframeexportpath)
FULL PROGRAM SO FAR:
import csv
import os
import re
import pandas
import tkinter as tk
from tkinter import messagebox
from os.path import isfile, join
from os import listdir
import time

ROOT_PATH = os.path.dirname(os.path.abspath(__file__))
indexforcalcu = 0
line_count = 0
testlist = []
namelist = []
header = ['Average Percentage']

def clearvariables():
    indexforcalcu = 0
    testlist = []

def findaverageofstudent(findaveragenumber, numoftests):
    total = 0
    findaveragenumber = findaveragenumber / numoftests
    findaveragenumber = round(findaveragenumber, 1)
    return findaveragenumber

def removecharacters(nameforfunc):
    nameforfunc = str(nameforfunc)
    elem = re.sub("[{'}]", "", nameforfunc)
    return elem

def getallclasses():
    onlyfiles = [f for f in listdir(ROOT_PATH) if isfile(join(ROOT_PATH, f))]
    onlyfiles.remove("averagecalculatorv2.py")
    return onlyfiles

def findaveragefunc():
    indexforcalcu = -1
    while True:
        try:
            totaltests = 0
            line_count = 0
            averagepercentagelist = []
            indexforcalcu = indexforcalcu + 1
            allclasses = getallclasses()
            currentcalculate = allclasses[indexforcalcu]
            classpath = os.path.join(ROOT_PATH, currentcalculate)
            with open(classpath) as csv_file:
                classscoredb = csv.reader(csv_file, delimiter=',')
                for i, row in enumerate(classscoredb):
                    if line_count == 0:
                        while True:
                            try:
                                totaltests = totaltests + 1
                                rowreader = {row[totaltests]}
                            except:
                                totaltests = totaltests - 1
                                line_count = line_count + 1
                                break
                    else:
                        calculating_column_location = 1
                        total = 0
                        while True:
                            try:
                                total = total + int(row[calculating_column_location])
                                calculating_column_location = calculating_column_location + 1
                            except:
                                break
                        i = str(i)
                        name = row[0]
                        cleanname = removecharacters(nameforfunc=name)
                        namelist.append(cleanname)
                        findaveragenumbercal = findaverageofstudent(findaveragenumber=total, numoftests=totaltests)
                        averagepercentagelist.append(findaveragenumbercal)
                        line_count = line_count + 1
                        dataframe = (pandas.DataFrame(averagepercentagelist, index=namelist, columns=header))
                        currentcalculatedatafrane = 'averages' + i + currentcalculate
                        dataframeexportpath = os.path.join(ROOT_PATH, 'Averages', currentcalculatedatafrane)
                        dataframe.to_csv(dataframeexportpath)
                        i = int(i)
        except Exception as e:
            print("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n", e)
            break

def makenewclass():
    global newclassname
    getclassname = str(newclassname.get())
    if getclassname == "":
        messagebox.showerror("Error", "The class name you have entered is invalid.")
    else:
        classname = getclassname + ".csv"
        with open(classname, mode='w') as employee_file:
            classwriter = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            classwriter.writerow(["Name", "Test 1"])

root = tk.Tk()
root.title("Test result average finder")
findaveragebutton = tk.Button(root, text="Find Averages", command=findaveragefunc())
findaveragebutton.grid(row=2, column=2, padx=(10, 10), pady=(0, 10))
classnamelabel = tk.Label(root, text="Class name:")
classnamelabel.grid(row=1, column=0, padx=(10, 0), pady=(10, 10))
newclassname = tk.Entry(root)
newclassname.grid(row=1, column=1, padx=(10, 10))
newclassbutton = tk.Button(root, text="Create new class", command=makenewclass)
newclassbutton.grid(row=1, column=2, padx=(0, 10), pady=(10, 10))
root.mainloop()
Thanks in advance,
Sean
Use glob to loop over the class csv files and let pandas compute the row averages itself, so the data and the index always have matching lengths:
import glob, os
import pandas as pd

ROOT_PATH = os.path.dirname(os.path.abspath(__file__))

# collect all csv files into a list
files = glob.glob(f'{ROOT_PATH}/*.csv')
print(files)

# create the output folder if necessary
new = os.path.join(ROOT_PATH, 'Averages')
if not os.path.exists(new):
    os.makedirs(new)

# loop over each file
for f in files:
    # create DataFrame and convert the first column to the index
    df = pd.read_csv(f, index_col=[0])
    # compute the average of each row, round it and create a one-column DataFrame
    avg = df.mean(axis=1).round(1).to_frame('Average Percentage')
    # remove the index name if necessary
    avg.index.name = None
    print(avg)
    # build the new path
    head, tail = os.path.split(f)
    path = os.path.join(head, 'Averages', tail)
    print(path)
    # write the DataFrame to csv
    avg.to_csv(path)
