Pandas, how to reset? - Shape of passed values is (1,1), indices imply (3,1) - arrays

I'm currently writing some code that uses pandas to export all of the data into CSV files. My program runs multiple iterations until it has gone through all of the necessary files. Pandas rewrites one file each iteration, but when it moves on to the next file, I need it to reset all of the data (I think).
Structure is roughly:
while loop > a few variables are named > program runs > dataframe = pandas.DataFrame(averagepercentagelist, index=namelist, columns=header)
This part works with no problem for one file. When moving on to the next file, all of the arrays I use are reset, and I think this is why pandas gives the error Shape of passed values is (1,1), indices imply (3,1).
Please let me know if I need to explain it better.
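The mismatch is easy to reproduce in isolation; a minimal sketch of the failure, one data row against a three-entry index:
import pandas
#one value, but an index implying three rows
pandas.DataFrame([[1]], index=['a', 'b', 'c'], columns=['x'])
#ValueError: Shape of passed values is (1, 1), indices imply (3, 1)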
EDIT:
while True:
    try:
        averagepercentagelist=[]
        namelist=[]
        columns=[]
        for row in database:
            averagepercentagelist=["12","23"]
            namelist=["Name0","Name1"]
            columns=["Average percentage"]
        dataframe=(pandas.DataFrame(averagepercentagelist,index=namelist,columns=header))
    except Exception as e:
        print(e)
        break
SNIPPET:
dataframe= (pandas.DataFrame(averagepercentagelist,index=namelist,columns=header))
currentcalculatedatafrane = 'averages' + currentcalculate
dataframeexportpath = os.path.join(ROOT_PATH,'Averages',currentcalculatedatafrane)
dataframe.to_csv(dataframeexportpath)
FULL PROGRAM SO FAR:
import csv
import os
import re
import pandas
import tkinter as tk
from tkinter import messagebox
from os.path import isfile, join
from os import listdir
import time

ROOT_PATH = os.path.dirname(os.path.abspath(__file__))
indexforcalcu=0
line_count=0
testlist=[]
namelist=[]
header=['Average Percentage']

def clearvariables():
    indexforcalcu=0
    testlist=[]

def findaverageofstudent(findaveragenumber,numoftests):
    total=0
    findaveragenumber = findaveragenumber/numoftests
    findaveragenumber = round(findaveragenumber, 1)
    return findaveragenumber

def removecharacters(nameforfunc):
    nameforfunc=str(nameforfunc)
    elem=re.sub("[{'}]", "",nameforfunc)
    return elem

def getallclasses():
    onlyfiles = [f for f in listdir(ROOT_PATH) if isfile(join(ROOT_PATH, f))]
    onlyfiles.remove("averagecalculatorv2.py")
    return onlyfiles

def findaveragefunc():
    indexforcalcu=-1
    while True:
        try:
            totaltests=0
            line_count=0
            averagepercentagelist=[]
            indexforcalcu=indexforcalcu+1
            allclasses=getallclasses()
            currentcalculate=allclasses[indexforcalcu]
            classpath = os.path.join(ROOT_PATH, currentcalculate)
            with open(classpath) as csv_file:
                classscoredb = csv.reader(csv_file, delimiter=',')
                for i, row in enumerate(classscoredb):
                    if line_count == 0:
                        while True:
                            try:
                                totaltests=totaltests+1
                                rowreader= {row[totaltests]}
                            except:
                                totaltests=totaltests-1
                                line_count = line_count + 1
                                break
                    else:
                        calculating_column_location=1
                        total=0
                        while True:
                            try:
                                total = total + int(row[calculating_column_location])
                                calculating_column_location = calculating_column_location + 1
                            except:
                                break
                        i=str(i)
                        name=row[0]
                        cleanname=removecharacters(nameforfunc=name)
                        namelist.append(cleanname)
                        findaveragenumbercal=findaverageofstudent(findaveragenumber=total,numoftests=totaltests)
                        averagepercentagelist.append(findaveragenumbercal)
                        line_count = line_count + 1
            dataframe= (pandas.DataFrame(averagepercentagelist,index=namelist,columns=header))
            currentcalculatedatafrane = 'averages' + i + currentcalculate
            dataframeexportpath = os.path.join(ROOT_PATH,'Averages',currentcalculatedatafrane)
            dataframe.to_csv(dataframeexportpath)
            i=int(i)
        except Exception as e:
            print("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n",e)
            break

def makenewclass():
    global newclassname
    getclassname=str(newclassname.get())
    if getclassname == "":
        messagebox.showerror("Error","The class name you have entered is invalid.")
    else:
        classname = getclassname + ".csv"
        with open(classname, mode='w') as employee_file:
            classwriter = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            classwriter.writerow(["Name","Test 1"])

root=tk.Tk()
root.title("Test result average finder")
findaveragebutton=tk.Button(root,text="Find Averages",command=findaveragefunc())
findaveragebutton.grid(row=2,column=2,padx=(10, 10),pady=(0,10))
classnamelabel=tk.Label(root, text="Class name:")
classnamelabel.grid(row=1, column=0,padx=(10,0),pady=(10,10))
newclassname = tk.Entry(root)
newclassname.grid(row=1,column=1,padx=(10, 10))
newclassbutton=tk.Button(root,text="Create new class",command=makenewclass)
newclassbutton.grid(row=1,column=2,padx=(0, 10),pady=(10,10))
root.mainloop()
Thanks in advance,
Sean

This likely happens because namelist is a module-level list that keeps growing across files while averagepercentagelist is reset on every iteration, so the data and the index end up with different lengths (values of shape (1,1) versus an index implying (3,1)). Rather than resetting every global by hand, recompute the averages per file. Use:
import glob, os
import pandas as pd

ROOT_PATH = os.path.dirname(os.path.abspath(__file__))
#extract all csv files to list
files = glob.glob(f'{ROOT_PATH}/*.csv')
print (files)
#create new folder if necessary
new = os.path.join(ROOT_PATH,'Averages')
if not os.path.exists(new):
    os.makedirs(new)
#loop each file
for f in files:
    #create DataFrame and convert first column to index
    df = pd.read_csv(f, index_col=[0])
    #count average in each row, round and create one-column DataFrame
    avg = df.mean(axis=1).round(1).to_frame('Average Percentage')
    #remove index name if necessary
    avg.index.name = None
    print (avg)
    #create new path
    head, tail = os.path.split(f)
    path = os.path.join(head, 'Averages', tail)
    print (path)
    #write DataFrame to csv
    avg.to_csv(path)
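For a quick sanity check, here is what the per-row mean produces for a small hypothetical class file (names in the first column, one column per test):
import pandas as pd

#hypothetical scores matching the expected class-file layout
df = pd.DataFrame({'Test 1': [80, 60], 'Test 2': [90, 70]},
                  index=['Name0', 'Name1'])
avg = df.mean(axis=1).round(1).to_frame('Average Percentage')
print (avg)
#       Average Percentage
#Name0                85.0
#Name1                65.0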

Related

Using glob to import txt files to an array for interpolation

Currently I am using data (wavelength, flux) in txt format and have six txt files. The wavelengths are the same but the fluxes are different. I have imported the txt files using pd.read_csv (as can be seen in the code) and assigned each flux a different name. These named fluxes are placed in an array. Finally, I interpolate the fluxes over a temperature array. The code works, and because I currently only have six files, writing it this way is OK. The problem moving forward is that when I have hundreds of txt files, I will need a better method.
How can I use glob to import the txt files, assign a different name to each flux (if that is necessary) and finally interpolate? Any help would be appreciated. Thank you.
import pandas as pd
import numpy as np
from scipy import interpolate
fcf = 0.0000001 # flux conversion factor
wcf = 10 #wave conversion factor
temperature = np.array([725,750,775,800,825,850])
# import files and assign column headers; blank to ignore spaces
c1p = pd.read_csv("../c/725.txt",sep=" ",header=None)
c1p.columns = ["blank","0","blank","blank","1"]
c2p = pd.read_csv("../c/750.txt",sep=" ",header=None)
c2p.columns = ["blank","0","blank","blank","1"]
c3p = pd.read_csv("../c/775.txt",sep=" ",header=None)
c3p.columns = ["blank","0","blank","blank","1"]
c4p = pd.read_csv("../c/800.txt",sep=" ",header=None)
c4p.columns = ["blank","0","blank","blank","1"]
c5p = pd.read_csv("../c/825.txt",sep=" ",header=None)
c5p.columns = ["blank","0","blank","blank","1"]
c6p = pd.read_csv("../c/850.txt",sep=" ",header=None)
c6p.columns = ["blank","0","blank","blank","1"]
wave = np.array(c1p['0']/wcf)
c1fp = np.array(c1p['1']*fcf)
c2fp = np.array(c2p['1']*fcf)
c3fp = np.array(c3p['1']*fcf)
c4fp = np.array(c4p['1']*fcf)
c5fp = np.array(c5p['1']*fcf)
c6fp = np.array(c6p['1']*fcf)
cfp = np.array([c1fp,c2fp,c3fp,c4fp,c5fp,c6fp])
flux_int = interpolate.interp1d(temperature,cfp,axis=0,kind='linear',bounds_error=False,fill_value='extrapolate')
My attempts so far... I think I have loaded the files into a list using glob, as follows:
import pandas as pd
import numpy as np
from scipy import interpolate
import glob
c_list=[]
path = "../c/*.*"
for file in glob.glob(path):
    print(file)
    c = pd.read_csv(file,sep=" ",header=None)
    c.columns = ["blank","0","blank","blank","1"]
    c_list.append
I am still unsure how to extract just the fluxes into an array in order to interpolate. I will continue to post my attempts.
My updated code
fcf = 0.0000001
import pandas as pd
import numpy as np
from scipy import interpolate
import glob
c_list=[]
path = "../c/*.*"
for file in glob.glob(path):
    print(file)
    c = pd.read_csv(file,sep=" ",header=None)
    c.columns = ["blank","0","blank","blank","1"]
    c = c['1']*fcf
    c_list.append(c)
fluxes = np.array(c_list)
temperature = np.array([7250,7500,7750,8000,8250,8500])
flux_int =interpolate.interp1d(temperature,fluxes,axis=0,kind='linear',bounds_error=False,fill_value='extrapolate')
When I run this code I get the following error
raise ValueError("x and y arrays must be equal in length along "
ValueError: x and y arrays must be equal in length along interpolation axis.
I think the part of the code that needs correcting is fluxes = np.array(c_list). This is one list of all fluxes, but I need a list of fluxes from each file. How is this done?
Final attempt
import pandas as pd
import numpy as np
from scipy import interpolate
import glob
c_list=[]
path = "../c/*.*"
for file in glob.glob(path):
    print(file)
    c = pd.read_csv(file,sep=" ",header=None)
    c.columns = ["blank","0","blank","blank","1"]
    c = c['1']* 0.0000001
    c_list.append(c)
c1=np.array(c_list[0])
c2=np.array(c_list[1])
c3=np.array(c_list[2])
c4=np.array(c_list[3])
c5=np.array(c_list[4])
c6=np.array(c_list[5])
fluxes = np.array([c1,c2,c3,c4,c5,c6])
temperature = np.array([7250,7500,7750,8000,8250,8500])
flux_int = interpolate.interp1d(temperature,fluxes,axis=0,kind='linear',bounds_error=False,fill_value='extrapolate')
This code works, but I am still not sure about
c1=np.array(c_list[0])
c2=np.array(c_list[1])
c3=np.array(c_list[2])
c4=np.array(c_list[3])
c5=np.array(c_list[4])
c6=np.array(c_list[5])
Is there a better way to write this?
Here are 2 things you can do:
Instead of
c = c['1'] * 0.0000001
try doing
c = c['1'].to_numpy() * 0.0000001
This will build a list of NumPy arrays rather than a list of pandas Series.
When constructing fluxes, you can then simply do
fluxes = np.array(c_list)
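Putting both suggestions together, the loop becomes (a sketch, assuming the same file layout and conversion factor as above):
import glob
import numpy as np
import pandas as pd

c_list = []
for file in glob.glob("../c/*.*"):
    c = pd.read_csv(file, sep=" ", header=None)
    c.columns = ["blank", "0", "blank", "blank", "1"]
    c_list.append(c["1"].to_numpy() * 0.0000001)  #NumPy array, not pandas Series
fluxes = np.array(c_list)  #2D array: one row of fluxes per file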

Problem with saving pickle object into arrays from images in python

I have the following class for loading and converting my images into train and test arrays for a deep learning model in TensorFlow 2.
The images are in three folders, named 'Car', 'Cat' and 'Man', which are within the Train and Test main folders. Each image is 300 x 400 pixels.
import os
import pickle
import random   # needed for random.shuffle below
import cv2
import numpy as np

os.getcwd()
out: 'C:\\Users\\me\\Jupiter_Notebooks\\Dataset_Thermal\\SeekThermal'
path_train = "../SeekThermal/Train"
path_test = "../SeekThermal/Test"

class MasterImage(object):

    def __init__(self,PATH='', IMAGE_SIZE = 50):
        self.PATH = PATH
        self.IMAGE_SIZE = IMAGE_SIZE
        self.image_data = []
        self.x_data = []
        self.y_data = []
        self.CATEGORIES = []
        # This will get List of categories
        self.list_categories = []

    def get_categories(self):
        for path in os.listdir(self.PATH):
            if '.DS_Store' in path:
                pass
            else:
                self.list_categories.append(path)
        print("Found Categories ",self.list_categories,'\n')
        return self.list_categories

    def process_image(self):
        try:
            """
            Return Numpy array of image
            :return: X_Data, Y_Data
            """
            self.CATEGORIES = self.get_categories()
            for categories in self.CATEGORIES:                          # Iterate over categories
                train_folder_path = os.path.join(self.PATH, categories) # Folder Path
                class_index = self.CATEGORIES.index(categories)         # this will get index for classification
                for img in os.listdir(train_folder_path):               # This will iterate in the Folder
                    new_path = os.path.join(train_folder_path, img)     # image Path
                    try:                                                # if any image is corrupted
                        image_data_temp = cv2.imread(new_path)          # Read Image as numbers
                        image_temp_resize = cv2.resize(image_data_temp,(self.IMAGE_SIZE,self.IMAGE_SIZE))
                        self.image_data.append([image_temp_resize,class_index])
                        random.shuffle(self.image_data)
                    except:
                        pass
            data = np.asanyarray(self.image_data)  # or, data = np.asanyarray(self.image_data,dtype=object)
            # Iterate over the Data
            for x in data:
                self.x_data.append(x[0])           # Get the X_Data
                self.y_data.append(x[1])           # get the label
            X_Data = np.asarray(self.x_data) / (255.0)  # Normalize Data
            Y_Data = np.asarray(self.y_data)
            # reshape x_Data
            X_Data = X_Data.reshape(-1, self.IMAGE_SIZE, self.IMAGE_SIZE, 3)
            return X_Data, Y_Data
        except:
            print("Failed to run Function Process Image ")

    def pickle_image(self):
        """
        :return: None Creates a Pickle Object of DataSet
        """
        # Call the Function and Get the Data
        X_Data,Y_Data = self.process_image()
        # Write the Entire Data into a Pickle File
        pickle_out = open('X_Data','wb')
        pickle.dump(X_Data, pickle_out)
        pickle_out.close()
        # Write the Y Label Data
        pickle_out = open('Y_Data', 'wb')
        pickle.dump(Y_Data, pickle_out)
        pickle_out.close()
        print("Pickled Image Successfully ")
        return X_Data,Y_Data

    def load_dataset(self):
        try:
            # Read the Data from Pickle Object
            X_Temp = open('..\SeekThermal\X_Data','rb')
            X_Data = pickle.load(X_Temp)
            Y_Temp = open('..\SeekThermal\Y_Data','rb')
            Y_Data = pickle.load(Y_Temp)
            print('Reading Dataset from Pickle Object')
            return X_Data,Y_Data
        except:
            print('Could not Found Pickle File ')
            print('Loading File and Dataset ..........')
            X_Data,Y_Data = self.pickle_image()
            return X_Data,Y_Data
I don't understand what the problem is with the pickle file, because just last week I was able to create these arrays successfully with the same code.
Is there an easier way to load images in TensorFlow rather than through the custom defined class?
a = MasterImage(PATH = path_train,IMAGE_SIZE = 224)
a.process_image()
out:
it produces an array with a warning.
VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
data = np.asanyarray(self.image_data)
a.pickle_image()
out:
TypeError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_1692\507657192.py in <cell line: 1>()
----> 1 a.pickle_image()
~\AppData\Local\Temp\ipykernel_1692\1410849712.py in pickle_image(self)
71 """
72 # Call the Function and Get the Data
---> 73 X_Data,Y_Data = self.process_image()
74
75 # Write the Entire Data into a Pickle File
TypeError: cannot unpack non-iterable NoneType object
a.load_dataset()
out:
Could not Found Pickle File
Loading File and Dataset ..........
Found Categories ['Car', 'Cat', 'Man', 'Car', 'Cat', 'Man']
Pickled Image Successfully
I'm running Python 3.8.8 via anaconda on Windows 10. Thank you for any advice.
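On the side question about an easier loader: recent TensorFlow 2 releases ship a built-in directory loader that infers labels from subfolder names, which avoids the custom class entirely. A minimal sketch, assuming the Train folder keeps its Car/Cat/Man layout (on older 2.x versions the same function lives under tf.keras.preprocessing):
import tensorflow as tf

#builds a batched, labeled tf.data.Dataset straight from the folder structure;
#class indices are inferred from the subfolder names (Car, Cat, Man)
train_ds = tf.keras.utils.image_dataset_from_directory(
    "../SeekThermal/Train",
    image_size=(224, 224),
    batch_size=32)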

Writing array with pywin32 to excel

I am trying to write an array into Excel. The code snippet is below.
import win32com.client as win32
import sys
import numpy as np
#--------------
if __name__=="__main__":
    name="nlve9.xlsm"
    excel=win32.Dispatch('Excel.Application')
    sheet=excel.Workbooks(name).Worksheets('NLVE')
    #--------------
    testress=np.zeros(5000)
    for i in range(0,5000):
        testress[i]=float(i)
    sheet.Range("AC18:AC5016").value=testress
    #excel.ScreenUpdating = False
    #for i in range(18,5017):
    #    sheet.Range("AC" + str(i)).value=testress[i-18]
    #excel.ScreenUpdating = True
    sys.exit()
When this runs, I get a column of zeros the length of testress. When I replace the last line with the loop below, it works, but it is excruciatingly slow. This is part of an optimization problem, so this will run hundreds of times; hence, I need it to be fast.
for i in range(18,5017):
    sheet.Range("AC" + str(i)).value=testress[i-18]
What am I doing wrong with the first method (sheet.Range("AC18:AC5016").value=testress)?
If you are using the Range.Value property to set an array, Excel needs a 2D array of row & column values, even if your data is a 1D array.
[[r1c1,r1c2,...],[r2c1,r2c2,...] ...]
As an example:
import win32com.client as wc
xl = wc.gencache.EnsureDispatch('Excel.Application')
xl.Visible = True
wb = xl.Workbooks.Add()
sh = wb.Sheets[1]
sh.Range('A1:A10').Value = [[i] for i in range(10)]
yielding a single column containing the values 0 through 9 in cells A1:A10.
EDIT:
From the OP's code, change:
testress=np.zeros(5000)
for i in range(0,5000):
    testress[i]=float(i)
sheet.Range("AC18:AC5016").value=testress
to:
rowCount = 5000
testress = [[i] for i in range(rowCount)]
sheet.Range('AC18:AC'+str(18+rowCount-1)).Value = testress
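If you would rather keep the NumPy array from the question, reshaping it into a column and converting to nested lists gives the same 2D structure (a sketch, not tested against this particular workbook):
import numpy as np

testress = np.arange(5000, dtype=float)
#reshape(-1, 1) turns the 1D array into a 5000 x 1 column;
#tolist() produces the nested [[v], [v], ...] lists Excel expects
sheet.Range("AC18:AC5017").Value = testress.reshape(-1, 1).tolist()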

Incrementing over a URL variable

import urllib2
import pandas as pd
from bs4 import BeautifulSoup

x = 0
i = 1
data = []
while (i < 13):
    soup = BeautifulSoup(urllib2.urlopen(
        'http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%d&seasonId=2018&startIndex=' % i, +str(x)).read(), 'html')
    tableStats = soup.find("table", ("class", "playerTableTable tableBody"))
    for row in tableStats.findAll('tr')[2:]:
        col = row.findAll('td')
        try:
            name = col[0].a.string.strip()
            opp = col[1].a.string.strip()
            rec = col[10].string.strip()
            yds = col[11].string.strip()
            dt = col[12].string.strip()
            pts = col[13].string.strip()
            data.append([name, opp, rec, yds, dt, pts])
        except Exception as e:
            pass
    df = pd.DataFrame(data=data, columns=[
        'PLAYER', 'OPP', 'REC', 'YDS', 'TD', 'PTS'])
    df
    i += 1
I have been working on a fantasy football program and I am trying to increment the data over all weeks so I can create a dataframe of the top 40 players for each week.
I have been able to get it for any week of my choice by manually entering the week number in the scoringPeriodId part of the URL, but I am trying to programmatically increment it over each week to make it easier. I have tried using PeriodId='+ i +' and PeriodId=%d, but I keep getting various errors about str and int concatenation and bad operands. Any suggestions or tips?
Try removing the comma between %i and str(x) to concatenate the strings and see if that helps.
soup = BeautifulSoup(urllib2.urlopen('http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%d&seasonId=2018&startIndex='%i, +str(x)).read(), 'html')
should be:
soup = BeautifulSoup(urllib2.urlopen('http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%d&seasonId=2018&startIndex='%i +str(x)).read(), 'html')
If you have problems concatenating or formatting the URL, assign it to a variable instead of writing it on one line with BeautifulSoup and urllib2.urlopen.
Use parentheses to format with multiple values, like "before %s is %s" % (1, 0):
url = 'http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%s&seasonId=2018&startIndex=%s' % (i, x)
# or
#url = 'http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%s&seasonId=2018&startIndex=0' % i
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
Making the code shorter will not affect the performance.
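Putting it together, the loop from the question can build the URL once per iteration (a sketch in the question's Python 2 style; the row-parsing code stays unchanged):
x = 0
data = []
for i in range(1, 13):
    url = ('http://games.espn.com/ffl/tools/projections?&slotCategoryId=4'
           '&scoringPeriodId=%s&seasonId=2018&startIndex=%s' % (i, x))
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    #... find the table and append rows to data exactly as before ...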

Wipe out dropout operations from TensorFlow graph

I have a trained frozen graph that I am trying to run on an ARM device. Basically, I am using contrib/pi_examples/label_image, but with my network instead of Inception. My network was trained with dropout, which now causes me trouble:
Invalid argument: No OpKernel was registered to support Op 'Switch' with these attrs. Registered kernels:
device='CPU'; T in [DT_FLOAT]
device='CPU'; T in [DT_INT32]
device='GPU'; T in [DT_STRING]
device='GPU'; T in [DT_BOOL]
device='GPU'; T in [DT_INT32]
device='GPU'; T in [DT_FLOAT]
[[Node: l_fc1_dropout/cond/Switch = Switch[T=DT_BOOL](is_training_pl, is_training_pl)]]
One solution I can see is to build a TF static library that includes the corresponding operation. On the other hand, it might be a better idea to eliminate the dropout ops from the network in order to make it simpler and faster. Is there a way to do that?
Thanks.
#!/usr/bin/env python2

import argparse
import tensorflow as tf
from google.protobuf import text_format
from tensorflow.core.framework import graph_pb2
from tensorflow.core.framework import node_def_pb2

def print_graph(input_graph):
    for node in input_graph.node:
        print "{0} : {1} ( {2} )".format(node.name, node.op, node.input)

def strip(input_graph, drop_scope, input_before, output_after, pl_name):
    input_nodes = input_graph.node
    nodes_after_strip = []
    for node in input_nodes:
        print "{0} : {1} ( {2} )".format(node.name, node.op, node.input)
        if node.name.startswith(drop_scope + '/'):
            continue
        if node.name == pl_name:
            continue
        new_node = node_def_pb2.NodeDef()
        new_node.CopyFrom(node)
        if new_node.name == output_after:
            new_input = []
            for node_name in new_node.input:
                if node_name == drop_scope + '/cond/Merge':
                    new_input.append(input_before)
                else:
                    new_input.append(node_name)
            del new_node.input[:]
            new_node.input.extend(new_input)
        nodes_after_strip.append(new_node)
    output_graph = graph_pb2.GraphDef()
    output_graph.node.extend(nodes_after_strip)
    return output_graph

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-graph', action='store', dest='input_graph')
    parser.add_argument('--input-binary', action='store_true', default=True, dest='input_binary')
    parser.add_argument('--output-graph', action='store', dest='output_graph')
    parser.add_argument('--output-binary', action='store_true', dest='output_binary', default=True)
    args = parser.parse_args()
    input_graph = args.input_graph
    input_binary = args.input_binary
    output_graph = args.output_graph
    output_binary = args.output_binary
    if not tf.gfile.Exists(input_graph):
        print("Input graph file '" + input_graph + "' does not exist!")
        return
    input_graph_def = tf.GraphDef()
    mode = "rb" if input_binary else "r"
    with tf.gfile.FastGFile(input_graph, mode) as f:
        if input_binary:
            input_graph_def.ParseFromString(f.read())
        else:
            text_format.Merge(f.read().decode("utf-8"), input_graph_def)
    print "Before:"
    print_graph(input_graph_def)
    output_graph_def = strip(input_graph_def, u'l_fc1_dropout', u'l_fc1/Relu', u'prediction/MatMul', u'is_training_pl')
    print "After:"
    print_graph(output_graph_def)
    if output_binary:
        with tf.gfile.GFile(output_graph, "wb") as f:
            f.write(output_graph_def.SerializeToString())
    else:
        with tf.gfile.GFile(output_graph, "w") as f:
            f.write(text_format.MessageToString(output_graph_def))
    print("%d ops in the final graph." % len(output_graph_def.node))

if __name__ == "__main__":
    main()
How about this as a more general solution:
for node in temp_graph_def.node:
    for idx, i in enumerate(node.input):
        input_clean = node_name_from_input(i)
        if input_clean.endswith('/cond/Merge') and input_clean.split('/')[-3].startswith('dropout'):
            identity = node_from_map(input_node_map, i).input[0]
            assert identity.split('/')[-1] == 'Identity'
            parent = node_from_map(input_node_map, node_from_map(input_node_map, identity).input[0])
            pred_id = parent.input[1]
            assert pred_id.split('/')[-1] == 'pred_id'
            good = parent.input[0]
            node.input[idx] = good
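For reference, node_name_from_input and node_from_map look like the helpers of the same names in TensorFlow's tensorflow.python.tools.optimize_for_inference_lib (an assumption; any equivalent name-normalising and node-lookup helpers would do), and input_node_map is a name-to-NodeDef dict built up front, e.g.:
#hypothetical setup for the snippet above: index every node by name
input_node_map = {}
for node in temp_graph_def.node:
    input_node_map[node.name] = node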
