import urllib2
import pandas as pd
from bs4 import BeautifulSoup

x = 0
i = 1
data = []
while (i < 13):
    soup = BeautifulSoup(urllib2.urlopen(
        'http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%d&seasonId=2018&startIndex=' % i, +str(x)).read(), 'html')
    tableStats = soup.find("table", ("class", "playerTableTable tableBody"))
    for row in tableStats.findAll('tr')[2:]:
        col = row.findAll('td')
        try:
            name = col[0].a.string.strip()
            opp = col[1].a.string.strip()
            rec = col[10].string.strip()
            yds = col[11].string.strip()
            dt = col[12].string.strip()
            pts = col[13].string.strip()
            data.append([name, opp, rec, yds, dt, pts])
        except Exception as e:
            pass
    df = pd.DataFrame(data=data, columns=[
        'PLAYER', 'OPP', 'REC', 'YDS', 'TD', 'PTS'])
    df
    i += 1
I have been working on a fantasy football program and I am trying to iterate over all weeks so I can create a dataframe of the top 40 players for each week.
I have been able to get it for any week of my choice by manually entering the week number in the scoringPeriodId part of the URL, but I am trying to increment it programmatically over each week to make it easier. I have tried using PeriodId='+ i +' and PeriodId=%d, but I keep getting various errors about str/int concatenation and bad operand types. Any suggestions or tips?
Try removing the comma between %i and str(x) to concatenate the strings and see if that helps.
soup = BeautifulSoup(urllib2.urlopen('http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%d&seasonId=2018&startIndex='%i, +str(x)).read(), 'html')
should be:
soup = BeautifulSoup(urllib2.urlopen('http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%d&seasonId=2018&startIndex='%i +str(x)).read(), 'html')
If you have problems concatenating or formatting the URL, build it in a separate variable instead of writing it all on one line inside BeautifulSoup and urllib2.urlopen.
Use parentheses to format with multiple values, like "before %s is %s" % (1, 0):
url = 'http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%s&seasonId=2018&startIndex=%s' % (i, x)
# or
#url = 'http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%s&seasonId=2018&startIndex=0' % i
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
Making the code shorter will not affect the performance.
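Putting it all together, here is a minimal sketch of the corrected loop (assuming Python 2 and the same ESPN projections URL as above; the DataFrame is built once after the loop, and an illustrative WEEK column is added so rows from different weeks stay distinguishable):

import urllib2
import pandas as pd
from bs4 import BeautifulSoup

x = 0  # startIndex: 0 gives the first page of 40 players
data = []
for week in range(1, 13):  # scoringPeriodId 1 through 12
    url = ('http://games.espn.com/ffl/tools/projections?&slotCategoryId=4'
           '&scoringPeriodId=%d&seasonId=2018&startIndex=%d' % (week, x))
    soup = BeautifulSoup(urllib2.urlopen(url).read(), 'html.parser')
    tableStats = soup.find("table", ("class", "playerTableTable tableBody"))
    for row in tableStats.findAll('tr')[2:]:
        col = row.findAll('td')
        try:
            data.append([week,
                         col[0].a.string.strip(),   # PLAYER
                         col[1].a.string.strip(),   # OPP
                         col[10].string.strip(),    # REC
                         col[11].string.strip(),    # YDS
                         col[12].string.strip(),    # TD
                         col[13].string.strip()])   # PTS
        except Exception:
            pass  # skip rows that lack the expected cells

df = pd.DataFrame(data, columns=['WEEK', 'PLAYER', 'OPP', 'REC', 'YDS', 'TD', 'PTS'])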
I am trying to identify Global Feature Relationships with SHAP values. The SHAP library returns three matrices and I am trying to select the SHAP-values matrix; however, I am getting this error: "IndexError: too many indices for array: array is 2-dimensional, but 3 were indexed".
The code I have is below:
df_score = spark.sql("select * from sandbox.yt_trng_churn_device")

#XGBoost Model
import datetime
import pickle
import pandas as pd
from xgboost import XGBClassifier
from mlflow.tracking import MlflowClient

client = MlflowClient()
local_dir = "/dbfs/FileStore/"
local_path = client.download_artifacts
model_path = '/dbfs/FileStore/'
model = XGBClassifier()
model = pickle.load(open(model_path, 'rb'))

HorizonDate = datetime.datetime(2022, 9, 5)
df = df_score
score_data = df.toPandas()
results = model.predict_proba(score_data)
results_l = model.predict(score_data)
score_data["p"] = pd.Series((v[1] for v in results))
score_data["l"] = pd.Series((v for v in results_l))
spark.createDataFrame(score_data).createOrReplaceTempView("yt_vw_tmp_dev__scores")
spark.sql("create or replace table sandbox.yt_vw_tmp_dev__scores as select * from yt_vw_tmp_dev__scores")
#SHAP Analysis on XGBoost
from shap import KernelExplainer, summary_plot
sql = """
select d_a.*
from hive_metastore.sandbox.yt_trng_device d_a
right join (
    select decile, msisdn, MSISDN_L2L
    from (
        select ntile(10) over (order by p desc) as decile, msisdn, MSISDN_L2L
        from sandbox.yt_vw_tmp_dev__scores
    ) inc
    order by decile
) d_b
on d_a.MSISDN_L2L = d_b.MSISDN_L2L and d_a.msisdn = d_b.msisdn
"""
df = spark.sql(sql).drop('msisdn', 'imei', 'imsi', 'event_date', 'MSISDN_L2L', 'account_id')
score_df = df.toPandas()
mode = score_df.mode().iloc[0]
sample = score_df.sample(n=min(100, score_df.shape[0]), random_state=508502835).fillna(mode)
predict = lambda x: model.predict(pd.DataFrame(x, columns=score_df.columns))
explainer = KernelExplainer(predict, sample, link="identity")
shap_values = explainer.shap_values(sample, l1_reg=False)
# The return of the explainer has three matrices, we will get the shap values one
shap_values = shap_values[ :, :, 0]
I am fairly new to coding, but it would be great if someone could give me some direction on this.
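For what it's worth, a likely culprit (an assumption, not a confirmed diagnosis): KernelExplainer returns one matrix per model output, so with a single-output predict function like the lambda above, shap_values comes back as a plain 2-D (samples x features) array, and indexing it with three indices raises exactly this IndexError. A defensive sketch, reusing the explainer built above:

import numpy as np

shap_values = explainer.shap_values(sample, l1_reg=False)
# Multi-output models yield a list (or, in newer SHAP versions, a 3-D array)
# of per-class matrices; a single-output predict function yields one 2-D matrix.
if isinstance(shap_values, list):
    shap_values = shap_values[0]
elif np.ndim(shap_values) == 3:
    shap_values = shap_values[:, :, 0]
# shap_values is now 2-D (samples x features) in every case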
I'm currently writing some code and am using pandas to export all of the data into CSV files. My program runs multiple iterations until it has gone through all of the necessary files. Pandas re-writes one file each iteration, but when it moves onto the next file I need it to reset all of the data (I think).
The structure is roughly:
While loop > a few variables are named > program runs > dataframe = pandas.DataFrame(averagepercentagelist, index=namelist, columns=header)
This part works with no problem for one file. When moving onto the next file, all of the arrays I use are reset, and this I think is why pandas gives the error Shape of passed values is (1, 1), indices imply (3, 1).
Please let me know if I need to explain it better.
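For reference, pandas raises that error whenever the data and the index disagree in length; a minimal reproduction with hypothetical values (the message wording varies slightly between pandas versions):

import pandas as pd

# one data row, but a three-entry index:
pd.DataFrame(["12"], index=["Name0", "Name1", "Name2"], columns=["Average Percentage"])
# ValueError: Shape of passed values is (1, 1), indices imply (3, 1)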
EDIT:
while True:
    try:
        averagepercentagelist=[]
        namelist=[]
        columns=[]
        for row in database:
            averagepercentagelist=["12","23"]
            namelist=["Name0","Name1"]
            columns=["Average percentage"]
            dataframe=(pandas.DataFrame(averagepercentagelist,index=namelist,columns=header))
    except Exception as e:
        print(e)
        break
SNIPPET:
dataframe= (pandas.DataFrame(averagepercentagelist,index=namelist,columns=header))
currentcalculatedatafrane = 'averages' + currentcalculate
dataframeexportpath = os.path.join(ROOT_PATH,'Averages',currentcalculatedatafrane)
dataframe.to_csv(dataframeexportpath)
FULL PROGRAM SO FAR:
import csv
import os
import re
import pandas
import tkinter as tk
from tkinter import messagebox
from os.path import isfile, join
from os import listdir
import time

ROOT_PATH = os.path.dirname(os.path.abspath(__file__))
indexforcalcu=0
line_count=0
testlist=[]
namelist=[]
header=['Average Percentage']

def clearvariables():
    indexforcalcu=0
    testlist=[]

def findaverageofstudent(findaveragenumber,numoftests):
    total=0
    findaveragenumber = findaveragenumber/numoftests
    findaveragenumber = round(findaveragenumber, 1)
    return findaveragenumber

def removecharacters(nameforfunc):
    nameforfunc=str(nameforfunc)
    elem=re.sub("[{'}]", "",nameforfunc)
    return elem

def getallclasses():
    onlyfiles = [f for f in listdir(ROOT_PATH) if isfile(join(ROOT_PATH, f))]
    onlyfiles.remove("averagecalculatorv2.py")
    return onlyfiles

def findaveragefunc():
    indexforcalcu=-1
    while True:
        try:
            totaltests=0
            line_count=0
            averagepercentagelist=[]
            indexforcalcu=indexforcalcu+1
            allclasses=getallclasses()
            currentcalculate=allclasses[indexforcalcu]
            classpath = os.path.join(ROOT_PATH, currentcalculate)
            with open(classpath) as csv_file:
                classscoredb = csv.reader(csv_file, delimiter=',')
                for i, row in enumerate(classscoredb):
                    if line_count == 0:
                        while True:
                            try:
                                totaltests=totaltests+1
                                rowreader= {row[totaltests]}
                            except:
                                totaltests=totaltests-1
                                line_count = line_count + 1
                                break
                    else:
                        calculating_column_location=1
                        total=0
                        while True:
                            try:
                                total = total + int(row[calculating_column_location])
                                calculating_column_location = calculating_column_location + 1
                            except:
                                break
                        i=str(i)
                        name=row[0]
                        cleanname=removecharacters(nameforfunc=name)
                        namelist.append(cleanname)
                        findaveragenumbercal=findaverageofstudent(findaveragenumber=total,numoftests=totaltests)
                        averagepercentagelist.append(findaveragenumbercal)
                        line_count = line_count + 1
                        dataframe= (pandas.DataFrame(averagepercentagelist,index=namelist,columns=header))
                        currentcalculatedatafrane = 'averages' + i + currentcalculate
                        dataframeexportpath = os.path.join(ROOT_PATH,'Averages',currentcalculatedatafrane)
                        dataframe.to_csv(dataframeexportpath)
                        i=int(i)
        except Exception as e:
            print("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n",e)
            break

def makenewclass():
    global newclassname
    getclassname=str(newclassname.get())
    if getclassname == "":
        messagebox.showerror("Error","The class name you have entered is invalid.")
    else:
        classname = getclassname + ".csv"
        with open(classname, mode='w') as employee_file:
            classwriter = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            classwriter.writerow(["Name","Test 1"])

root=tk.Tk()
root.title("Test result average finder")
findaveragebutton=tk.Button(root,text="Find Averages",command=findaveragefunc())
findaveragebutton.grid(row=2,column=2,padx=(10, 10),pady=(0,10))
classnamelabel=tk.Label(root, text="Class name:")
classnamelabel.grid(row=1, column=0,padx=(10,0),pady=(10,10))
newclassname = tk.Entry(root)
newclassname.grid(row=1,column=1,padx=(10, 10))
newclassbutton=tk.Button(root,text="Create new class",command=makenewclass)
newclassbutton.grid(row=1,column=2,padx=(0, 10),pady=(10,10))
root.mainloop()
Thanks in advance,
Sean
Use:
import glob, os
import pandas as pd
ROOT_PATH = os.path.dirname(os.path.abspath(__file__))
#extract all csv files to list
files = glob.glob(f'{ROOT_PATH}/*.csv')
print (files)
#create new folder if necessary
new = os.path.join(ROOT_PATH,'Averages')
if not os.path.exists(new):
os.makedirs(new)
#loop each file
for f in files:
    #create DataFrame and convert first column to index
    df = pd.read_csv(f, index_col=[0])
    #count average in each row, round and create a one-column DataFrame
    avg = df.mean(axis=1).round(1).to_frame('Average Percentage')
    #remove index name if necessary
    avg.index.name = None
    print (avg)
    #create new path
    head, tail = os.path.split(f)
    path = os.path.join(head, 'Averages', tail)
    print (path)
    #write DataFrame to csv
    avg.to_csv(path)
I have been trying to do some preprocessing on the Sentiment140 database on Kaggle: https://www.kaggle.com/kazanova/sentiment140
The code I'm using is this:
import os
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import RegexpTokenizer

Base_location = ''
dataset_location = os.path.join(Base_location, 'Sentiment140.csv')
corpus = []
labels = []

# Parse tweets and sentiments
with open(dataset_location, 'r', encoding='latin-1') as df:
    for i, line in enumerate(df):
        parts = line.strip().split(',')
        # Sentiment (0 = Negative, 1 = Positive)
        labels.append(str(parts[0].strip()))
        # Tweet
        tweet = parts[5].strip()
        if tweet.startswith('"'):
            tweet = tweet[1:]
        if tweet.endswith('"'):
            tweet = tweet[:-1]
        corpus.append(tweet.strip().lower())

print('Corpus size: {}'.format(len(corpus)))

# Tokenize and stem
tkr = RegexpTokenizer('[a-zA-Z0-9#]+')
stemmer = LancasterStemmer()
tokenized_corpus = []
for i, tweet in enumerate(corpus):
    tokens = [stemmer.stem(t) for t in tkr.tokenize(tweet) if not t.startswith('#')]
    tokenized_corpus.append(tokens)
print(tokenized_corpus)
However, I keep getting this error:
TypeError: '_io.TextIOWrapper' object is not subscriptable
Can anyone help me understand how to solve the issue?
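(For what it's worth, this error message means the file object itself was indexed somewhere, since the loop above never subscripts df; a minimal repro, with a hypothetical file name:

with open('Sentiment140.csv', 'r', encoding='latin-1') as df:
    parts = df[0]  # TypeError: '_io.TextIOWrapper' object is not subscriptable

so it is worth checking for a stray df[...] access elsewhere in the script.)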
Thanks in advance
TL;DR
To read .csv or other structured datasets, use pandas (https://pandas.pydata.org/) or any other dataframe library.
In Long:
Instead of doing:
Base_location = ''
dataset_location = os.path.join(Base_location, 'Sentiment140.csv')
corpus = []
labels = []

# Parse tweets and sentiments
with open(dataset_location, 'r', encoding='latin-1') as df:
    for i, line in enumerate(df):
        parts = line.strip().split(',')
        # Sentiment (0 = Negative, 1 = Positive)
        labels.append(str(parts[0].strip()))
        # Tweet
        tweet = parts[5].strip()
        if tweet.startswith('"'):
            tweet = tweet[1:]
        if tweet.endswith('"'):
            tweet = tweet[:-1]
        corpus.append(tweet.strip().lower())
You could simply read the .csv file with pandas, e.g.
import pandas as pd
corpus = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1')
Then use the .apply() function to process the tweets:
"""
Columns
====
target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
ids: The id of the tweet ( 2087)
date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
flag: The query (lyx). If there is no query, then this value is NO_QUERY.
user: the user that tweeted (robotickilldozr)
text: the text of the tweet (Lyx is cool)
"""
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import RegexpTokenizer
import pandas as pd

df = pd.read_csv('training.1600000.processed.noemoticon.csv',
                 header=None,
                 names=['target', 'ids', 'date', 'flag', 'user', 'text'],
                 encoding='latin-1')

tokenizer = RegexpTokenizer('[a-zA-Z0-9#]+')
stemmer = LancasterStemmer()

def process_tweet(tweet):
    return [stemmer.stem(token) if not token.startswith('#') else token
            for token in tokenizer.tokenize(tweet)]

# 1. Cast the column type to string
# 2. Lowercase it
# 3. Iterate through each row and get the output from process_tweet()
# 4. Keep it in a new column called `tokenized_text`
df['tokenized_text'] = df['text'].str.lower().apply(process_tweet)
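If you still need the flat corpus and labels lists from the original loop, they fall straight out of the dataframe (note that, per the column notes above, the raw polarity codes are 0/2/4 rather than 0/1):

corpus = df['text'].str.lower().tolist()
labels = df['target'].astype(str).tolist()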
I'm trying to read an input file in Scala that I know the structure of, however I only need every 9th entry. So far I have managed to read the whole thing using:
val lines = sc.textFile("hdfs://moonshot-ha-nameservice/" + args(0))
val fields = lines.map(line => line.split(","))
The issue: this leaves me with an array that is huge (we're talking 20 GB of data). Not only have I found myself forced to write some very ugly code in order to convert between RDD[Array[String]] and Array[String], but it has essentially made my code useless.
I've tried different approaches and mixes between using
.map()
.flatMap() and
.reduceByKey()
but nothing actually put my collected "cells" into the format that I need them in.
Here's what is supposed to happen: Reading a folder of text files from our server, the code should read each "line" of text in the format:
*---------*
| NASDAQ: |
*---------*
exchange, stock_symbol, date, stock_price_open, stock_price_high, stock_price_low, stock_price_close, stock_volume, stock_price_adj_close
and only keep hold of the stock_symbol, as that is the identifier I'm counting. So far my attempt has been to turn the entire thing into an array and collect only every 9th index into a collected_cells var. The issue is, based on my calculations and real-life results, that code would take 335 days to run (no joke).
Here's my current code for reference:
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object SparkNum {

  def main(args: Array[String]) {
    // Do some Scala voodoo
    val sc = new SparkContext(new SparkConf().setAppName("Spark Numerical"))

    // Set input file as per HDFS structure + input args
    val lines = sc.textFile("hdfs://moonshot-ha-nameservice/" + args(0))
    val fields = lines.map(line => line.split(","))

    var collected_cells: Array[String] = new Array[String](0)
    //println("[MESSAGE] Length of CC: " + collected_cells.length)
    val divider: Long = 9
    val array_length = fields.count / divider
    val casted_length = array_length.toInt
    val indexedFields = fields.zipWithIndex
    val indexKey = indexedFields.map{case (k,v) => (v,k)}
    println("[MESSAGE] Number of lines: " + array_length)
    println("[MESSAGE] Casted length of: " + casted_length)

    for( i <- 1 to casted_length ) {
      println("[URGENT DEBUG] Processing line " + i + " of " + casted_length)
      var index = 9 * i - 8
      println("[URGENT DEBUG] Index defined to be " + index)
      collected_cells :+ indexKey.lookup(index)
    }
    println("[MESSAGE] collected_cells size: " + collected_cells.length)

    val single_cells = collected_cells.flatMap(collected_cells => collected_cells);
    val counted_cells = single_cells.map(cell => (cell, 1)).reduceByKey{case (x, y) => x + y}
    // val result = counted_cells.reduceByKey((a,b) => (a+b))
    // val inmem = counted_cells.persist()
    //
    // // Collect driver into file to be put into user archive
    // inmem.saveAsTextFile("path to server location")
    // ==> Not necessary to save the result as processing time is recorded, not output
  }
}
The bottom part is currently commented out as I tried to debug it, but it acts as pseudo-code so I know what I need done. I should point out that I am barely familiar with Scala at all, and hence things like the _ notation confuse the life out of me.
Thanks for your time.
There are some concepts that need clarification in the question:
When we execute this code:
val lines = sc.textFile("hdfs://moonshot-ha-nameservice/" + args(0))
val fields = lines.map(line => line.split(","))
That does not result in a huge array of the size of the data. That expression represents a transformation of the base data. It can be further transformed until we reduce the data to the information set we desire.
In this case, we want the stock_symbol field of a record encoded as CSV:
exchange, stock_symbol, date, stock_price_open, stock_price_high, stock_price_low, stock_price_close, stock_volume, stock_price_adj_close
I'm also going to assume that the data file contains a banner like this:
*---------*
| NASDAQ: |
*---------*
The first thing we're going to do is to remove anything that looks like this banner. In fact, I'm going to assume that the first field is the name of a stock exchange, which starts with a letter. We will do this before we do any splitting, resulting in:
val lines = sc.textFile("hdfs://moonshot-ha-nameservice/" + args(0))
val validLines = lines.filter(line => !line.isEmpty && line.head.isLetter)
val fields = validLines.map(line => line.split(","))
It helps to write the types of the variables, to have peace of mind that we have the data types that we expect. As we progress in our Scala skills that might become less important. Let's rewrite the expression above with types:
val lines: RDD[String] = sc.textFile("hdfs://moonshot-ha-nameservice/" + args(0))
val validLines: RDD[String] = lines.filter(line => !line.isEmpty && line.head.isLetter)
val fields: RDD[Array[String]] = validLines.map(line => line.split(","))
We are interested in the stock_symbol field, which positionally is the element #1 in a 0-based array:
val stockSymbols:RDD[String] = fields.map(record => record(1))
If we want to count the symbols, all that's left is to issue a count:
val totalSymbolCount = stockSymbols.count()
That's not very helpful because we have one entry for every record. Slightly more interesting questions would be:
How many different stock symbols we have?
val uniqueStockSymbols = stockSymbols.distinct.count()
How many records for each symbol do we have?
val countBySymbol = stockSymbols.map(s => (s,1)).reduceByKey(_+_)
In Spark 2.0, CSV support for Dataframes and Datasets is available out of the box
Given that our data does not have a header row with the field names (what's usual in large datasets), we will need to provide the column names:
val stockDF = sparkSession.read.csv("/tmp/quotes_clean.csv").toDF("exchange", "symbol", "date", "open", "high", "low", "close", "volume", "adj_close")
We can answer our questions very easily now:
val uniqueSymbols = stockDF.select("symbol").distinct().count
val recordsPerSymbol = stockDF.groupBy($"symbol").agg(count($"symbol"))
I'm currently trying to filter a large database using scala. I've written a simple piece of code to match an ID in one database to a list of ID's in another.
Essentially I want to go through database A and if the ID number in the ID column matches one from database B, to extract that entry from Database A.
The code I've written works fine, but it's slow (i.e. it has to run over a couple of days) and I'm trying to find a way to speed it up. It may be that it can't be sped up by much, or it could be much, much faster with better coding.
So any help would be much appreciated.
Below is a description of the databases and a copy of the code.
Database A is approximately 10 GB in size with over 100 million entries and database B has a list of approx. 50,000 IDs.
Each database looks like as follows:
Database A:
ID, DataX, date
10, 100,01012000
15, 20, 01012008
5, 32, 01012006
etc...
Database B:
ID
10
15
12
etc...
My code is as follows:
import scala.io.Source
import java.io._

object filter extends App {

  def ext[T <: Closeable, R](resource: T)(block: T => R): R = {
    try { block(resource) }
    finally { resource.close() }
  }

  val key = io.Source.fromFile("C:\\~Database_B.csv").getLines()
  val key2 = new Array[String](50000)
  key.copyToArray(key2)

  ext(new BufferedWriter(new OutputStreamWriter(new FileOutputStream("C:\\~Output.csv")))) {
    writer =>
      val line = io.Source.fromFile("C:\\~Database_A.csv").getLines.drop(1)
      while (line.hasNext) {
        val data = line.next
        val array = data.split(",").map(_.trim)
        val idA = array(0)
        val dataX = array(1)
        val date = array(2)
        key2.map { idB =>
          if (idA == idB) {
            val print = (idA + "," + dataX + "," + date)
            writer.write(print)
            writer.newLine()
          } else None
        }
      }
  }
}
First, there are far more efficient ways to do this than writing a Scala program. Loading the two tables into a database and doing a join will take about 10 minutes (including data loading) on a modern computer.
Assuming you have to use Scala, there is an obvious improvement. Store your keys in a HashSet and use keys.contains(x) instead of traversing all keys. This gives you O(1) lookups instead of the O(N) you have now, which should speed up your program significantly.
Minor point -- use string interpolation instead of concatenation, i.e.
s"$idA,$dataX,$date"
// instead of
idA + "," + dataX + "," + date
Try this:
import scala.io.Source
import java.io._

object filter extends App {

  def ext[T <: Closeable, R](resource: T)(block: T => R): R = {
    try { block(resource) }
    finally { resource.close() }
  }

  // convert to a Set
  val key2 = io.Source.fromFile("C:\\~Database_B.csv").getLines().toSet

  ext(new BufferedWriter(new OutputStreamWriter(new FileOutputStream("C:\\~Output.csv")))) {
    writer =>
      val lines = io.Source.fromFile("C:\\~Database_A.csv").getLines.drop(1)
      for (data <- lines) {
        val array = data.split(",").map(_.trim)
        array match {
          case Array(idA, dataX, date) =>
            if (key2.contains(idA)) {
              val print = (idA + "," + dataX + "," + date)
              writer.write(print)
              writer.newLine()
            }
          case _ => // invalid input
        }
      }
  }
}
The IDs are now stored in a set, which gives much better lookup performance.