The app executes, but the loop over the range doesn't seem to: my CSV file only contains the first entry. I've also run into index out of range errors when scraping other fields. Any help would be appreciated; I'm still learning.
import requests
import csv
from bs4 import BeautifulSoup

f = csv.writer(open('salons.csv', 'w'))
f.writerow(['Name'])

pages = []
for i in range(0, 10600):
    url = 'http://www.aveda.com/locator/get_the_facts.tmpl?SalonID=' + str(i) + ' '
    pages.append(url)

for item in pages:
    page = requests.get(item)
    soup = BeautifulSoup(page.text, 'lxml')
    salon_name_list = soup.find(class_='getthefacts__store_meta_info--store_phone')
    salon_name_list_items = salon_name_list.find_all('li', class_='phone')
    for salon_name in salon_name_list_items:
        names = salon_name.contents[0]
        f.writerow([names])
The way you are trying to find the phone numbers isn't quite right: the numbers sit inside an anchor (a) tag within the element whose class is phone. Try this instead; it will fetch the phone numbers you are interested in:
import requests
import csv
from bs4 import BeautifulSoup

outfile = open('salons.csv', 'w')
writer = csv.writer(outfile)
writer.writerow(['Name'])

for i in range(0, 10600):
    url = 'http://www.aveda.com/locator/get_the_facts.tmpl?SalonID={0}'.format(i)
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml')
    for salon_name in soup.select('.phone a'):
        names = salon_name.text
        print(names)
        writer.writerow([names])

outfile.close()
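Not every SalonID in that range necessarily resolves to a live page, and a page without any phone markup is one likely source of the index out of range errors mentioned in the question. As a small, hedged variation on the loop above (reusing the writer from the snippet, with the status-code check and the empty-selection check being my additions, not part of the original answer):

for i in range(0, 10600):
    url = 'http://www.aveda.com/locator/get_the_facts.tmpl?SalonID={0}'.format(i)
    page = requests.get(url)
    # Skip IDs that do not return a normal page.
    if page.status_code != 200:
        continue
    soup = BeautifulSoup(page.text, 'lxml')
    phones = soup.select('.phone a')
    # Skip pages with no phone markup instead of indexing into an empty list.
    if not phones:
        continue
    for salon_name in phones:
        writer.writerow([salon_name.text.strip()])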
It's not clear how your code is indented; please format it properly in the question. You also may not need two for loops.
import requests
import csv
from bs4 import BeautifulSoup

f = csv.writer(open('salons.csv', 'w'))
f.writerow(['Name'])

for i in range(0, 10600):
    url = 'http://www.aveda.com/locator/get_the_facts.tmpl?SalonID=' + str(i) + '/'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml')
    salon_name_list = soup.find(class_='getthefacts__store_meta_info--store_phone')
    salon_name_list_items = salon_name_list.find_all('li', class_='phone')
    for salon_name in salon_name_list_items:
        names = salon_name.contents[0]
        f.writerow([names])
I have written this code. I want to get all the data from all the pages and store it in a CSV file, but I don't know what to do next. I can do this with BeautifulSoup alone, but it isn't the same with the Selenium and BeautifulSoup combination.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
from bs4 import BeautifulSoup as bs
import pandas

chrome_option = Options()
chrome_option.add_argument("--headless")
browser = webdriver.Chrome(executable_path="D:/chromedriver.exe", chrome_options=chrome_option)

# lists of data
names = []
license_num = []
types = []
contacts = []
links = []

def scrape(url):
    browser.get(url)
    sleep(5)
    html = browser.execute_script("return document.documentElement.outerHTML")
    sel_soup = bs(html, "html.parser")
    containers = sel_soup.find_all(class_="d-agent-card container_13WXz card_2AIgF")
    for cont in containers:
        # agent name
        agent = cont.find("h4").text.strip()
        names.append(agent)
        # agent type
        tp = cont.find("h5").text.strip()
        types.append(tp)
        # agent contact
        contact = cont.find("button", {"class": "button_3pYtF icon-left_1xpTg secondary_KT7Sy link_-iSRx contactLink_BgG5h"})
        if contact is not None:
            contacts.append(contact.text.strip())
        elif contact is None:
            contacts.append("None")
        # agent link
        link = cont.find("div", {"class": "linksContainer_1-v7q"}).find("a")
        if link is not None:
            links.append(link["href"])
        elif link is None:
            links.append("None")
        # license
        licns = cont.find("p", {"class": "license_33m8Z"}).text
        license_num.append(licns)

for page in range(1, 27):
    urls = f"https://www.remax.com/real-estate-agents/Dallas-TX?page={page}"
    scrape(urls)

df = pandas.DataFrame({
    "Agent Name": names,
    "Agent Type": types,
    "Agent License Number": license_num,
    "Agent contact Number": contacts,
    "Agent URL": links
})
df.to_csv("data.csv", index=False)
With this I am only getting 596 rows of data, but I expect 24 x 25 + 19 = 619 rows: every page has 24 rows of data (the last one has 19), and I want all of them. It seems I am only getting the data from about 23 pages. Now I am also getting an error:
{"QTM: JSEvent: TypeError: Cannot read property 'data' of undefined !!window.QuantumMetricAPI.lastXHR.data && !!JSON.parse(QuantumMetricAPI.lastXHR.data).term"}
So I have a very simple Tkinter GUI which takes an input parameter and inserts it into an SQLite database. I'm looking to create a secondary GUI which will extract this parameter from the SQLite database and display it. Can you please help with how to do this? Preferably I want to display this data from the DB in a text field or something of the like.
from Tkinter import *
from PIL import Image, ImageTk
import sqlite3

root = Tk()
root.wm_attributes('-fullscreen', 'true')
root.title("My Test GUI")
Fullname = StringVar()

conn = sqlite3.connect('Form.db')
cursor = conn.cursor()

def database():
    name1 = Fullname.get()
    cursor.execute('CREATE TABLE IF NOT EXISTS Student (Fullname TEXT)')
    cursor.execute('INSERT INTO Student (FullName) VALUES(?)', (name1,))
    conn.commit()

def error():
    root1 = Toplevel(root)
    root1.geometry("150x90")
    root1.title("Warning")
    Label(root1, text="All fields required", fg="red").pack()

def read_from_db():
    cursor.execute('SELECT * FROM Student')
    data = cursor.fetchall()
    print(data)

label_0 = Label(root, text="My Test GUI", width=20, font=("bold", 20))
label_0.place(x=650, y=53)
label_1 = Label(root, text="Name", width=20, font=("bold", 10))
label_1.place(x=550, y=130)
entry_1 = Entry(root, textvar=Fullname)
entry_1.place(x=700, y=130)
Button(root, text='Submit', width=20, bg='brown', fg='white', command=database).place(x=650, y=380)

root.mainloop()
read_from_db()
Within your read_from_db function, instead of printing the value of data you can make a label out of it:
def read_from_db():
    cursor.execute("SELECT *, oid FROM Student")
    data = cursor.fetchall()
    showData = ''
    for row in data:
        showData += str(row) + "\n"
    dataLabel = Label(root, text=showData)
    dataLabel.grid(row=0, column=0)
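Since the question mentions wanting the data in a text field rather than a label, a possible variant (my sketch, not part of the original answer) inserts the rows into a Tkinter Text widget instead:

def read_from_db():
    cursor.execute("SELECT * FROM Student")
    data = cursor.fetchall()
    # A Text widget to hold one row per line.
    textBox = Text(root, height=10, width=40)
    textBox.place(x=700, y=200)
    for row in data:
        textBox.insert(END, str(row) + "\n")
    textBox.config(state=DISABLED)  # make it read-only after filling

You can then wire this to a second Button(root, text='Show', command=read_from_db) so the data appears on demand instead of being called after mainloop().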
I'm currently writing some code and am using pandas to export all of the data into CSV files. My program runs multiple iterations until it has gone through all of the necessary files. Pandas rewrites one file each iteration, but when it moves on to the next file I need it to reset all of the data (I think).
Structure is roughly:
While loop>a few variables are named>program runs>dataframe=(pandas.DataFrame(averagepercentagelist,index=namelist,columns=header))
This part works with no problem for one file. When moving on to the next file, all of the lists I use are reset, and I think this is why pandas gives the error: Shape of passed values is (1, 1), indices imply (3, 1).
Please let me know if I need to explain it better.
EDIT:
while True:
    try:
        averagepercentagelist = []
        namelist = []
        columns = []
        for row in database:
            averagepercentagelist = ["12", "23"]
            namelist = ["Name0", "Name1"]
            columns = ["Average percentage"]
        dataframe = (pandas.DataFrame(averagepercentagelist, index=namelist, columns=header))
    except Exception as e:
        print(e)
        break
SNIPPET:
dataframe= (pandas.DataFrame(averagepercentagelist,index=namelist,columns=header))
currentcalculatedatafrane = 'averages' + currentcalculate
dataframeexportpath = os.path.join(ROOT_PATH,'Averages',currentcalculatedatafrane)
dataframe.to_csv(dataframeexportpath)
FULL PROGRAM SO FAR:
import csv
import os
import re
import pandas
import tkinter as tk
from tkinter import messagebox
from os.path import isfile, join
from os import listdir
import time

ROOT_PATH = os.path.dirname(os.path.abspath(__file__))
indexforcalcu = 0
line_count = 0
testlist = []
namelist = []
header = ['Average Percentage']

def clearvariables():
    indexforcalcu = 0
    testlist = []

def findaverageofstudent(findaveragenumber, numoftests):
    total = 0
    findaveragenumber = findaveragenumber / numoftests
    findaveragenumber = round(findaveragenumber, 1)
    return findaveragenumber

def removecharacters(nameforfunc):
    nameforfunc = str(nameforfunc)
    elem = re.sub("[{'}]", "", nameforfunc)
    return elem

def getallclasses():
    onlyfiles = [f for f in listdir(ROOT_PATH) if isfile(join(ROOT_PATH, f))]
    onlyfiles.remove("averagecalculatorv2.py")
    return onlyfiles

def findaveragefunc():
    indexforcalcu = -1
    while True:
        try:
            totaltests = 0
            line_count = 0
            averagepercentagelist = []
            indexforcalcu = indexforcalcu + 1
            allclasses = getallclasses()
            currentcalculate = allclasses[indexforcalcu]
            classpath = os.path.join(ROOT_PATH, currentcalculate)
            with open(classpath) as csv_file:
                classscoredb = csv.reader(csv_file, delimiter=',')
                for i, row in enumerate(classscoredb):
                    if line_count == 0:
                        while True:
                            try:
                                totaltests = totaltests + 1
                                rowreader = {row[totaltests]}
                            except:
                                totaltests = totaltests - 1
                                line_count = line_count + 1
                                break
                    else:
                        calculating_column_location = 1
                        total = 0
                        while True:
                            try:
                                total = total + int(row[calculating_column_location])
                                calculating_column_location = calculating_column_location + 1
                            except:
                                break
                        i = str(i)
                        name = row[0]
                        cleanname = removecharacters(nameforfunc=name)
                        namelist.append(cleanname)
                        findaveragenumbercal = findaverageofstudent(findaveragenumber=total, numoftests=totaltests)
                        averagepercentagelist.append(findaveragenumbercal)
                        line_count = line_count + 1
            dataframe = (pandas.DataFrame(averagepercentagelist, index=namelist, columns=header))
            currentcalculatedatafrane = 'averages' + i + currentcalculate
            dataframeexportpath = os.path.join(ROOT_PATH, 'Averages', currentcalculatedatafrane)
            dataframe.to_csv(dataframeexportpath)
            i = int(i)
        except Exception as e:
            print("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n", e)
            break

def makenewclass():
    global newclassname
    getclassname = str(newclassname.get())
    if getclassname == "":
        messagebox.showerror("Error", "The class name you have entered is invalid.")
    else:
        classname = getclassname + ".csv"
        with open(classname, mode='w') as employee_file:
            classwriter = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            classwriter.writerow(["Name", "Test 1"])

root = tk.Tk()
root.title("Test result average finder")
findaveragebutton = tk.Button(root, text="Find Averages", command=findaveragefunc())
findaveragebutton.grid(row=2, column=2, padx=(10, 10), pady=(0, 10))
classnamelabel = tk.Label(root, text="Class name:")
classnamelabel.grid(row=1, column=0, padx=(10, 0), pady=(10, 10))
newclassname = tk.Entry(root)
newclassname.grid(row=1, column=1, padx=(10, 10))
newclassbutton = tk.Button(root, text="Create new class", command=makenewclass)
newclassbutton.grid(row=1, column=2, padx=(0, 10), pady=(10, 10))
root.mainloop()
Thanks in advance,
Sean
Use:
import glob, os
import pandas as pd

ROOT_PATH = os.path.dirname(os.path.abspath(__file__))

# collect all csv files into a list
files = glob.glob(f'{ROOT_PATH}/*.csv')
print(files)

# create the output folder if necessary
new = os.path.join(ROOT_PATH, 'Averages')
if not os.path.exists(new):
    os.makedirs(new)

# loop over each file
for f in files:
    # create a DataFrame and use the first column as the index
    df = pd.read_csv(f, index_col=[0])
    # compute the average of each row, round it, and build a one-column DataFrame
    avg = df.mean(axis=1).round(1).to_frame('Average Percentage')
    # remove the index name if necessary
    avg.index.name = None
    print(avg)
    # build the new output path
    head, tail = os.path.split(f)
    path = os.path.join(head, 'Averages', tail)
    print(path)
    # write the DataFrame to csv
    avg.to_csv(path)
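For reference, the original "Shape of passed values is (1, 1), indices imply (3, 1)" error appears whenever the data list handed to pandas.DataFrame is shorter than the index list, which is easy to trigger when one list is reset each file while the other keeps growing (as namelist does at module level in the question). A tiny illustration with made-up values; depending on your pandas version the exact wording may differ slightly:

import pandas as pd

names = ["Name0", "Name1", "Name2"]   # kept growing across files
averages = [74.5]                     # reset for the new file

# Raises ValueError: Shape of passed values is (1, 1), indices imply (3, 1)
df = pd.DataFrame(averages, index=names, columns=["Average Percentage"])

The glob-based answer above sidesteps this entirely because the index and values are both taken from the same per-file DataFrame.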
import urllib2
import pandas as pd
from bs4 import BeautifulSoup

x = 0
i = 1
data = []
while (i < 13):
    soup = BeautifulSoup(urllib2.urlopen(
        'http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%d&seasonId=2018&startIndex=' % i, +str(x)).read(), 'html')
    tableStats = soup.find("table", ("class", "playerTableTable tableBody"))
    for row in tableStats.findAll('tr')[2:]:
        col = row.findAll('td')
        try:
            name = col[0].a.string.strip()
            opp = col[1].a.string.strip()
            rec = col[10].string.strip()
            yds = col[11].string.strip()
            dt = col[12].string.strip()
            pts = col[13].string.strip()
            data.append([name, opp, rec, yds, dt, pts])
        except Exception as e:
            pass
    df = pd.DataFrame(data=data, columns=[
        'PLAYER', 'OPP', 'REC', 'YDS', 'TD', 'PTS'])
    df
    i += 1
I have been working on a fantasy football program and am trying to iterate over all weeks so I can create a dataframe of the top 40 players for each week.
I can get any single week by manually entering the week number in the PeriodId part of the URL, but I am trying to increment it programmatically to make this easier. I have tried using PeriodId='+ I +' and PeriodId=%d, but I keep getting various errors about concatenating str and int and bad operand types. Any suggestions or tips?
Try removing the comma between % i and +str(x) so the two strings are concatenated. With the comma there, +str(x) is passed as a second argument to urlopen, and the unary + applied to a string is likely what produces the bad-operand error.
soup = BeautifulSoup(urllib2.urlopen('http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%d&seasonId=2018&startIndex='%i, +str(x)).read(), 'html')
should be:
soup = BeautifulSoup(urllib2.urlopen('http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%d&seasonId=2018&startIndex='%i +str(x)).read(), 'html')
If you have problems concatenating or formatting the URL, build it in a separate variable instead of writing it on one line together with BeautifulSoup and urllib2.urlopen.
Use parentheses to format with multiple values, like "before %s is %s" % (1, 0):
url = 'http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%s&seasonId=2018&startIndex=%s' % (i, x)
# or
#url = 'http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%s&seasonId=2018&startIndex=0' % i
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
Splitting the code up like this will not affect performance.
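Putting that together, a rough sketch of the corrected loop could look like the following. It keeps the question's urllib2 style, builds the URL in its own variable, uses the safer class_ keyword for the table lookup, and creates the DataFrame once after the loop (building it inside the loop, as in the question, just recreates it every week); the added WEEK column is my own suggestion, not something from the question:

import urllib2
import pandas as pd
from bs4 import BeautifulSoup

data = []
x = 0
for i in range(1, 13):  # weeks 1-12
    # Build the URL in its own variable so the formatting is easy to check.
    url = ('http://games.espn.com/ffl/tools/projections?'
           '&slotCategoryId=4&scoringPeriodId=%s&seasonId=2018&startIndex=%s' % (i, x))
    soup = BeautifulSoup(urllib2.urlopen(url).read(), 'html.parser')
    tableStats = soup.find("table", class_="playerTableTable tableBody")
    for row in tableStats.findAll('tr')[2:]:
        col = row.findAll('td')
        try:
            data.append([i,
                         col[0].a.string.strip(),   # player
                         col[1].a.string.strip(),   # opponent
                         col[10].string.strip(),    # receptions
                         col[11].string.strip(),    # yards
                         col[12].string.strip(),    # touchdowns
                         col[13].string.strip()])   # points
        except Exception:
            pass

# Build the DataFrame once, after all weeks have been collected.
df = pd.DataFrame(data, columns=['WEEK', 'PLAYER', 'OPP', 'REC', 'YDS', 'TD', 'PTS'])
print(df.head())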
I can't find the Alchemy Language API in IBM Watson.
Can I do this with the natural-language-understanding service, and how?
When I add
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 \
import Features, EntitiesOptions, KeywordsOptions
it shows an error about the combined keyword. Here is my full code:
# In[]:
import tweepy
import re
import time
import math
import pandas as pd
from watson_developer_cloud import AlchemyLanguageV1

def initAlchemy():
    al = AlchemyLanguageV1(api_key='GRYVUMdBbOtJXxNOIs1aopjjaiyOmLG7xJBzkAnvvwLh')
    return al

def initTwitterApi():
    consumer_key = 'OmK1RrZCVJSRmKxIuQqkBExvw'
    consumer_secret = 'VWn6OR4rRgSi7qGnZHCblJMhrSvj1QbJmf0f62uX6ZQWZUUx5q'
    access_token = '4852231552-adGooMpTB3EJYPHvs6oGZ40qlo3d2JbVjqUUWkJ'
    access_token_secret = 'm9hgeM9p0r1nn8IoQWJYBs5qUQu56XmrAhsDSYKjuiVA4'
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)
    return api

'''This function is implemented to handle tweepy exception errors
because search is rate limited at 180 queries per 15 minute window by twitter'''
def limit(cursor):
    while True:
        try:
            yield cursor.next()
        except tweepy.TweepError as error:
            print(repr(error))
            print("Twitter Request limit error reached sleeping for 15 minutes")
            time.sleep(16*60)
        except tweepy.RateLimitError:
            print("Rate Limit Error occurred Sleeping for 16 minutes")
            time.sleep(16*60)

def retrieveTweets(api, search, lim):
    if(lim == ""):
        lim = math.inf
    else:
        lim = int(lim)
    text = []
    for tweet in limit(tweepy.Cursor(api.search, q=search).items(limit=lim)):
        t = re.sub('\s+', ' ', tweet.text)
        text.append(t)
    data = {"Tweet": text,
            "Sentiment": "",
            "Score": ""}
    dataFrame = pd.DataFrame(data, columns=["Tweet", "Sentiment", "Score"])
    return dataFrame

def analyze(al, dataFrame):
    sentiment = []
    score = []
    for i in range(0, dataFrame["Tweet"].__len__()):
        res = al.combined(text=dataFrame["Tweet"][i],
                          extract="doc-sentiment",
                          sentiment=1)
        sentiment.append(res["docSentiment"]["type"])
        if(res["docSentiment"]["type"] == "neutral"):
            score.append(0)
        else:
            score.append(res["docSentiment"]["score"])
    dataFrame["Sentiment"] = sentiment
    dataFrame["Score"] = score
    return dataFrame

def main():
    # Initialise Twitter Api
    api = initTwitterApi()
    # Retrieve tweets
    dataFrame = retrieveTweets(api, input("Enter the search query (e.g. #hillaryclinton ) : "), input("Enter limit for number of tweets to be searched or else just hit enter : "))
    # Initialise IBM Watson Alchemy Language Api
    al = initAlchemy()
    # Do Document Sentiment analysis
    dataFrame = analyze(al, dataFrame)
    # Save tweets, sentiment, and score data frame in csv file
    dataFrame.to_csv(input("Enter the name of the file (with .csv extension) : "))

if __name__ == '__main__':
    main()
Watson Natural Language Understanding only has a combined call, but since it is the only call, it isn't called combined; it's called analyze. The best place to go for details is the API documentation: https://www.ibm.com/watson/developercloud/natural-language-understanding/api/v1/?python#post-analyze
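As a rough sketch of what the analyze() replacement for initAlchemy() and al.combined() could look like with the natural-language-understanding service (the version date and credentials are placeholders, and depending on the SDK release the response may need a .get_result() call instead of being a plain dict):

from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 import Features, SentimentOptions

def initNLU():
    # Placeholder credentials; use your own service credentials here.
    return NaturalLanguageUnderstandingV1(
        version='2018-03-16',
        username='YOUR_USERNAME',
        password='YOUR_PASSWORD')

def analyze(nlu, dataFrame):
    sentiment = []
    score = []
    for text in dataFrame["Tweet"]:
        # One analyze() call per tweet, asking only for document-level sentiment.
        res = nlu.analyze(text=text,
                          features=Features(sentiment=SentimentOptions()))
        label = res["sentiment"]["document"]["label"]
        sentiment.append(label)
        score.append(0 if label == "neutral" else res["sentiment"]["document"]["score"])
    dataFrame["Sentiment"] = sentiment
    dataFrame["Score"] = score
    return dataFrame

The rest of the script from the question (retrieveTweets, main, the CSV export) can stay the same; only the service initialisation and the per-tweet call change.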