How to crawl over all the webpages in selenium? - selenium-webdriver

I have written this code. I want to get all the data from all the pages and store them in a CSV file. But I don't know what to do next. I can do this in beautifulsoup but it's not the same in selenium and beautifulsoup combination
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
from bs4 import BeautifulSoup as bs
import pandas
chrome_option = Options()
chrome_option.add_argument("--headless")
browser = webdriver.Chrome(executable_path="D:/chromedriver.exe", chrome_options=chrome_option)
# list of data
names = []
license_num = []
types = []
contacts = []
links = []
def scrape(url):
browser.get(url)
sleep(5)
html = browser.execute_script("return document.documentElement.outerHTML")
sel_soup = bs(html, "html.parser")
containers = sel_soup.find_all(class_ = "d-agent-card container_13WXz card_2AIgF")
for cont in containers:
# agent name
agent = cont.find("h4").text.strip()
names.append(agent)
# agent type
tp = cont.find("h5").text.strip()
types.append(tp)
# agent contact
contact = cont.find("button", {"class": "button_3pYtF icon-left_1xpTg secondary_KT7Sy link_-iSRx contactLink_BgG5h"})
if contact is not None:
contacts.append(contact.text.strip())
elif contact is None:
contacts.append("None")
# agent link
link = cont.find("div", {"class": "linksContainer_1-v7q"}).find("a")
if link is not None:
links.append(link["href"])
elif link is None:
links.append("None")
# license
licns = cont.find("p", {"class": "license_33m8Z"}).text
license_num.append(licns)
for page in range(1, 27):
urls = f"https://www.remax.com/real-estate-agents/Dallas-TX?page={page}"
scrape(urls)
df = pandas.DataFrame({
"Agent Name": names,
"Agent Type" : types,
"Agent License Number": license_num,
"Agent contact Number": contacts,
"Agent URL": links
})
df.to_csv("data.csv", index=False)
With this I am only getting 596 rows of data. But I want to get 24x25+ 19 = 619 rows. Every page has 24 rows of data. I want to get them. But I am only getting the data of 23 pages maybe. now I am getting an error ...
{"QTM: JSEvent: TypeError: Cannot read property 'data' of undefined !!window.QuantumMetricAPI.lastXHR.data && !!JSON.parse(QuantumMetricAPI.lastXHR.data).term"}

Related

Problem with saving pickle object into arrays from images in python

I have the following class for loading and converting my images into train and test arrays for a deep learning model in Tensorflow 2.
The images are in three folders, named 'Car', 'Cat' and 'Man', which are within the Train and Test main folders. Each image is of 300 x 400 pixels.
import os
import pickle
import cv2
import numpy as np
os.getcwd()
out: 'C:\\Users\\me\\Jupiter_Notebooks\\Dataset_Thermal\\SeekThermal'
path_train = "../SeekThermal/Train"
path_test = "../SeekThermal/Test"
class MasterImage(object):
def __init__(self,PATH='', IMAGE_SIZE = 50):
self.PATH = PATH
self.IMAGE_SIZE = IMAGE_SIZE
self.image_data = []
self.x_data = []
self.y_data = []
self.CATEGORIES = []
# This will get List of categories
self.list_categories = []
def get_categories(self):
for path in os.listdir(self.PATH):
if '.DS_Store' in path:
pass
else:
self.list_categories.append(path)
print("Found Categories ",self.list_categories,'\n')
return self.list_categories
def process_image(self):
try:
"""
Return Numpy array of image
:return: X_Data, Y_Data
"""
self.CATEGORIES = self.get_categories()
for categories in self.CATEGORIES: # Iterate over categories
train_folder_path = os.path.join(self.PATH, categories) # Folder Path
class_index = self.CATEGORIES.index(categories) # this will get index for classification
for img in os.listdir(train_folder_path): # This will iterate in the Folder
new_path = os.path.join(train_folder_path, img) # image Path
try: # if any image is corrupted
image_data_temp = cv2.imread(new_path) # Read Image as numbers
image_temp_resize = cv2.resize(image_data_temp,(self.IMAGE_SIZE,self.IMAGE_SIZE))
self.image_data.append([image_temp_resize,class_index])
random.shuffle(self.image_data)
except:
pass
data = np.asanyarray(self.image_data) # or, data = np.asanyarray(self.image_data,dtype=object)
# Iterate over the Data
for x in data:
self.x_data.append(x[0]) # Get the X_Data
self.y_data.append(x[1]) # get the label
X_Data = np.asarray(self.x_data) / (255.0) # Normalize Data
Y_Data = np.asarray(self.y_data)
# reshape x_Data
X_Data = X_Data.reshape(-1, self.IMAGE_SIZE, self.IMAGE_SIZE, 3)
return X_Data, Y_Data
except:
print("Failed to run Function Process Image ")
def pickle_image(self):
"""
:return: None Creates a Pickle Object of DataSet
"""
# Call the Function and Get the Data
X_Data,Y_Data = self.process_image()
# Write the Entire Data into a Pickle File
pickle_out = open('X_Data','wb')
pickle.dump(X_Data, pickle_out)
pickle_out.close()
# Write the Y Label Data
pickle_out = open('Y_Data', 'wb')
pickle.dump(Y_Data, pickle_out)
pickle_out.close()
print("Pickled Image Successfully ")
return X_Data,Y_Data
def load_dataset(self):
try:
# Read the Data from Pickle Object
X_Temp = open('..\SeekThermal\X_Data','rb')
X_Data = pickle.load(X_Temp)
Y_Temp = open('..\SeekThermal\Y_Data','rb')
Y_Data = pickle.load(Y_Temp)
print('Reading Dataset from Pickle Object')
return X_Data,Y_Data
except:
print('Could not Found Pickle File ')
print('Loading File and Dataset ..........')
X_Data,Y_Data = self.pickle_image()
return X_Data,Y_Data
I dont understand what the problem is with the pickle file, because just last week I able to create these arrays successfully with the same code??
Is there an easier way to load images in Tensorflow rather than through the custom defined class?
a = MasterImage(PATH = path_train,IMAGE_SIZE = 224)
a.process_image()
out:
it produces an array with a warning.
VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
data = np.asanyarray(self.image_data)
a.pickle_image()
out:
TypeError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_1692\507657192.py in <cell line: 1>()
----> 1 a.pickle_image()
~\AppData\Local\Temp\ipykernel_1692\1410849712.py in pickle_image(self)
71 """
72 # Call the Function and Get the Data
---> 73 X_Data,Y_Data = self.process_image()
74
75 # Write the Entire Data into a Pickle File
TypeError: cannot unpack non-iterable NoneType object
a.load_dataset()
out:
Could not Found Pickle File
Loading File and Dataset ..........
Found Categories ['Car', 'Cat', 'Man', 'Car', 'Cat', 'Man']
Pickled Image Successfully
I'm running Python 3.8.8 via anaconda on Windows 10. Thank you for any advice.

Sentiment140 Preprocessing

I have been trying to do some preprocessing on the Sentiment140 database on Kaggle: https://www.kaggle.com/kazanova/sentiment140
The code I'm using is this:
import os
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import RegexpTokenizer
Base_location = ''
dataset_location = os.path.join(Base_location, 'Sentiment140.csv')
corpus = []
labels = []
# Parse tweets and sentiments
with open(dataset_location, 'r', encoding='latin-1') as df:
for i, line in enumerate(df):
parts = line.strip().split(',')
# Sentiment (0 = Negative, 1 = Positive)
labels.append(str(parts[0].strip()))
# Tweet
tweet = parts[5].strip()
if tweet.startswith('"'):
tweet = tweet[1:]
if tweet.endswith('"'):
tweet = tweet[::-1]
corpus.append(tweet.strip().lower())
print('Corpus size: {}'.format(len(corpus)))
# Tokenize and stem
tkr = RegexpTokenizer('[a-zA-Z0-9#]+')
stemmer = LancasterStemmer()
tokenized_corpus = []
for i, tweet in enumerate(corpus):
tokens = [stemmer.stem(t) for t in tkr.tokenize(tweet) if not t.startswith('#')]
tokenized_corpus.append(tokens)
print(tokenized_corpus)
However, I keep getting this error:
TypeError: '_io.TextIOWrapper' object is not subscriptable
Can anyone help me understand how to solve the issue?
Thanks in advance
TL;DR
To read .csv or structured datasets, use pandas https://pandas.pydata.org/ or any other dataframe libraries.
In Long:
Instead of doing:
Base_location = ''
dataset_location = os.path.join(Base_location, 'Sentiment140.csv')
corpus = []
labels = []
# Parse tweets and sentiments
with open(dataset_location, 'r', encoding='latin-1') as df:
for i, line in enumerate(df):
parts = line.strip().split(',')
# Sentiment (0 = Negative, 1 = Positive)
labels.append(str(parts[0].strip()))
# Tweet
tweet = parts[5].strip()
if tweet.startswith('"'):
tweet = tweet[1:]
if tweet.endswith('"'):
tweet = tweet[::-1]
corpus.append(tweet.strip().lower())
You could simply read the .csv file with pandas, e.g.
import pandas as pd
corpus = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1')
Then use the .apply() function to process the tweets:
"""
Columns
====
target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
ids: The id of the tweet ( 2087)
date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
flag: The query (lyx). If there is no query, then this value is NO_QUERY.
user: the user that tweeted (robotickilldozr)
text: the text of the tweet (Lyx is cool)
"""
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import RegexpTokenizer
import pandas as pd
df = pd.read_csv('training.1600000.processed.noemoticon.csv',
header=None,
names=['target', 'ids', 'date', 'flag', 'user', 'text'],
encoding='latin-1')
tokenizer = RegexpTokenizer('[a-zA-Z0-9#]+')
stemmer = LancasterStemmer()
def process_tweet(tweet):
return [stemmer.stem(token) if not token.startswith('#') else token
for token in tokenizer.tokenize(tweet)]
# 1. Cast the column type to string
# 2. Lowercase it
# 3. Iterate throw each row and get the output from process_tweet()
# 4. # 3. Keep in a new column call `tokenized_text`
df['tokenized_text']= df['text'].str.lower().apply(process_tweet)

WatsonApiException: Error: invalid-api-key, Code: 401

I cant find Alchemy Language API in IBM Watson.
Can I do this with natural-language-understanding service and how?
When I add
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 \
import Features, EntitiesOptions, KeywordsOptions
It shows some error with combined keyword
# In[]:
import tweepy
import re
import time
import math
import pandas as pd
from watson_developer_cloud import AlchemyLanguageV1
def initAlchemy():
al = AlchemyLanguageV1(api_key='GRYVUMdBbOtJXxNOIs1aopjjaiyOmLG7xJBzkAnvvwLh')
return al
def initTwitterApi():
consumer_key = 'OmK1RrZCVJSRmKxIuQqkBExvw'
consumer_secret = 'VWn6OR4rRgSi7qGnZHCblJMhrSvj1QbJmf0f62uX6ZQWZUUx5q'
access_token = '4852231552-adGooMpTB3EJYPHvs6oGZ40qlo3d2JbVjqUUWkJ'
access_token_secret = 'm9hgeM9p0r1nn8IoQWJYBs5qUQu56XmrAhsDSYKjuiVA4'
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
return api
'''This function is implemented to handle tweepy exception errors
because search is rate limited at 180 queries per 15 minute window by twitter'''
def limit(cursor):
while True:
try:
yield cursor.next()
except tweepy.TweepError as error:
print(repr(error))
print("Twitter Request limit error reached sleeping for 15 minutes")
time.sleep(16*60)
except tweepy.RateLimitError:
print("Rate Limit Error occurred Sleeping for 16 minutes")
time.sleep(16*60)
def retrieveTweets(api, search, lim):
if(lim == ""):
lim = math.inf
else:
lim = int(lim)
text = []
for tweet in limit(tweepy.Cursor(api.search, q=search).items(limit = lim)):
t = re.sub('\s+', ' ', tweet.text)
text.append(t)
data = {"Tweet":text,
"Sentiment":"",
"Score":""}
dataFrame = pd.DataFrame(data, columns=["Tweet","Sentiment","Score"])
return dataFrame
def analyze(al,dataFrame):
sentiment = []
score = []
for i in range(0, dataFrame["Tweet"].__len__()):
res = al.combined(text=dataFrame["Tweet"][i],
extract="doc-sentiment",
sentiment=1)
sentiment.append(res["docSentiment"]["type"])
if(res["docSentiment"]["type"] == "neutral"):
score.append(0)
else:
score.append(res["docSentiment"]["score"])
dataFrame["Sentiment"] = sentiment
dataFrame["Score"] = score
return dataFrame
def main():
#Initialse Twitter Api
api = initTwitterApi()
#Retrieve tweets
dataFrame = retrieveTweets(api,input("Enter the search query (e.g. #hillaryclinton ) : "), input("Enter limit for number of tweets to be searched or else just hit enter : "))
#Initialise IBM Watson Alchemy Language Api
al = initAlchemy()
#Do Document Sentiment analysis
dataFrame = analyze(al, dataFrame)
#Save tweets, sentiment, and score data frame in csv file
dataFrame.to_csv(input("Enter the name of the file (with .csv extension) : "))
if __name__ == '__main__':
main()# -*- coding: utf-8 -*-
The Watson Natural Language Understanding only has a combined call, but since it is the only call, it isn't called combined, its actually analyze. Best place to go for details would be the API documentation - https://www.ibm.com/watson/developercloud/natural-language-understanding/api/v1/?python#post-analyze

python: range not being executed

App executes but the range doesn't. In my CSV file, it only shows the first entry. I've also come across index out of range errors when scraping other fields. Any help would be appreciated. I'm learning.
import requests
import csv
from bs4 import BeautifulSoup
f = csv.writer(open('salons.csv', 'w'))
f.writerow(['Name'])
pages = []
for i in range(0, 10600):
url = 'http://www.aveda.com/locator/get_the_facts.tmpl?SalonID=' + str(i) +' '
pages.append(url)
for item in pages:
page = requests.get(item)
soup = BeautifulSoup(page.text, 'lxml')
salon_name_list = soup.find(class_='getthefacts__store_meta_info--store_phone')
salon_name_list_items = salon_name_list.find_all('li', class_='phone')
for salon_name in salon_name_list_items:
names = salon_name.contents[0]
f.writerow([names])
The way you tried to find phone numbers is not how you should do. Phone numbers are within a tag under class name phone. Try this instead. It will fetch you the phone numbers you are interested in:
import requests ; import csv
from bs4 import BeautifulSoup
outfile = open('salons.csv','w')
writer = csv.writer(outfile)
writer.writerow(['Name'])
for i in range(0, 10600):
url = 'http://www.aveda.com/locator/get_the_facts.tmpl?SalonID={0}'.format(i)
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
for salon_name in soup.select('.phone a'):
names = salon_name.text
print(names)
writer.writerow([names])
outfile.close()
Not sure how you have indented your code. Format it properly in the question. And you may not need two for loops.
import requests
import csv
from bs4 import BeautifulSoup
f = csv.writer(open('salons.csv', 'w'))
f.writerow(['Name'])
for i in range(0, 10600):
url = 'http://www.aveda.com/locator/get_the_facts.tmpl?SalonID=' + str(i) +'/'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
salon_name_list = soup.find(class_='getthefacts__store_meta_info--store_phone')
salon_name_list_items = salon_name_list.find_all('li', class_='phone')
for salon_name in salon_name_list_items:
names = salon_name.contents[0]
f.writerow([names])

ValueError: Unknown protobuf attr type <type 'datetime.date'>

Getting an error in executing the code. I have a datastore entity which has a property of type Date. An example date property value stored in an entity for a particular row is 2016-01-03 (19:00:00.000) EDT
The code i am executing is filtering the entity values based on date greater than 2016-01-01. Any idea what is wrong with the code
Error
ValueError: Unknown protobuf attr type <type 'datetime.date'>
Code
import pandas as pd
import numpy as np
from datetime import datetime
from google.cloud import datastore
from flask import Flask,Blueprint
app = Flask(__name__)
computation_cron= Blueprint('cron.stock_data_transformation', __name__)
#computation_cron.route('/cron/stock_data_transformation')
def cron():
ds = datastore.Client(project="earningspredictor-173913")
query = ds.query(kind='StockPrice')
query.add_filter('date', '>', datetime.strptime("2016-01-01", '%Y-%m-%d').date())
dataframe_data = []
temp_dict = {}
for q in query.fetch():
temp_dict["stock_code"] = q["stock_code"]
temp_dict["date"] = q["date"]
temp_dict["ex_dividend"] = q["ex_dividend"]
temp_dict["split_ratio"] = q["split_ratio"]
temp_dict["adj_open"] = q["adj_open"]
temp_dict["adj_high"] = q["adj_high"]
temp_dict["adj_low"] = q["adj_low"]
temp_dict["adj_close"] = q["adj_close"]
temp_dict["adj_volume"] = q["adj_volume"]
dataframe_data.append(temp_dict)
sph = pd.DataFrame(data=dataframe_data,columns=temp_dict.keys())
# print sph.to_string()
query = ds.query(kind='EarningsSurprise')
query.add_filter('act_rpt_date', '>', datetime.strptime("2016-01-01", '%Y-%m-%d').date())
dataframe_data = []
temp_dict = {}
for q in query.fetch():
temp_dict["stock_code"] = q["stock_code"]
temp_dict["eps_amount_diff"] = q["eps_amount_diff"]
temp_dict["eps_actual"] = q["eps_actual"]
temp_dict["act_rpt_date"] = q["act_rpt_date"]
temp_dict["act_rpt_code"] = q["act_rpt_code"]
temp_dict["eps_percent_diff"] = q["eps_percent_diff"]
dataframe_data.append(temp_dict)
es = pd.DataFrame(data=dataframe_data,columns=temp_dict.keys())
You seem to be using the generic google-cloud-datastore client library, not the NDB Client Library.
For google-cloud-datastore all date and/or time properties have the same format. From Date and time:
JSON
field name: timestampValue
type: string (RFC 3339 formatted, with milliseconds, for instance 2013-05-14T00:01:00.234Z)
Protocol buffer
field name: timestamp_value
type: Timestamp
Sort order: Chronological
Notes: When stored in Cloud Datastore, precise only to microseconds; any additional precision is rounded down.
So when setting/comparing such properties try to use strings formatted as specified (or integers for protobuf Timestamp?), not directly objects from the datetime modules (which work with the NDB library). The same might be true for queries as well.
Note: this is based on documentation only, I didn't use the generic library myself.

Resources