Scrapy database only returning one row

I am a beginner and have looked for the answer on the web, but I could not find it, so asking here seems like the last resort.
I am trying to write multiple rows into a table in a database, but only one row ends up in it. See: https://imgur.com/a/OdUSVL3
If the database pipeline is left out and the output is exported to a CSV instead, all of the data does show up.
Spider code:
# -*- coding: utf-8 -*-
import scrapy


class FundaSpiderSpider(scrapy.Spider):
    name = 'fundaspider'
    start_urls = [
        'file:///Users/kevinvanhoutum/Dropbox/Shared%20Data%20Mining/FundaOfflineHTML.html'
    ]

    def parse(self, response):
        yield {
            'zipcode': response.css('.search-result-subtitle').css('::text').extract(),
            'asking_price': response.css('.search-result-price').css('::text').extract(),
            'square_meters_house': response.css('.search-result-kenmerken span:nth-child(1)').css('::text').extract(),
            'square_meters_property': response.css('.search-result-kenmerken span+ span').css('::text').extract(),
            'rooms': response.css('.search-result-kenmerken li+ li').css('::text').extract(),
        }
Pipeline:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import sqlite3


class Funda1Pipeline(object):

    def __init__(self):
        self.create_connection()
        self.create_table()

    def create_connection(self):
        self.conn = sqlite3.connect("funda.db")
        self.curr = self.conn.cursor()

    def create_table(self):
        self.curr.execute("""DROP TABLE IF EXISTS funda_db""")
        self.curr.execute("""create table funda_db(
            zipcode text,
            asking_price text,
            square_meters_house text,
            square_meters_property text,
            rooms text
        )""")

    def process_item(self, item, spider):
        self.store_db(item)
        return item

    def store_db(self, item):
        self.curr.execute("""insert into funda_db values (?,?,?,?,?)""", (
            item['zipcode'][0],
            item['asking_price'][0],
            item['square_meters_house'][0],
            item['square_meters_property'][0],
            item['rooms'][0]
        ))
        self.conn.commit()
items:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class Funda1Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    zipcode = scrapy.Field()
    asking_price = scrapy.Field()
    square_meters_house = scrapy.Field()
    square_meters_property = scrapy.Field()
    rooms = scrapy.Field()
settings:
ITEM_PIPELINES = {
    'Funda1.pipelines.Funda1Pipeline': 300,
}
I think a function is missing that tells the spider to repeat this for every row, or somewhere in the code it is told to stop after one row, but I am not sure where. Does anyone have any idea?
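A likely cause (this is a reading of the posted code, not something confirmed in the question): parse() yields a single item whose fields are lists containing every match on the page, and the pipeline then stores only element [0] of each list, so exactly one row reaches SQLite. Below is a minimal sketch of yielding one item per listing instead; '.search-result' is an assumed container selector that would need to match the real Funda markup:

def parse(self, response):
    # One item per search result instead of one item holding all results.
    # '.search-result' is a hypothetical per-listing container selector.
    for result in response.css('.search-result'):
        yield {
            'zipcode': result.css('.search-result-subtitle::text').extract_first(),
            'asking_price': result.css('.search-result-price::text').extract_first(),
            'square_meters_house': result.css('.search-result-kenmerken span:nth-child(1)::text').extract_first(),
            'square_meters_property': result.css('.search-result-kenmerken span + span::text').extract_first(),
            'rooms': result.css('.search-result-kenmerken li + li::text').extract_first(),
        }

With one item per listing, process_item() runs once per house; store_db() would then insert item['zipcode'] directly rather than item['zipcode'][0], because each field is a single string instead of a list.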

Related

Problem with saving pickle object into arrays from images in python

I have the following class for loading and converting my images into train and test arrays for a deep learning model in Tensorflow 2.
The images are in three folders, named 'Car', 'Cat' and 'Man', inside the main Train and Test folders. Each image is 300 x 400 pixels.
import os
import pickle
import random  # needed for random.shuffle() used below
import cv2
import numpy as np

os.getcwd()
out: 'C:\\Users\\me\\Jupiter_Notebooks\\Dataset_Thermal\\SeekThermal'

path_train = "../SeekThermal/Train"
path_test = "../SeekThermal/Test"
class MasterImage(object):

    def __init__(self, PATH='', IMAGE_SIZE=50):
        self.PATH = PATH
        self.IMAGE_SIZE = IMAGE_SIZE
        self.image_data = []
        self.x_data = []
        self.y_data = []
        self.CATEGORIES = []
        # This will hold the list of categories
        self.list_categories = []

    def get_categories(self):
        for path in os.listdir(self.PATH):
            if '.DS_Store' in path:
                pass
            else:
                self.list_categories.append(path)
        print("Found Categories ", self.list_categories, '\n')
        return self.list_categories

    def process_image(self):
        try:
            """
            Return Numpy array of images
            :return: X_Data, Y_Data
            """
            self.CATEGORIES = self.get_categories()
            for categories in self.CATEGORIES:                            # Iterate over categories
                train_folder_path = os.path.join(self.PATH, categories)   # Folder path
                class_index = self.CATEGORIES.index(categories)           # Index used as the class label
                for img in os.listdir(train_folder_path):                 # Iterate over the images in the folder
                    new_path = os.path.join(train_folder_path, img)       # Image path
                    try:                                                   # Skip any corrupted image
                        image_data_temp = cv2.imread(new_path)             # Read image as numbers
                        image_temp_resize = cv2.resize(image_data_temp, (self.IMAGE_SIZE, self.IMAGE_SIZE))
                        self.image_data.append([image_temp_resize, class_index])
                        random.shuffle(self.image_data)
                    except:
                        pass
            data = np.asanyarray(self.image_data)  # or: data = np.asanyarray(self.image_data, dtype=object)

            # Iterate over the data
            for x in data:
                self.x_data.append(x[0])  # Get the X data
                self.y_data.append(x[1])  # Get the label
            X_Data = np.asarray(self.x_data) / 255.0  # Normalize data
            Y_Data = np.asarray(self.y_data)

            # Reshape X_Data
            X_Data = X_Data.reshape(-1, self.IMAGE_SIZE, self.IMAGE_SIZE, 3)
            return X_Data, Y_Data
        except:
            print("Failed to run Function Process Image ")

    def pickle_image(self):
        """
        :return: None. Creates a pickle object of the dataset.
        """
        # Call the function and get the data
        X_Data, Y_Data = self.process_image()

        # Write the X data into a pickle file
        pickle_out = open('X_Data', 'wb')
        pickle.dump(X_Data, pickle_out)
        pickle_out.close()

        # Write the Y label data
        pickle_out = open('Y_Data', 'wb')
        pickle.dump(Y_Data, pickle_out)
        pickle_out.close()

        print("Pickled Image Successfully ")
        return X_Data, Y_Data

    def load_dataset(self):
        try:
            # Read the data from the pickle objects
            X_Temp = open('..\SeekThermal\X_Data', 'rb')
            X_Data = pickle.load(X_Temp)
            Y_Temp = open('..\SeekThermal\Y_Data', 'rb')
            Y_Data = pickle.load(Y_Temp)
            print('Reading Dataset from Pickle Object')
            return X_Data, Y_Data
        except:
            print('Could not Found Pickle File ')
            print('Loading File and Dataset ..........')
            X_Data, Y_Data = self.pickle_image()
            return X_Data, Y_Data
I don't understand what the problem is with the pickle file, because just last week I was able to create these arrays successfully with the same code.
Is there an easier way to load images in TensorFlow rather than through this custom-defined class?
a = MasterImage(PATH = path_train,IMAGE_SIZE = 224)
a.process_image()
out:
It produces an array, along with a warning:
VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
data = np.asanyarray(self.image_data)
a.pickle_image()
out:
TypeError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_1692\507657192.py in <cell line: 1>()
----> 1 a.pickle_image()
~\AppData\Local\Temp\ipykernel_1692\1410849712.py in pickle_image(self)
71 """
72 # Call the Function and Get the Data
---> 73 X_Data,Y_Data = self.process_image()
74
75 # Write the Entire Data into a Pickle File
TypeError: cannot unpack non-iterable NoneType object
a.load_dataset()
out:
Could not Found Pickle File
Loading File and Dataset ..........
Found Categories ['Car', 'Cat', 'Man', 'Car', 'Cat', 'Man']
Pickled Image Successfully
I'm running Python 3.8.8 via anaconda on Windows 10. Thank you for any advice.
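Two hedged observations, since no answer was posted here: the TypeError most likely means process_image() raised somewhere inside its outer try, the broad except swallowed the exception (printing "Failed to run Function Process Image "), and the method fell through returning None, which pickle_image() then fails to unpack. As for an easier way to load the images, Keras ships a directory loader that infers labels from the 'Car'/'Cat'/'Man' sub-folders; a minimal sketch, assuming TensorFlow 2.x and the Train/Test layout described above (newer releases expose it as tf.keras.utils.image_dataset_from_directory, older 2.x releases as tf.keras.preprocessing.image_dataset_from_directory):

import tensorflow as tf

# Sketch only: paths and image size mirror the question; adjust as needed.
train_ds = tf.keras.utils.image_dataset_from_directory(
    "../SeekThermal/Train",
    image_size=(224, 224),   # resized on load, analogous to IMAGE_SIZE above
    batch_size=32)
test_ds = tf.keras.utils.image_dataset_from_directory(
    "../SeekThermal/Test",
    image_size=(224, 224),
    batch_size=32)

Each dataset yields (images, integer_labels) batches that can be passed straight to model.fit(), with no pickling step required.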

Sentiment140 Preprocessing

I have been trying to do some preprocessing on the Sentiment140 database on Kaggle: https://www.kaggle.com/kazanova/sentiment140
The code I'm using is this:
import os
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import RegexpTokenizer
Base_location = ''
dataset_location = os.path.join(Base_location, 'Sentiment140.csv')
corpus = []
labels = []

# Parse tweets and sentiments
with open(dataset_location, 'r', encoding='latin-1') as df:
    for i, line in enumerate(df):
        parts = line.strip().split(',')
        # Sentiment (0 = Negative, 1 = Positive)
        labels.append(str(parts[0].strip()))
        # Tweet
        tweet = parts[5].strip()
        if tweet.startswith('"'):
            tweet = tweet[1:]
        if tweet.endswith('"'):
            tweet = tweet[::-1]
        corpus.append(tweet.strip().lower())

print('Corpus size: {}'.format(len(corpus)))

# Tokenize and stem
tkr = RegexpTokenizer('[a-zA-Z0-9#]+')
stemmer = LancasterStemmer()
tokenized_corpus = []
for i, tweet in enumerate(corpus):
    tokens = [stemmer.stem(t) for t in tkr.tokenize(tweet) if not t.startswith('#')]
    tokenized_corpus.append(tokens)

print(tokenized_corpus)
However, I keep getting this error:
TypeError: '_io.TextIOWrapper' object is not subscriptable
Can anyone help me understand how to solve the issue?
Thanks in advance.
TL;DR
To read .csv or other structured datasets, use pandas (https://pandas.pydata.org/) or any other dataframe library.
In Long:
Instead of doing:
Base_location = ''
dataset_location = os.path.join(Base_location, 'Sentiment140.csv')
corpus = []
labels = []

# Parse tweets and sentiments
with open(dataset_location, 'r', encoding='latin-1') as df:
    for i, line in enumerate(df):
        parts = line.strip().split(',')
        # Sentiment (0 = Negative, 1 = Positive)
        labels.append(str(parts[0].strip()))
        # Tweet
        tweet = parts[5].strip()
        if tweet.startswith('"'):
            tweet = tweet[1:]
        if tweet.endswith('"'):
            tweet = tweet[::-1]
        corpus.append(tweet.strip().lower())
You could simply read the .csv file with pandas, e.g.
import pandas as pd
corpus = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1')
Then use the .apply() function to process the tweets:
"""
Columns
====
target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
ids: The id of the tweet ( 2087)
date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
flag: The query (lyx). If there is no query, then this value is NO_QUERY.
user: the user that tweeted (robotickilldozr)
text: the text of the tweet (Lyx is cool)
"""
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import RegexpTokenizer
import pandas as pd
df = pd.read_csv('training.1600000.processed.noemoticon.csv',
                 header=None,
                 names=['target', 'ids', 'date', 'flag', 'user', 'text'],
                 encoding='latin-1')

tokenizer = RegexpTokenizer('[a-zA-Z0-9#]+')
stemmer = LancasterStemmer()

def process_tweet(tweet):
    return [stemmer.stem(token) if not token.startswith('#') else token
            for token in tokenizer.tokenize(tweet)]

# 1. Cast the column type to string
# 2. Lowercase it
# 3. Iterate through each row and get the output from process_tweet()
# 4. Keep the output in a new column called `tokenized_text`
df['tokenized_text'] = df['text'].str.lower().apply(process_tweet)
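If the sentiment labels from the original loop are still needed, a short follow-up sketch using the 0/2/4 polarity coding documented in the column description above (the published training file contains only 0 and 4):

# Map the dataset's polarity codes to readable labels.
polarity_to_label = {0: 'negative', 2: 'neutral', 4: 'positive'}
df['label'] = df['target'].map(polarity_to_label)

corpus = df['tokenized_text'].tolist()
labels = df['label'].tolist()
print('Corpus size: {}'.format(len(corpus)))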

WatsonApiException: Error: invalid-api-key, Code: 401

I can't find the Alchemy Language API in IBM Watson.
Can I do this with the natural-language-understanding service, and how?
When I add
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 \
import Features, EntitiesOptions, KeywordsOptions
it shows an error related to the combined keyword.
# In[]:
import tweepy
import re
import time
import math
import pandas as pd
from watson_developer_cloud import AlchemyLanguageV1


def initAlchemy():
    al = AlchemyLanguageV1(api_key='GRYVUMdBbOtJXxNOIs1aopjjaiyOmLG7xJBzkAnvvwLh')
    return al


def initTwitterApi():
    consumer_key = 'OmK1RrZCVJSRmKxIuQqkBExvw'
    consumer_secret = 'VWn6OR4rRgSi7qGnZHCblJMhrSvj1QbJmf0f62uX6ZQWZUUx5q'
    access_token = '4852231552-adGooMpTB3EJYPHvs6oGZ40qlo3d2JbVjqUUWkJ'
    access_token_secret = 'm9hgeM9p0r1nn8IoQWJYBs5qUQu56XmrAhsDSYKjuiVA4'
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)
    return api


'''This function is implemented to handle tweepy exception errors
because search is rate limited at 180 queries per 15 minute window by twitter'''
def limit(cursor):
    while True:
        try:
            yield cursor.next()
        except tweepy.TweepError as error:
            print(repr(error))
            print("Twitter Request limit error reached sleeping for 15 minutes")
            time.sleep(16 * 60)
        except tweepy.RateLimitError:
            print("Rate Limit Error occurred Sleeping for 16 minutes")
            time.sleep(16 * 60)


def retrieveTweets(api, search, lim):
    if lim == "":
        lim = math.inf
    else:
        lim = int(lim)
    text = []
    for tweet in limit(tweepy.Cursor(api.search, q=search).items(limit=lim)):
        t = re.sub('\s+', ' ', tweet.text)
        text.append(t)
    data = {"Tweet": text,
            "Sentiment": "",
            "Score": ""}
    dataFrame = pd.DataFrame(data, columns=["Tweet", "Sentiment", "Score"])
    return dataFrame


def analyze(al, dataFrame):
    sentiment = []
    score = []
    for i in range(0, dataFrame["Tweet"].__len__()):
        res = al.combined(text=dataFrame["Tweet"][i],
                          extract="doc-sentiment",
                          sentiment=1)
        sentiment.append(res["docSentiment"]["type"])
        if res["docSentiment"]["type"] == "neutral":
            score.append(0)
        else:
            score.append(res["docSentiment"]["score"])
    dataFrame["Sentiment"] = sentiment
    dataFrame["Score"] = score
    return dataFrame


def main():
    # Initialise Twitter API
    api = initTwitterApi()
    # Retrieve tweets
    dataFrame = retrieveTweets(api, input("Enter the search query (e.g. #hillaryclinton ) : "),
                               input("Enter limit for number of tweets to be searched or else just hit enter : "))
    # Initialise IBM Watson Alchemy Language API
    al = initAlchemy()
    # Do document sentiment analysis
    dataFrame = analyze(al, dataFrame)
    # Save tweets, sentiment, and score data frame in csv file
    dataFrame.to_csv(input("Enter the name of the file (with .csv extension) : "))


if __name__ == '__main__':
    main()
Watson Natural Language Understanding only has a combined call, but since it is the only call, it isn't called combined; it's actually called analyze. The best place to go for details is the API documentation: https://www.ibm.com/watson/developercloud/natural-language-understanding/api/v1/?python#post-analyze
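For completeness, here is a sketch of what the analyze call can look like with the imports from the question; the version date and key are placeholders, and the credential argument name differs between releases of the watson_developer_cloud SDK (username/password on older releases versus iam_apikey on newer ones):

from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 import Features, SentimentOptions

nlu = NaturalLanguageUnderstandingV1(
    version='2018-03-16',           # placeholder version date
    iam_apikey='YOUR_NLU_API_KEY')  # placeholder credentials

result = nlu.analyze(
    text='I love this new phone',
    features=Features(sentiment=SentimentOptions()))
# Depending on the SDK release, `result` is either a plain dict or a response
# object whose payload is retrieved with result.get_result().

The 401 invalid-api-key error itself most likely means the key being sent does not belong to a Natural Language Understanding instance; keys from the retired AlchemyLanguage service are not accepted by NLU.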

Pyspark: filter contents of array inside row

In Pyspark, one can filter an array using the following code:
lines.filter(lambda line: "some" in line)
But I have read data from a json file and tokenized it. Now it has the following form:
df=[Row(text=u"i have some text", words=[u'I', u'have', u"some'", u'text'])]
How can I filter out "some" from the words array?
You can use array_contains; it has been available since Spark 1.4:
from pyspark.sql import Row
from pyspark.sql import functions as F
df = sqlContext.createDataFrame([Row(text=u"i have some text", words=[u'I', u'have', u'some', u'text'])])
df.withColumn("keep", F.array_contains(df.words, "some")) \
  .filter(F.col("keep") == True).show()
# +----------------+--------------------+----+
# | text| words|keep|
# +----------------+--------------------+----+
# |i have some text|[I, have, some, t...|true|
# +----------------+--------------------+----+
If you want to filter out 'some', like I said in the comment, you can use the StopWordsRemover API
from pyspark.ml.feature import StopWordsRemover
StopWordsRemover(inputCol="words", stopWords=["some"]).transform(df)
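A small usage sketch (the column names come from the example above; the output column name is my own choice) that names the output column explicitly so the filtered tokens are easy to select:

from pyspark.ml.feature import StopWordsRemover

# Name the output column explicitly so the filtered tokens are easy to select.
remover = StopWordsRemover(inputCol="words", outputCol="words_filtered", stopWords=["some"])
remover.transform(df).select("text", "words_filtered").show(truncate=False)
# The 'words_filtered' column holds [I, have, text] for the example row.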

'self' not defined, jinja2, appengine

Error:
self.response.out.write(template.render(template_values))
NameError: name 'self' is not defined
This pertains to the lines marked # ERROR below, with other notes:
#!/usr/bin/env python27
import cgi
import webapp2
import jinja2
import time
import datetime
import urllib
#import cgitb; cgitb.enable()
import os
from google.appengine.ext import db
from google.appengine.api import users
from google.appengine.api import memcache

jinja_environment = jinja2.Environment(autoescape=True,
    loader=jinja2.FileSystemLoader(os.path.join(os.path.dirname(__file__), 'templates')))

class Visitor(db.Model):  # I still need this with jinja2, yes?
    name = db.StringProperty(required=1)
    mood = db.StringProperty(choices=["good", "bad", "fair"])
    date = db.DateTimeProperty(auto_now_add=True)

class MainPage(webapp2.RequestHandler):
    def get(self):  # ERROR HERE
        visitor_query = Visitor.all().order('-date')  # not sure about query...need to get curent visitor's submitted form values (name, mood). no log-in in app.
        visitor = visitor_query.fetch(1)
        template_values = {
            'visitor': visitor,
            'url': url,  # not sure how this applies, just following tutorial
            'url_linktext': url_linktext,
        }
        localtime = time.localtime(time.time())
        mon = localtime[1]  # MONTH
        h = localtime[3]    # HOUR
        span = "morning" if h == range(5,14) else "afternoon" if h == range(17,7) else "evening"
        if mon <= 3:
            var1 = "winter"
    # more variables in if/elif statement here...I call these variables from index.html...
    # name = self.request.get("name") # not sure if I need to define these variables here using jinja2...tutorial does not define entity properties in example.
    # name = name.capitalize()
    # mood = self.request.get("mood")
    template = jinja_environment.get_template('index.html')
    self.response.out.write(template.render(template_values))  # ERROR HERE

class Process(webapp2.RequestHandler):
    def post(self):
        name = self.request.get("name")
        name = name.capitalize()
        mood = self.request.get("mood")
        message = Visitor(name=name, mood=mood)
        if users.get_current_user():
            message.name = users.get_current_user()  # not sure if I need users.get_current...no log-in required
            message.mood = self.request.get("mood")
        message.put()
        self.redirect("/")

app = webapp2.WSGIApplication([('/', MainPage)],
                              debug=True)
app.yaml:
application: emot
version: 1
runtime: python27
api_version: 1
threadsafe: true

handlers:
#- url: /stylesheets/          # I read no static files allowed with jinja2...not sure how I'll handle CSS...
#  static_dir: stylesheets
- url: /.*
  script: main.app

libraries:
- name: jinja2
  version: latest
index.yaml (all of this works without jinja2...)
indexes:
- kind: Visitor
  ancestor: yes
  properties:
  - name: name
  - name: mood
  - name: date
    direction: desc
Also, I have alternately copied (not cut) the jinja2 folder from the google_appengine/lib directory into my app directory, including just copying the "jinja" folder (a similar method worked using gdata atom & src...). I have also installed python-jinja2, which is located at /usr/share/doc/python-jinja2.
My index.html is in the "templates" directory inside my app directory. Thanks in advance for getting me going.
From the code you've posted, it looks like the erroring line of code (and the preceding few) aren't indented far enough.
The get method should be aligned as follows:
    def get(self):  # ERROR HERE
        visitor_query = Visitor.all().order('-date')  # not sure about query...need to get curent visitor's submitted form values (name, mood). no log-in in app.
        visitor = visitor_query.fetch(1)
        template_values = {
            'visitor': visitor,
            'url': url,  # not sure how this applies, just following tutorial
            'url_linktext': url_linktext,
        }
        localtime = time.localtime(time.time())
        mon = localtime[1]  # MONTH
        h = localtime[3]    # HOUR
        span = "morning" if h == range(5,14) else "afternoon" if h == range(17,7) else "evening"
        if mon <= 3:
            var1 = "winter"
        # more variables in if/elif statement here...I call these variables from index.html...
        # name = self.request.get("name")
        # name = name.capitalize()
        # mood = self.request.get("mood")
        template = jinja_environment.get_template('index.html')
        self.response.out.write(template.render(template_values))  # ERROR HERE
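One further observation, hedged because it is not what the posted error is about: url and url_linktext are used in template_values but never assigned in get(), so once the indentation is fixed the handler would still raise a NameError for them. Below is a sketch of the assignments the App Engine tutorial normally makes with the already-imported users API, placed before template_values is built:

        # Hypothetical assignments mirroring the guestbook tutorial; drop the two
        # keys from template_values instead if login links are not wanted.
        if users.get_current_user():
            url = users.create_logout_url(self.request.uri)
            url_linktext = 'Logout'
        else:
            url = users.create_login_url(self.request.uri)
            url_linktext = 'Login'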
