I'm trying to use the following script, but instead of typing in the URL, I want it to loop over and pull the URLs from a Links.csv file. Eventually I want to export all the results into a new CSV file.
import csv
import requests
from BeautifulSoup import BeautifulSoup
from urllib import urlopen
url = (LINK)
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)
Title = soup.find(id="productTitle")
Price = soup.find(id="priceblock_ourprice")
print Title.text, Price.text
Any help would be appreciated.
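A minimal sketch of that loop, written against Python 3 and bs4 (adjust the imports if you are on the older BeautifulSoup), assuming Links.csv has one URL per row in its first column and that Results.csv is an acceptable name for the output file:

import csv
import requests
from bs4 import BeautifulSoup

with open('Links.csv', newline='') as infile, open('Results.csv', 'w', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    writer.writerow(['url', 'title', 'price'])
    for row in reader:
        url = row[0]  # assumes the URL sits in the first column
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        title = soup.find(id='productTitle')
        price = soup.find(id='priceblock_ourprice')
        # guard against missing elements so one bad page doesn't stop the loop
        writer.writerow([
            url,
            title.text.strip() if title else '',
            price.text.strip() if price else '',
        ])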
So I want to take my MongoDB data and use it in React; it's small, so it won't overwhelm React. My Flask app looks like this:
import subprocess
from flask import Flask, request, jsonify, json
from flask_cors import CORS, cross_origin
import pymongo

disaster = ""
app = Flask(__name__)
CORS(app, support_credentials=True)
client = pymongo.MongoClient("NOT IMPORTANT FOR GITHUB")
db = client["twitterdb"]
col = db["tweets"]
our_mongo_database_compressed = col.find({}, {'user.created_at': 1, 'user.location': 1, '_id': 0})

def request_tweets(disaster):
    print(disaster)
    #subprocess.call("./../../../backend/get_tweets", disaster)

@app.route('/refresh_data', methods=['GET', 'POST'])
@cross_origin(supports_credentials=True)
def refresh_data():
    disaster = request.get_json()
    request_tweets(disaster)
    x = 0
    y = []
    for datas in our_mongo_database_compressed:
        y.append(datas)
        if(x > 100):
            break
        x += 1
    #print(y)
    return str(y)
and my React function looks like:
    this.setState({
        disaster: event.target.value
    })
    axios.post('http://localhost:5000/refresh_data', [this.state.disaster])
        .then(function(response){
            console.log(JSON.parse(response.data));
        })
}
I keep getting an "Unexpected token ' in JSON at position 2" error, and I just want the data to be sent to React.
So I figured it out and I want anyone who has this problem in the future to be able to see this.
for datas in our_mongo_database_compressed:
    y.append(datas)
This creates a list of dictionaries, so y[0]["user"]["location"] would be how you'd get an element from it.
With that in mind, we need to convert this list to a JSON string, which is the format you use to transfer data between Flask and React, so you'd return:
return json.dumps(y)
Now you might think this means that you got a string in React when you write
JSON.parse(response.data)
No. That would be too easy: you have a response object that is secretly a string. So you'd need to change the response object into a string with JSON.stringify:
JSON.parse(JSON.stringify(response.data))
Now you have your JSON in React.
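Putting it together, a minimal sketch of the corrected Flask route under the same setup as the question (same collection and helpers); the only change of substance is returning json.dumps(y) instead of str(y):

@app.route('/refresh_data', methods=['GET', 'POST'])
@cross_origin(supports_credentials=True)
def refresh_data():
    disaster = request.get_json()
    request_tweets(disaster)
    y = []
    for i, datas in enumerate(our_mongo_database_compressed):
        if i > 100:
            break
        y.append(datas)
    # json.dumps turns the list of dicts into a JSON string that React can parse
    return json.dumps(y)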
I want to collect articles from this particular website. I was using BeautifulSoup only earlier, but it was not grabbing the links, so I tried Selenium. I wrote the code below, but it only prints 'None'. I have never used Selenium before, so I don't have much idea about it. What should I change in this code to make it work and give the desired results?
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

base = 'https://metro.co.uk'
url = 'https://metro.co.uk/search/#gsc.tab=0&gsc.q=cybersecurity&gsc.sort=date&gsc.page=7'

browser = webdriver.Safari(executable_path='/usr/bin/safaridriver')
wait = WebDriverWait(browser, 10)
browser.get(url)

link = browser.find_elements_by_class_name('gs-title')
for links in link:
    links.get_attribute('href')
    soup = BeautifulSoup(browser.page_source, 'lxml')
    date = soup.find('span', {'class': 'post-date'})
    title = soup.find('h1', {'class': 'headline'})
    content = soup.find('div', {'class': 'article-body'})
    print(date)
    print(title)
    print(content)
    time.sleep(3)

browser.close()
I want to collect the date, title, and content from all the articles on this page, and from the other pages as well, e.g. pages 7 to 18.
Thank you.
Instead of using Selenium to get the anchors, I tried to extract the page source first with the help of Selenium and then used Beautiful Soup on it.
So, to put it in perspective:
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

base = 'https://metro.co.uk'
url = 'https://metro.co.uk/search/#gsc.tab=0&gsc.q=cybersecurity&gsc.sort=date&gsc.page=7'

browser = webdriver.Safari(executable_path='/usr/bin/safaridriver')
#wait = WebDriverWait(browser, 10) #Not actually required
browser.get(url)

soup = BeautifulSoup(browser.page_source, 'html.parser')  #Get the page source
anchors = soup.find_all("a", class_="gs-title")  #Now find the anchors

for anchor in anchors:
    browser.get(anchor['href'])  #Open the news link and extract its page source
    sub_soup = BeautifulSoup(browser.page_source, 'html.parser')
    date = sub_soup.find('span', {'class': 'post-date'})
    title = sub_soup.find('h1', {'class': 'post-title'})  #Note that the class attribute for the heading is 'post-title', not 'headline'
    content = sub_soup.find('div', {'class': 'article-body'})
    print([date.string, title.string, content.string])

#time.sleep(3) #Even this I don't believe is required
browser.close()
With this modification, I believe you can get your required contents.
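If you also need pages 7 to 18, one option (an assumption about how the search page handles the URL fragment, not something I have verified) is to loop over the gsc.page value and repeat the same extraction for each page:

for page_no in range(7, 19):
    page_url = ('https://metro.co.uk/search/'
                '#gsc.tab=0&gsc.q=cybersecurity&gsc.sort=date&gsc.page=%d' % page_no)
    browser.get(page_url)
    time.sleep(3)  # give the JavaScript-rendered results time to appear
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    anchors = soup.find_all("a", class_="gs-title")
    # ...then visit each anchor['href'] exactly as above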
You can use the same API the page uses. Alter the parameters to get all pages of results.
import requests
import json
import re

# the endpoint returns JSONP, i.e. JSON wrapped in a google.search.cse.api3732(...) callback
r = requests.get('https://cse.google.com/cse/element/v1?rsz=filtered_cse&num=10&hl=en&source=gcsc&gss=.uk&start=60&cselibv=5d7bf4891789cfae&cx=012545676297898659090:wk87ya_pczq&q=cybersecurity&safe=off&cse_tok=AKaTTZjKIBzl-5fANH8dQ8f78cv2:1560500563340&filter=0&sort=date&exp=csqr,4229469&callback=google.search.cse.api3732')

# strip the callback wrapper to get at the JSON payload
p = re.compile(r'api3732\((.*)\);', re.DOTALL)
data = json.loads(p.findall(r.text)[0])

links = [item['clicktrackUrl'] for item in data['results']]
print(links)
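To page through the results, the start parameter appears to be the result offset (10 per page here). A rough sketch, assuming the captured cse_tok and cx values are still valid for your session and that the parameters omitted below are not strictly required:

all_links = []
for start in range(0, 120, 10):  # result offsets for pages 1-12
    r = requests.get(
        'https://cse.google.com/cse/element/v1',
        params={
            'rsz': 'filtered_cse', 'num': 10, 'hl': 'en', 'source': 'gcsc',
            'start': start,
            'cx': '012545676297898659090:wk87ya_pczq',
            'q': 'cybersecurity', 'sort': 'date', 'safe': 'off', 'filter': 0,
            'cse_tok': 'AKaTTZjKIBzl-5fANH8dQ8f78cv2:1560500563340',
            'callback': 'google.search.cse.api3732',
        })
    data = json.loads(p.findall(r.text)[0])
    all_links.extend(item['clicktrackUrl'] for item in data.get('results', []))
print(all_links)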
I saw this code and I would like to try/implement it on a web page to display some data (you can see it in the image below).
import requests
from bs4 import BeautifulSoup
import time

def get_count():
    url = "https://data-live.flightradar24.com/zones/fcgi/feed.js?bounds=59.09,52.64,-58.77,-47.71&faa=1&mlat=1&flarm=1&adsb=1&gnd=1&air=1&vehicles=1&estimated=1&maxage=7200&gliders=1&stats=1"

    # Request with fake header, otherwise you will get a 403 HTTP error
    r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})

    # Parse the JSON
    data = r.json()
    counter = 0

    # Iterate over the elements to get the number of total flights
    for element in data["stats"]["total"]:
        counter += data["stats"]["total"][element]

    return counter

while True:
    print(get_count())
    time.sleep(8)
I am just starting to learn web scraping with beautiful soup.
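Since the goal seems to be putting this number on a web page, one possibility is a small Flask view that calls get_count(); this is purely a sketch under that assumption (Flask is not part of the original snippet, and the route name is made up):

from flask import Flask

app = Flask(__name__)

@app.route('/')
def flight_count():
    # re-uses the get_count() helper from the snippet above
    return "Total flights right now: %d" % get_count()

if __name__ == '__main__':
    app.run(debug=True)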
Ok guys, I am having tons of problems getting my working dev server onto a working production server :). I have a task that will go through and request URLs and collect and update data. It takes 30 minutes to run.
I uploaded it to the production server, and when going to the URL with its corresponding .py script, appname.appspot.com/tasks/rrs, after 30 seconds I get a google.appengine.runtime.DeadlineExceededError. Is there any way to get around this? Is this a 30-second deadline for a page? This script works fine on the development server: I go to the URL and the associated .py script runs until completion.
import time
import random
import string
import cPickle
from StringIO import StringIO
try:
    import json
except ImportError:
    import simplejson as json
import urllib
import pprint
import datetime
import sys
sys.path.append("C:\Program Files (x86)\Google\google_appengine")
sys.path.append("C:\Program Files (x86)\Google\google_appengine\lib\yaml\lib")
sys.path.append("C:\Program Files (x86)\Google\google_appengine\lib\webob")
from google.appengine.api import users
from google.appengine.ext import webapp
from google.appengine.ext.webapp.util import run_wsgi_app
from google.appengine.ext import db


class SR(db.Model):
    name = db.StringProperty()
    title = db.StringProperty()
    url = db.StringProperty()


## request url and return JSON_data
def overview(page):
    u = urllib.urlopen(page)
    bytes = StringIO(u.read())
    ##print bytes
    u.close()
    try:
        JSON_data = json.load(bytes)
        return JSON_data
    except ValueError, e:
        print e, " Couldn't get .json for %s" % page
        return None


## specific code to parse particular JSON data and append new SR objects to the given url list
def parse_json(JSON_data, lists):
    sr = SR()
    sr.name = ##data gathered
    sr.title = ##data gathered
    sr.url = ##data gathered
    lists.append(sr)
    return lists


## I want to be able to request, let's say, 500 pages without timing out
page = 'someurlpage.com'  ##starting url
url_list = []
for z in range(0, 500):
    page = 'someurlpage.com/%s' % z
    JSON_data = overview(page)  ##get json data for a given url page
    url_list = parse_json(JSON_data, url_list)  ##parse the json data and append class objects to a given list
db.put(url_list)  ##finally add the objects to the gae database
Yes, App Engine imposes a 30-second request deadline. One way around it might be to try/except the DeadlineExceededError and put the rest of the work on a task queue.
But you can't make your requests run for a longer period.
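A rough sketch of that catch-and-requeue idea, assuming the work is wrapped in a handler mapped to /tasks/rrs and that a start offset is passed along (the handler and parameter names are illustrative):

from google.appengine.api import taskqueue
from google.appengine.runtime import DeadlineExceededError

class PageWorker(webapp.RequestHandler):
    def get(self):
        start = int(self.request.get('start', 0))
        url_list = []
        try:
            for z in range(start, 500):
                page = 'someurlpage.com/%s' % z
                url_list = parse_json(overview(page), url_list)
        except DeadlineExceededError:
            # save what we have and hand the remaining pages to a task
            db.put(url_list)
            taskqueue.add(url='/tasks/rrs', params={'start': z}, method='GET')
            return
        db.put(url_list)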
You can also try Bulkupdate
Example:
class Todo(db.Model):
    page = db.StringProperty()


class BulkPageParser(bulkupdate.BulkUpdater):
    def get_query(self):
        return Todo.all()

    def handle_entity(self, entity):
        JSON_data = overview(entity.page)
        db.put(parse_json(JSON_data, []))
        entity.delete()

# Put this in your view code:
for i in range(500):
    Todo(page='someurlpage.com/%s' % i).put()

job = BulkPageParser()
job.start()
OK, so if I am dynamically adding links as I parse the pages, I would add them to the Todo queue like so, I believe:
def handle_entity(self, entity):
    JSON_data = overview(entity.page)
    data_gathered, new_links = parse_json(JSON_data, [])  ##as before, returns a list of sr objects, and now also a list of new links/pages to visit
    db.put(data_gathered)
    for link in new_links:
        Todo(page=link).put()
    entity.delete()
After a lot of struggle trying to crawl an AngularJS page with single sign-on, I have put together this code. It runs fine, logs in, opens the desired page, and scrapes it, but I am not getting all the links and text present in the website loaded by Angular. My XPath seems to be correct.
Also, it is not crawling the links that are getting extracted. What do I need to change in my code to extract all the text present in the website and the subsequent web pages?
import scrapy
from scrapy import signals
from scrapy.http import TextResponse
from scrapy.xlib.pydispatch import dispatcher
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from ps_crawler.items import PsCrawlerItem
import time
from selenium.webdriver.common.keys import Keys


class SISSpider(scrapy.Spider):
    name = "SIS"
    allowed_domains = ["domain.com"]
    start_urls = ["https://domain.com/login?"]

    def __init__(self):
        self.driver = webdriver.Chrome()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        self.driver.close()

    def parse(self, response):
        # selenium part of the job
        self.driver.get("https://domain.com/login?")
        time.sleep(5)
        self.driver.find_element_by_xpath('//*[@id="Login"]/div[2]/div[1]/div[2]/form/div[1]/input').send_keys("ssasdad")
        self.driver.find_element_by_xpath('//*[@id="Login"]/div[2]/div[1]/div[2]/form/div[2]/input').send_keys("")
        #self.driver.find_element_by_xpath('//*[@id="login"]').click()
        more_btn = WebDriverWait(self.driver, 10).until(
            EC.visibility_of_element_located((By.XPATH, '//*[@id="login"]'))
        )
        time.sleep(5)
        more_btn.click()
        time.sleep(5)

        self.driver.execute_script("window.open('https://domain.com/#/admin','_blank');")
        time.sleep(10)
        window_now = self.driver.window_handles[1]
        self.driver.switch_to_window(window_now)

        ## stop when we reach the desired page
        #if self.driver.current_url.endswith('page=20'):
        #    break

        # now scrapy should do the job
        time.sleep(10)
        response = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        time.sleep(10)
        for post in response.xpath('//div'):
            item = PsCrawlerItem()
            print post.xpath('a/span/text()').extract(), post.xpath('a/@href').extract(), post.xpath('a/@ng-href').extract()
You just need to tweak your xpath a little bit as follows. Hope this solves the problem.
for post in response.xpath('//body'):
    print post.xpath('//text()').extract(), post.xpath('//a//@href').extract()