How to use web-scraped data in a web page / database

I saw this code and I would like to implement it on a web page so I can display some of the data it collects.
import requests
from bs4 import BeautifulSoup
import time

def get_count():
    url = "https://data-live.flightradar24.com/zones/fcgi/feed.js?bounds=59.09,52.64,-58.77,-47.71&faa=1&mlat=1&flarm=1&adsb=1&gnd=1&air=1&vehicles=1&estimated=1&maxage=7200&gliders=1&stats=1"

    # Request with a fake header, otherwise you will get a 403 HTTP error
    r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})

    # Parse the JSON
    data = r.json()
    counter = 0

    # Iterate over the elements to get the number of total flights
    for element in data["stats"]["total"]:
        counter += data["stats"]["total"][element]

    return counter

while True:
    print(get_count())
    time.sleep(8)
I am just starting to learn web scraping with Beautiful Soup.
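
One hedged way to get that count onto a web page is to persist each reading and serve it over HTTP. The sketch below uses SQLite and Flask; the flights.db file, the flight_counts table, and the /count route are placeholders made up for illustration. The scraping loop would then call save_count(get_count()) instead of print(get_count()), and the page can poll /count.

import sqlite3
import time
from flask import Flask, jsonify

app = Flask(__name__)
DB_PATH = "flights.db"  # assumed local SQLite file

def save_count(count):
    # store one reading with a timestamp
    with sqlite3.connect(DB_PATH) as conn:
        conn.execute("CREATE TABLE IF NOT EXISTS flight_counts (ts REAL, total INTEGER)")
        conn.execute("INSERT INTO flight_counts VALUES (?, ?)", (time.time(), count))

@app.route("/count")
def latest_count():
    # return the most recent reading as JSON for the page to display
    with sqlite3.connect(DB_PATH) as conn:
        row = conn.execute(
            "SELECT ts, total FROM flight_counts ORDER BY ts DESC LIMIT 1"
        ).fetchone()
    return jsonify({"timestamp": row[0], "count": row[1]} if row else {})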

Related

Unable to get Selenium Code Working to POST data to Court (PACER) Practice/Training Site

I have been trying to create a script to do the following while I have access to a training website I am using for electronic case filings. This requires the following steps:
1. Go to the following site: https://ecf-train.nvb.uscourts.gov/
2. Click the link on that page labeled "District of Nevada Train Database - Document Filing System".
3. This redirects me to the page where I POST my login credentials (username / password / test client code / flag=1): https://train-login.uscourts.gov/csologin/login.jsf?pscCourtId=NVTBK&appurl=https://ecf-train.nvb.uscourts.gov/cgi-bin/login.pl
4. The "flag" checks a "redaction" checkbox and is then supposed to land me on the main access page, where I select what I want to e-file in the testing system (this is all approved by the Court for testing, FYI). I am attaching a screenshot of what it should look like when logged in successfully; from there I click the "Bankruptcy" header link, which brings me to the next target page (https://ecf-train.nvb.uscourts.gov/cgi-bin/DisplayMenu.pl?BankruptcyEvents&id=1227536), where I can select the link for "Case Upload" (see attached screenshot).
I am new to Python, but I believe I took all the correct steps to download ChromeDriver and install Selenium using "pip install selenium". However, my code is not finding the webdriver "PATH", which points to a file saved directly on my "C:" drive.
Any help getting this to work is HUGELY appreciated. Here is the code I have been using, with the errors as well:
CODE:
import requests
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import sys
import os
import json
import time

# pass in header info which is "Content-Type:application/json"
url = "https://ecf-train.nvb.uscourts.gov/"

# payload contains the header information for Content-Type and Accept
payload = {"Content-Type": "application/json", "Accept": "application/json"}
r = requests.post(url, data=payload)
pacer_target = requests.get(url)
pacer_target.text
print(r.text)

# defining the api-endpoint
API_ENDPOINT = 'url'
requests.get(API_ENDPOINT.text)

# data to be sent to api - I removed my userid and pw from this post
data = {
    "loginId": "XXXXXXXXX",
    "password": "XXXXXXXXX",
    "clientCode": "anycode",
    "redactFlag": "1"}

# sending post request and saving response as response object
r = requests.post(url=API_ENDPOINT, data=data)
print(r.text, "STEP 1 - logged into main Training screen!!")

# Load the Selenium executable file for Chrome from the C:\ drive and use that location path
PATH = r'C:\chromedriver.exe'
driver = webdriver.Chrome(PATH)
# browser = webdriver.Chrome()
continue_link = driver.find_element(By.PARTIAL_LINK_TEXT, 'cgi-bin/login')
continue_link.click()

# get response from the clicked URL and print the response
response = requests.get(url)
print(response.text, "this is step 2 - just clicked the main login button!!!!!")

# sending post request and saving response as response object
r = requests.post(url=API_ENDPOINT, data=data)

# extracting response text
pastebin_url = r.text
print("The pastebin URL is:%s" % pastebin_url)

Sending an array as JSON from Flask to React.js

I am using Flask as a backend server for OpenCV's facial detection module. I cannot seem to access the x and y coordinates of the face and send them to the React.js front end. I am new to Flask; this is the error I am receiving:
Uncaught (in promise) SyntaxError: Unexpected token < in JSON at position 0.
I believe this is because the Flask server is not sending readable JSON to React.js, but I am sure I am doing other things improperly as well. I am wondering how exactly I can save data into an array and send it to React in JSON format.
camera.py:
import cv2
import json

face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')

class VideoCamera(object):
    def __init__(self):
        self.video = cv2.VideoCapture(0)

    # must release the webcam, otherwise it becomes "trapped" and you cannot
    # use it for other apps until the computer restarts
    def __del__(self):
        self.video.release()

    # getting the frame and converting it to jpeg format
    def get_frame(self):
        success, frame = self.video.read()
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.3, 5)
        for (x, y, w, h) in faces:
            cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 3)
            break  # exit loop after rendering one face
        ret, jpeg = cv2.imencode('.jpg', frame)
        return jpeg.tobytes()

    def get_axis(self):
        success, frame = self.video.read()
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        axisArr = []
        faces = face_cascade.detectMultiScale(gray, 1.3, 5)
        for (x, y, w, h) in faces:
            axisArr = [x, y, w, h]
            break  # exit loop after recording one face
        return json.dumps(axisArr)
server.py:
from flask import Flask, render_template, Response
from camera import VideoCamera

app = Flask(__name__, template_folder='../../frontend/src')

def index():
    return render_template('index.html')

def setAxis(axisGen):
    global axis
    axis = axisGen

def gen(camera):
    while True:
        frame = camera.get_frame()
        axisGen = camera.get_axis()
        setAxis(axisGen)
        # yields the multipart chunk used by the html render template
        yield (b'--frame\r\n'
               b'Content-Type: image/jpeg\r\n\r\n' + frame + b'\r\n\r\n')

@app.route('/video_feed')
def video_feed():
    # explains the type of response the browser is receiving: a stream of jpeg frames
    return Response(gen(VideoCamera()),
                    mimetype='multipart/x-mixed-replace; boundary=frame')

@app.route('/members')
def members():
    return {"members": axis}

# initialize the server to run the flask app on; debug mode means you do not
# need to close and reopen the server to see changes
if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=4999)
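
A hedged guess at the "Unexpected token <" error: the frontend is receiving an HTML error page (which starts with "<") rather than JSON, for example because axis has not been set yet or the route is not registered; and even when it works, axis is already a JSON string, so the client gets doubly encoded data. A minimal sketch of a /members handler that returns the coordinates as plain JSON, assuming the coordinates are available as a list; the latest_axis name is a placeholder:

from flask import Flask, jsonify

app = Flask(__name__)
latest_axis = []  # placeholder: e.g. [x, y, w, h], updated by the camera loop elsewhere

@app.route('/members')
def members():
    # convert any numpy integers to plain ints so they are JSON serializable,
    # and let jsonify set the application/json Content-Type
    return jsonify({"members": [int(v) for v in latest_axis]})

On the React side, fetch('/members').then(r => r.json()) (or axios's response.data) should then be a real object instead of raw HTML.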

How do I get MongoDB JSON from Flask to React?

So I want to take my MongoDB data and use it in React; it's small, so it won't overwhelm React. My Flask code looks like this:
import subprocess
from flask import Flask, request, jsonify, json
from flask_cors import CORS, cross_origin
import pymongo

disaster = ""
app = Flask(__name__)
CORS(app, support_credentials=True)

client = pymongo.MongoClient("NOT IMPORTANT FOR GITHUB")
db = client["twitterdb"]
col = db["tweets"]
our_mongo_database_compressed = col.find({}, {'user.created_at': 1, 'user.location': 1, '_id': 0})

def request_tweets(disaster):
    print(disaster)
    #subprocess.call("./../../../backend/get_tweets", disaster)

@app.route('/refresh_data', methods=['GET', 'POST'])
@cross_origin(supports_credentials=True)
def refresh_data():
    disaster = request.get_json()
    request_tweets(disaster)
    x = 0
    y = []
    for datas in our_mongo_database_compressed:
        y.append(datas)
        if(x > 100):
            break
        x += 1
    #print(y)
    return str(y)
and my React function looks like:
this.setState({
    disaster: event.target.value
})
axios.post('http://localhost:5000/refresh_data', [this.state.disaster])
    .then(function(response){
        console.log(JSON.parse(response.data));
    })
}
I keep getting "Unexpected token ' in JSON at position 2", and I just want the data to be sent to React.
So I figured it out, and I want anyone who has this problem in the future to be able to see this.
for datas in our_mongo_database_compressed:
    y.append(datas)
This creates an array of dictionaries, so y[0]["user"]["location"] is how you'd get an element from this array.
With that in mind, we need to change this array to a JSON string, which is the data type used to transfer data between Flask and React, so you'd return:
return json.dumps(y)
Now you might think this means that you got a string in React when you write
JSON.parse(response.data)
NO. That would be too easy; you have a response object that is secretly a string. So you'd need to change the response object into a string with JSON.stringify:
JSON.parse(JSON.stringify(response.data))
Now you have your JSON in React.
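
For reference, a minimal sketch of the corrected route with that change applied, reusing the app, col, request, request_tweets, and json names from the Flask code above and assuming the projected fields are JSON-serializable; the 100-item cap from the original loop is kept via limit():

@app.route('/refresh_data', methods=['GET', 'POST'])
def refresh_data():
    disaster = request.get_json()
    request_tweets(disaster)
    cursor = col.find({}, {'user.created_at': 1, 'user.location': 1, '_id': 0}).limit(100)
    y = [datas for datas in cursor]
    return json.dumps(y)  # JSON text that the React side can parse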

How to get links using Selenium and scrape them using BeautifulSoup?

I want to collect articles from this particular website. I was using Beautiful Soup alone earlier, but it was not grabbing the links, so I tried to use Selenium and wrote this code. It is giving the output 'None'. I have never used Selenium before, so I don't know much about it. What should I change in this code to make it work and give the desired results?
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

base = 'https://metro.co.uk'
url = 'https://metro.co.uk/search/#gsc.tab=0&gsc.q=cybersecurity&gsc.sort=date&gsc.page=7'

browser = webdriver.Safari(executable_path='/usr/bin/safaridriver')
wait = WebDriverWait(browser, 10)
browser.get(url)

link = browser.find_elements_by_class_name('gs-title')
for links in link:
    links.get_attribute('href')

soup = BeautifulSoup(browser.page_source, 'lxml')
date = soup.find('span', {'class': 'post-date'})
title = soup.find('h1', {'class': 'headline'})
content = soup.find('div', {'class': 'article-body'})
print(date)
print(title)
print(content)
time.sleep(3)
browser.close()
I want to collect the date, title, and content from all the articles on this page, and on other pages as well, e.g. pages 7 to 18.
Thank you.
Instead of using Selenium to get the anchors, I tried to extract the page source first with the help of Selenium and then used Beautiful Soup on it.
So, to put it in perspective:
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

base = 'https://metro.co.uk'
url = 'https://metro.co.uk/search/#gsc.tab=0&gsc.q=cybersecurity&gsc.sort=date&gsc.page=7'

browser = webdriver.Safari(executable_path='/usr/bin/safaridriver')
#wait = WebDriverWait(browser, 10) #Not actually required
browser.get(url)

soup = BeautifulSoup(browser.page_source, 'html.parser') #Get the Page Source
anchors = soup.find_all("a", class_="gs-title") #Now find the anchors

for anchor in anchors:
    browser.get(anchor['href']) #Connect to the News Link, and extract its Page Source
    sub_soup = BeautifulSoup(browser.page_source, 'html.parser')
    date = sub_soup.find('span', {'class': 'post-date'})
    title = sub_soup.find('h1', {'class': 'post-title'}) #Note that the class attribute for the heading is 'post-title' and not 'headline'
    content = sub_soup.find('div', {'class': 'article-body'})
    print([date.string, title.string, content.string])
    #time.sleep(3) #Even this I don't believe is required

browser.close()
With this modification, I believe you can get your required contents.
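
The question also asks about pages 7 to 18. One hedged option, not part of the answer above, is to loop the same scrape over the gsc.page value in the URL fragment, reusing the browser, BeautifulSoup, and time names already set up; since the Google CSE widget paginates client-side, a short wait after each navigation is needed, and depending on the browser an explicit browser.refresh() may also be required:

# sketch only: walk result pages 7-18 by changing gsc.page in the URL fragment
for page_no in range(7, 19):
    browser.get('https://metro.co.uk/search/#gsc.tab=0&gsc.q=cybersecurity'
                '&gsc.sort=date&gsc.page=%d' % page_no)
    time.sleep(3)  # give the CSE widget time to render the results
    page_soup = BeautifulSoup(browser.page_source, 'html.parser')
    for anchor in page_soup.find_all("a", class_="gs-title"):
        print(anchor.get('href'))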
You can use the same API the page uses. Alter the parameters to get all pages of results:
import requests
import json
import re
r = requests.get('https://cse.google.com/cse/element/v1?rsz=filtered_cse&num=10&hl=en&source=gcsc&gss=.uk&start=60&cselibv=5d7bf4891789cfae&cx=012545676297898659090:wk87ya_pczq&q=cybersecurity&safe=off&cse_tok=AKaTTZjKIBzl-5fANH8dQ8f78cv2:1560500563340&filter=0&sort=date&exp=csqr,4229469&callback=google.search.cse.api3732')
p = re.compile(r'api3732\((.*)\);', re.DOTALL)
data = json.loads(p.findall(r.text)[0])
links = [item['clicktrackUrl'] for item in data['results']]
print(links)
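
A sketch of what "alter the parameters" could look like: loop over the start offset in steps of 10 while keeping the other query-string values the same. Note that the cse_tok token above is tied to a session and expires, so it may need to be re-read from the search page first.

# sketch only: page through the CSE endpoint by changing the `start` offset;
# the cse_tok and cx values are copied from the request above and may need refreshing
import requests
import json
import re

p = re.compile(r'api3732\((.*)\);', re.DOTALL)
all_links = []

for start in range(0, 120, 10):  # e.g. the first 12 pages of results
    url = ('https://cse.google.com/cse/element/v1?rsz=filtered_cse&num=10&hl=en&source=gcsc'
           '&gss=.uk&cselibv=5d7bf4891789cfae&cx=012545676297898659090:wk87ya_pczq'
           '&q=cybersecurity&safe=off&cse_tok=AKaTTZjKIBzl-5fANH8dQ8f78cv2:1560500563340'
           '&filter=0&sort=date&exp=csqr,4229469&callback=google.search.cse.api3732'
           '&start=%d' % start)
    r = requests.get(url)
    data = json.loads(p.findall(r.text)[0])
    all_links.extend(item['clicktrackUrl'] for item in data.get('results', []))

print(all_links)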

<class 'google.appengine.runtime.DeadlineExceededError'>: how to get around?

OK, I am having tons of problems getting my working dev server onto a working production server :). I have a task that goes through and requests URLs, then collects and updates data. It takes 30 minutes to run.
I uploaded to the production server, and when I go to the URL with its corresponding .py script (appname.appspot.com/tasks/rrs), after 30 seconds I get <class 'google.appengine.runtime.DeadlineExceededError'>. Is there any way to get around this? Is this a 30-second deadline per page? The script works fine on the development server: I go to the URL and the associated .py script runs until completion.
import time
import random
import string
import cPickle
from StringIO import StringIO
try:
    import json
except ImportError:
    import simplejson as json
import urllib
import pprint
import datetime
import sys
sys.path.append("C:\Program Files (x86)\Google\google_appengine")
sys.path.append("C:\Program Files (x86)\Google\google_appengine\lib\yaml\lib")
sys.path.append("C:\Program Files (x86)\Google\google_appengine\lib\webob")
from google.appengine.api import users
from google.appengine.ext import webapp
from google.appengine.ext.webapp.util import run_wsgi_app
from google.appengine.ext import db


class SR(db.Model):
    name = db.StringProperty()
    title = db.StringProperty()
    url = db.StringProperty()


## request url and return JSON_data
def overview(page):
    u = urllib.urlopen(page)
    bytes = StringIO(u.read())
    ##print bytes
    u.close()
    try:
        JSON_data = json.load(bytes)
        return JSON_data
    except ValueError, e:
        print e, " Couldn't get .json for %s" % page
        return None


## specific code to parse particular JSON data and append new SR objects to the given url list
def parse_json(JSON_data, lists):
    sr = SR()
    sr.name = ##data gathered
    sr.title = ##data gathered
    sr.url = ##data gathered
    lists.append(sr)
    return lists


## I want to be able to request, let's say, 500 pages without timing out
page = 'someurlpage.com'  ##starting url
url_list = []
for z in range(0, 500):
    page = 'someurlpage.com/%s' % z
    JSON_data = overview(page)  ##get json data for a given url page
    url_list = parse_json(JSON_data, url_list)  ##parse the json data and append class objects to a given list
db.put(url_list)  ##finally add the objects to the gae database
Yes, App Engine imposes a 30-second deadline on requests. One way around it might be to catch DeadlineExceededError and put the remaining work in a task queue, roughly as sketched below. But you can't make an individual request run for a longer period.
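A rough sketch of that pattern, assuming the work can be split per page and reusing the overview, parse_json, and db names from the question; the /tasks/rrs_worker URL and the start parameter are made-up names for illustration:

# sketch only: catch the deadline error and push the unfinished pages onto the task queue
from google.appengine.api import taskqueue
from google.appengine.runtime import DeadlineExceededError

def process_pages(start):
    url_list = []
    try:
        for z in range(start, 500):
            JSON_data = overview('someurlpage.com/%s' % z)
            url_list = parse_json(JSON_data, url_list)
    except DeadlineExceededError:
        # save what we have and hand the remaining pages to a background task,
        # which gets a longer (10 minute) deadline than a user-facing request
        db.put(url_list)
        taskqueue.add(url='/tasks/rrs_worker', params={'start': z})
        return
    db.put(url_list)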
You can also try Bulkupdate
Example:
class Todo(db.Model):
    page = db.StringProperty()

class BulkPageParser(bulkupdate.BulkUpdater):
    def get_query(self):
        return Todo.all()

    def handle_entity(self, entity):
        JSON_data = overview(entity.page)
        db.put(parse_json(JSON_data, []))
        entity.delete()

# Put this in your view code:
for i in range(500):
    Todo(page='someurlpage.com/%s' % i).put()
job = BulkPageParser()
job.start()
OK, so if I am dynamically adding links as I parse the pages, I believe I would add to the Todo queue like so:
def handle_entity(self, entity):
    JSON_data = overview(entity.page)
    data_gathered, new_links = parse_json(JSON_data, [])  ##as earlier, returns a list of sr objects, and now also a list of new links/pages to visit
    db.put(data_gathered)
    for link in new_links:
        Todo(page=link).put()
    entity.delete()
