I want to collect articles from this particular website. I was using BeautifulSoup alone earlier, but it was not grabbing the links, so I switched to Selenium. I wrote the code below, but it just prints 'None'. I have never used Selenium before, so I don't have much idea about it. What should I change in this code to make it work and give the desired results?
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

base = 'https://metro.co.uk'
url = 'https://metro.co.uk/search/#gsc.tab=0&gsc.q=cybersecurity&gsc.sort=date&gsc.page=7'

browser = webdriver.Safari(executable_path='/usr/bin/safaridriver')
wait = WebDriverWait(browser, 10)
browser.get(url)

link = browser.find_elements_by_class_name('gs-title')
for links in link:
    links.get_attribute('href')

soup = BeautifulSoup(browser.page_source, 'lxml')

date = soup.find('span', {'class': 'post-date'})
title = soup.find('h1', {'class': 'headline'})
content = soup.find('div', {'class': 'article-body'})

print(date)
print(title)
print(content)

time.sleep(3)
browser.close()
I want to collect the date, title, and content from all the articles on this page, and on the other pages as well (page 7 through page 18).
Thank you.
Instead of using Selenium to get the anchors, I extracted the page source first with Selenium and then ran Beautiful Soup over it.
So, putting it together:
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

base = 'https://metro.co.uk'
url = 'https://metro.co.uk/search/#gsc.tab=0&gsc.q=cybersecurity&gsc.sort=date&gsc.page=7'

browser = webdriver.Safari(executable_path='/usr/bin/safaridriver')
# wait = WebDriverWait(browser, 10)  # Not actually required
browser.get(url)

soup = BeautifulSoup(browser.page_source, 'html.parser')  # Get the page source
anchors = soup.find_all("a", class_="gs-title")           # Now find the anchors

for anchor in anchors:
    browser.get(anchor['href'])  # Open the news link and extract its page source
    sub_soup = BeautifulSoup(browser.page_source, 'html.parser')

    date = sub_soup.find('span', {'class': 'post-date'})
    title = sub_soup.find('h1', {'class': 'post-title'})  # Note that the class attribute for the heading is 'post-title', not 'headline'
    content = sub_soup.find('div', {'class': 'article-body'})

    print([date.string, title.string, content.string])
    # time.sleep(3)  # Even this I don't believe is required

browser.close()
With this modification, I believe you can get the contents you need.
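Since you also want pages 7 to 18: the page number lives in the gsc.page part of the URL fragment, so one way (a minimal sketch, not tested against the site) is to loop over that value, collect the gs-title anchors from each search page, and then visit them exactly as above. Because only the fragment changes between iterations, a refresh plus a short wait is added so the results re-render:

for page in range(7, 19):
    browser.get(f'https://metro.co.uk/search/#gsc.tab=0&gsc.q=cybersecurity&gsc.sort=date&gsc.page={page}')
    browser.refresh()   # changing only the fragment may not trigger a reload on its own
    time.sleep(3)       # crude wait; waiting explicitly for 'a.gs-title' would be more robust
    page_soup = BeautifulSoup(browser.page_source, 'html.parser')
    anchors = page_soup.find_all('a', class_='gs-title')
    # ...then loop over these anchors and scrape each article as shown above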
You can use the same API the page uses. Alter the parameters to get all pages of results:
import requests
import json
import re

# The CSE endpoint returns JSONP wrapped in a callback, e.g. google.search.cse.api3732(...)
r = requests.get('https://cse.google.com/cse/element/v1?rsz=filtered_cse&num=10&hl=en&source=gcsc&gss=.uk&start=60&cselibv=5d7bf4891789cfae&cx=012545676297898659090:wk87ya_pczq&q=cybersecurity&safe=off&cse_tok=AKaTTZjKIBzl-5fANH8dQ8f78cv2:1560500563340&filter=0&sort=date&exp=csqr,4229469&callback=google.search.cse.api3732')

# Strip the JSONP wrapper and parse the JSON payload
p = re.compile(r'api3732\((.*)\);', re.DOTALL)
data = json.loads(p.findall(r.text)[0])

links = [item['clicktrackUrl'] for item in data['results']]
print(links)
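To cover the other result pages, the start parameter looks like the zero-based result offset (start=60 with 10 results per page corresponds to page 7), so a sketch that walks pages 7 to 18 could look like the following. Note that cse_tok and the callback name are session-specific values copied from the browser's network tab, so they may expire and need to be re-read from the search page:

all_links = []
for start in range(60, 180, 10):  # pages 7-18, assuming 10 results per page
    r = requests.get(
        'https://cse.google.com/cse/element/v1',
        params={
            'rsz': 'filtered_cse', 'num': 10, 'hl': 'en', 'source': 'gcsc', 'gss': '.uk',
            'start': start, 'cselibv': '5d7bf4891789cfae',
            'cx': '012545676297898659090:wk87ya_pczq', 'q': 'cybersecurity', 'safe': 'off',
            'cse_tok': 'AKaTTZjKIBzl-5fANH8dQ8f78cv2:1560500563340',
            'filter': 0, 'sort': 'date', 'exp': 'csqr,4229469',
            'callback': 'google.search.cse.api3732',
        },
    )
    data = json.loads(p.findall(r.text)[0])  # reuse the regex defined above
    all_links.extend(item['clicktrackUrl'] for item in data['results'])
print(all_links)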
I need a code snippet that can perform the "Add extension" click.
Here is the action that needs to be performed: https://i.stack.imgur.com/P3R0X.png
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

browser = webdriver.Edge("./msedgedriver")
browser.get(
    "https://microsoftedge.microsoft.com/addons/detail/adblock-%E2%80%94-best-ad-blocker/ndcileolkflehcjpmjnfbnaibdcgglog"
)
time.sleep(5)

ele = browser.find_element(
    By.XPATH, value="/html/body/div[2]/div[3]/div/div/div/div[2]/div[3]/div/button"
)
time.sleep(2)
ele.click()

time.sleep(5)
browser.quit()
I already know how to install it using a CRX file, so I don't need that method.
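For what it's worth, if the fixed sleeps around that click turn out to be flaky, a variant with an explicit wait (a sketch that reuses the same absolute XPath from the snippet above, which is itself an assumption about the store page's layout) would be:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Edge("./msedgedriver")
browser.get(
    "https://microsoftedge.microsoft.com/addons/detail/adblock-%E2%80%94-best-ad-blocker/ndcileolkflehcjpmjnfbnaibdcgglog"
)

# Wait until the button located by the same absolute XPath becomes clickable,
# instead of sleeping for a fixed number of seconds.
button = WebDriverWait(browser, 15).until(
    EC.element_to_be_clickable(
        (By.XPATH, "/html/body/div[2]/div[3]/div/div/div/div[2]/div[3]/div/button")
    )
)
button.click()

browser.quit()

Keep in mind that if the "Add extension" confirmation in the screenshot is the browser's own prompt rather than part of the page, it is not in the DOM and cannot be reached with find_element; that case is exactly where the CRX install route comes in.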
I have created a simple bot that, on entering the Betfair website, simply clicks the accept-cookies button. It has been working for months, but now, all of a sudden, once the page is entered it just keeps loading without performing the action. Any idea why?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time

url = r"C:\Users\salde\Desktop\chromedriver_win32 (1)\chromedriver.exe"
driver = webdriver.Chrome(executable_path=url)

## website to navigate to
driver.get('https://www.betfair.com/exchange/plus/')

## to accept cookies at the entry to the website
element = WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable((By.XPATH, '//*[@id="onetrust-accept-btn-handler"]'))
)
element.click()

time.sleep(2)

btnCookies = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]')
btnCookies.click()
As you've defined them, your variables element and btnCookies target the same element, and you shouldn't need to click it twice. (In fact, the second time around the element is no longer displayed, which is why you get the "element not interactable" error.) At least try cleaning up your code by removing these last few lines and see what happens:
time.sleep(2)
btnCookies = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]')
btnCookies.click()
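Put together, the cleaned-up version (your setup unchanged, clicking the button exactly once through the explicit wait) would be:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(executable_path=r"C:\Users\salde\Desktop\chromedriver_win32 (1)\chromedriver.exe")
driver.get('https://www.betfair.com/exchange/plus/')

# Wait for the OneTrust accept button to become clickable and click it once
element = WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable((By.XPATH, '//*[@id="onetrust-accept-btn-handler"]'))
)
element.click()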
The page I am scraping is dynamic, with a 'Load More' button, so I used Selenium for that.
The first problem is that it only works once, i.e. it clicks the 'Load More' button only the first time.
The second problem is that it scrapes only the articles that appear before the first 'Load More' button, and nothing after that.
The third problem is that it scrapes every article twice.
The fourth problem is that I only want the date, but it returns the author and place along with the date.
import time
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

base = "https://indianexpress.com"

browser = webdriver.Safari(executable_path='/usr/bin/safaridriver')
wait = WebDriverWait(browser, 10)
browser.get('https://indianexpress.com/?s=cybersecurity')

while True:
    try:
        time.sleep(6)
        show_more = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Load More')))
        show_more.click()
    except Exception as e:
        print(e)
        break

soup = BeautifulSoup(browser.page_source, 'lxml')
search_results = soup.find('div', {'id': 'ie-infinite-scroll'})
links = search_results.find_all('a')

for link in links:
    link_url = link['href']
    response = requests.get(link_url)
    sauce = BeautifulSoup(response.text, 'html.parser')

    dateTag = sauce.find('div', {'class': 'm-story-meta__credit'})
    titleTag = sauce.find('h1', {'class': 'm-story-header__title'})
    contentTag = ' '.join([item.get_text(strip=True) for item in sauce.select("[class^='o-story-content__main a-wysiwyg'] p")])

    date = None
    title = None
    content = None

    if isinstance(dateTag, Tag):
        date = dateTag.get_text().strip()
    if isinstance(titleTag, Tag):
        title = titleTag.get_text().strip()

    print(f'{date}\n {title}\n {contentTag}\n')
    time.sleep(3)
There is no error in this code, but it needs refinement. What should I do to solve the problems mentioned above?
Thanks.
This happens because you are not waiting for the new content. While the new content is still loading, you are trying to click the 'Load More' button again.
Error message:
Message: Element <a class="m-featured-link m-featured-link--centered ie-load-more" href="#"> is not clickable at point (467,417) because another element <div class="o-listing__load-more m-loading"> obscures it
My solution:
while True:
    try:
        wait.until(EC.element_to_be_clickable((By.XPATH, "//a[contains(@class, 'ie-load-more')]")))
        browser.find_element_by_xpath("//a[contains(@class, 'ie-load-more')]").click()
        wait.until(EC.visibility_of_element_located((By.XPATH, "//div[@class='o-listing__load-more']")))
    except Exception as e:
        print(e)
        break
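For the duplicated articles (your third problem), a likely cause, though this is an assumption about the markup, is that each result inside #ie-infinite-scroll is wrapped in more than one anchor (e.g. thumbnail and headline), so the same URL appears repeatedly in links. Deduplicating the hrefs before requesting them avoids scraping the same article twice:

# Collect hrefs and drop duplicates while preserving order
# (assumes the repeats are literally the same URL).
hrefs = [a.get('href') for a in search_results.find_all('a') if a.get('href')]
unique_urls = list(dict.fromkeys(hrefs))

for link_url in unique_urls:
    response = requests.get(link_url)
    # ...parse the article exactly as in your loop above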
I'm trying to use the following script, but instead of typing in the URL, I want it to loop through the URLs pulled from a Links.CSV file. Eventually I want to export all the results into a new CSV file.
import csv
import requests
from BeautifulSoup import BeautifulSoup
from urllib import urlopen
url = (LINK)
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)
Title = soup.find(id="productTitle")
Price = soup.find(id="priceblock_ourprice")
print Title.text, Price.text
Any help would be appreciated.
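A minimal sketch of that loop in Python 3 with bs4 (rather than the old BeautifulSoup module), assuming Links.csv has one URL per row in its first column; the output file name and its column headers below are made up for illustration:

import csv
import requests
from bs4 import BeautifulSoup

results = []
with open('Links.csv', newline='') as f:
    for row in csv.reader(f):
        if not row:
            continue
        url = row[0]  # assumes the URL is in the first column
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        soup = BeautifulSoup(response.content, 'html.parser')

        title = soup.find(id="productTitle")
        price = soup.find(id="priceblock_ourprice")
        results.append([
            url,
            title.get_text(strip=True) if title else '',
            price.get_text(strip=True) if price else '',
        ])

with open('Results.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['url', 'title', 'price'])  # hypothetical header names
    writer.writerows(results)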
After a lot of struggle trying to crawl an AngularJS page behind single sign-on, I have put together this code. The code runs fine: it logs in, opens the desired page, and scrapes it, but I am not getting all the links and text present on the site loaded by Angular, even though my XPath seems to be correct.
It is also not crawling the links that do get extracted. What do I need to change in my code to extract all of the text on the site and on the subsequent pages?
import scrapy
from scrapy import signals
from scrapy.http import TextResponse
from scrapy.xlib.pydispatch import dispatcher
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from ps_crawler.items import PsCrawlerItem
import time
from selenium.webdriver.common.keys import Keys


class SISSpider(scrapy.Spider):
    name = "SIS"
    allowed_domains = ["domain.com"]
    start_urls = ["https://domain.com/login?"]

    def __init__(self):
        self.driver = webdriver.Chrome()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        self.driver.close()

    def parse(self, response):
        # selenium part of the job
        self.driver.get("https://domain.com/login?")
        time.sleep(5)
        self.driver.find_element_by_xpath('//*[@id="Login"]/div[2]/div[1]/div[2]/form/div[1]/input').send_keys("ssasdad")
        self.driver.find_element_by_xpath('//*[@id="Login"]/div[2]/div[1]/div[2]/form/div[2]/input').send_keys("")
        # self.driver.find_element_by_xpath('//*[@id="login"]').click()
        more_btn = WebDriverWait(self.driver, 10).until(
            EC.visibility_of_element_located((By.XPATH, '//*[@id="login"]'))
        )
        time.sleep(5)
        more_btn.click()
        time.sleep(5)

        self.driver.execute_script("window.open('https://domain.com/#/admin','_blank');")
        time.sleep(10)
        window_now = self.driver.window_handles[1]
        self.driver.switch_to_window(window_now)

        ## stop when we reach the desired page
        # if self.driver.current_url.endswith('page=20'):
        #     break

        # now scrapy should do the job
        time.sleep(10)
        response = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        time.sleep(10)
        for post in response.xpath('//div'):
            item = PsCrawlerItem()
            print post.xpath('a/span/text()').extract(), post.xpath('a/@href').extract(), post.xpath('a/@ng-href').extract()
You just need to tweak your XPath a little, as follows. Hope this solves the problem.
for post in response.xpath('//body'):
    print post.xpath('//text()').extract(), post.xpath('//a//@href').extract()
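On the second part of the question, the extracted links are never crawled because parse() never visits them. A rough sketch of one way to follow them with the same driver, placed inside parse() after the TextResponse is built (it assumes the href/ng-href values resolve against the current URL and that a fixed sleep is enough for each Angular view to render):

hrefs = response.xpath('//a/@href').extract() + response.xpath('//a/@ng-href').extract()
seen = set()
for href in hrefs:
    url = response.urljoin(href)   # make relative links absolute
    if url in seen or not url.startswith('http'):
        continue
    seen.add(url)
    self.driver.get(url)
    time.sleep(5)                  # crude wait for the Angular view to render
    sub_response = TextResponse(url=url, body=self.driver.page_source, encoding='utf-8')
    print(sub_response.xpath('//text()').extract())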