OK, I am having tons of problems getting code that works on my dev server to work on the production server :). I have a task that goes through and requests URLs, then collects and updates data. It takes 30 minutes to run.
I uploaded it to the production server, and when I go to the URL with its corresponding .py script (appname.appspot.com/tasks/rrs), after 30 seconds I get google.appengine.runtime.DeadlineExceededError. Is there any way to get around this? Is this a 30-second deadline per request? This script works fine on the development server: I go to the URL and the associated .py script runs to completion.
import time
import random
import string
import cPickle
from StringIO import StringIO
try:
    import json
except ImportError:
    import simplejson as json
import urllib
import pprint
import datetime
import sys
sys.path.append("C:\Program Files (x86)\Google\google_appengine")
sys.path.append("C:\Program Files (x86)\Google\google_appengine\lib\yaml\lib")
sys.path.append("C:\Program Files (x86)\Google\google_appengine\lib\webob")
from google.appengine.api import users
from google.appengine.ext import webapp
from google.appengine.ext.webapp.util import run_wsgi_app
from google.appengine.ext import db
class SR(db.Model):
    name = db.StringProperty()
    title = db.StringProperty()
    url = db.StringProperty()
##request url and returns JSON_data
def overview(page):
    u = urllib.urlopen(page)
    bytes = StringIO(u.read())
    ##print bytes
    u.close()
    try:
        JSON_data = json.load(bytes)
        return JSON_data
    except ValueError, e:
        print e, " Couldn't get .json for %s" % page
        return None
##specific code to parse particular JSON data and append new SR objects to the given list
def parse_json(JSON_data, lists):
    sr = SR()
    sr.name = None   ## data gathered from JSON_data (extraction omitted)
    sr.title = None  ## data gathered
    sr.url = None    ## data gathered
    lists.append(sr)
    return lists
## I want to be able to request, let's say, 500 pages without timing out
page = 'someurlpage.com'  ## starting url
url_list = []
for z in range(0, 500):
    page = 'someurlpage.com/%s' % z
    JSON_data = overview(page)  ## get json data for a given url page
    url_list = parse_json(JSON_data, url_list)  ## parse the json data and append SR objects to the list
db.put(url_list)  ## finally add the objects to the gae datastore
Yes, App Engine imposes a 30-second deadline on requests, and you can't make a request run for a longer period. One way around it might be a try/except on DeadlineExceededError, putting the rest of the work on a task queue.
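A minimal sketch of that checkpoint-and-requeue pattern, reusing the overview and parse_json helpers from the question (the /tasks/rrs URL is the one from the question; the handler name and the start parameter are just assumptions for illustration):
from google.appengine.api import taskqueue
from google.appengine.runtime import DeadlineExceededError

class RrsHandler(webapp.RequestHandler):
    def get(self):
        # Resume from wherever the previous request left off.
        z = int(self.request.get('start', '0'))
        try:
            while z < 500:
                JSON_data = overview('someurlpage.com/%s' % z)
                db.put(parse_json(JSON_data, []))
                z += 1
        except DeadlineExceededError:
            # There is a short grace period after the exception is raised:
            # use it to re-enqueue the remaining pages as a new task.
            taskqueue.add(url='/tasks/rrs', params={'start': str(z)}, method='GET')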
You can also try the bulkupdate library.
Example:
import bulkupdate  # the third-party bulkupdate library mentioned above

class Todo(db.Model):
    page = db.StringProperty()

class BulkPageParser(bulkupdate.BulkUpdater):
    def get_query(self):
        return Todo.all()

    def handle_entity(self, entity):
        JSON_data = overview(entity.page)
        db.put(parse_json(JSON_data, []))
        entity.delete()

# Put this in your view code:
for i in range(500):
    Todo(page='someurlpage.com/%s' % i).put()
job = BulkPageParser()
job.start()
OK, so if I am dynamically adding links as I parse the pages, I believe I would add them to the Todo queue like so:
def handle_entity(self, entity):
    JSON_data = overview(entity.page)
    ## like earlier, parse_json returns the list of SR objects, and now also a list of new links/pages to visit
    data_gathered, new_links = parse_json(JSON_data, [])
    db.put(data_gathered)
    for link in new_links:
        Todo(page=link).put()
    entity.delete()
I have a really simple web app. All the important stuff happens in index.py:
from google.appengine.api import users
import webapp2
import os
import jinja2
JINJA_ENVIRONMENT = jinja2.Environment(
    loader=jinja2.FileSystemLoader(os.path.dirname(__file__)),
    extensions=['jinja2.ext.autoescape'],
    autoescape=True)

def get_user():
    user = {}
    user['email'] = str(users.get_current_user())
    user['name'], user['domain'] = user['email'].split('@')
    user['logout_link'] = users.create_logout_url('/')
    return user
class BaseHandler(webapp2.RequestHandler):
    def dispatch(self):
        user = get_user()
        template_values = {'user': user}
        if user['domain'] != 'foo.com':
            template_values['page_title'] = 'Access Denied'
            template = '403'
        else:
            template_values['page_title'] = 'Home'
            template = 'index'
        template_engine = JINJA_ENVIRONMENT.get_template('%s.html' % template)
        self.response.write(template_engine.render(template_values))

app = webapp2.WSGIApplication([
    ('/', BaseHandler),
], debug=True)
I'm trying to be a good person and write some local unit tests, but after looking at the documentation I am totally out of my depth. All I want is a basic framework where I can do something like:
python test_security.py
and simulate two users hitting the domain: one @foo.com user who should get the index template, and one @bar.com user who should get the 403 template.
Here's where I've got so far:
import sys
# I don't want to talk about it, let's just ignore this block
sys.path.append(r'C:\Program Files (x86)\Google\google_appengine\lib\webapp2-2.5.2')
sys.path.append(r'C:\Program Files (x86)\Google\google_appengine\lib\webob-1.2.3')
sys.path.append(r'C:\Program Files (x86)\Google\google_appengine\lib\jinja2-2.6')
sys.path.append(r'C:\Program Files (x86)\Google\google_appengine\lib\yaml-3.10')
sys.path.append(r'C:\Program Files (x86)\Google\google_appengine')
sys.path.append(r'C:\pytest')
# A few proper imports
import unittest
import webapp2
from google.appengine.ext import testbed
# Import the module I'd like to test
import index
class TestHandlers(unittest.TestCase):
    def test_hello(self):
        self.testbed = testbed.Testbed()
        self.testbed.init_user_stub()
        self.testbed.setup_env(USER_EMAIL='test@foo.com', USER_ID='1', USER_IS_ADMIN='0')
        request = webapp2.Request.blank('/')
        response = request.get_response(main.app)
        print "running test"
        self.assertEqual(response.status_int, 200)
        self.assertEqual(response.body, 'Hello, world!')
Predictably, this doesn't work at all. What am I missing? Am I just wildly overestimating how simple this should be?
If you're planning on invoking this with "python test_security.py", the magic words you are looking for are:
if __name__ == '__main__':
    unittest.main()
This will make your unit test run - at the moment all you're doing is defining it.
Note also that you'll need to change your request.get_response from "main.app" to "index.app".
I suspect (primarily based on the function names) that you should call self.testbed.init_user_stub() before calling self.testbed.setup_env(), not after.
Also you seem to be missing a testbed.activate() call after creating the Testbed.
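Putting those pieces together, a minimal sketch of the test might look like this (assuming the app module is named index and sits next to the test; what you can assert about the response body depends on your templates):
import unittest
import webapp2
from google.appengine.ext import testbed

import index

class TestHandlers(unittest.TestCase):
    def setUp(self):
        self.testbed = testbed.Testbed()
        self.testbed.activate()
        self.testbed.init_user_stub()

    def tearDown(self):
        self.testbed.deactivate()

    def login(self, email):
        # overwrite=True lets each test set its own logged-in user.
        self.testbed.setup_env(USER_EMAIL=email, USER_ID='1',
                               USER_IS_ADMIN='0', overwrite=True)

    def test_foo_user_gets_home(self):
        self.login('test@foo.com')
        response = webapp2.Request.blank('/').get_response(index.app)
        self.assertEqual(response.status_int, 200)

if __name__ == '__main__':
    unittest.main()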
You might want to check out this answer: https://stackoverflow.com/a/21139805/4495081
I'm trying to use the following script, but instead of typing in the URL I want it to loop over and pull the URLs from a Links.CSV file. Eventually I want to export all the results into a new CSV file.
import csv
import requests
from BeautifulSoup import BeautifulSoup
from urllib import urlopen
url = (LINK)  ## placeholder: currently a single URL typed in by hand
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)
Title = soup.find(id="productTitle")
Price = soup.find(id="priceblock_ourprice")
print Title.text, Price.text
Any help would be appreciated.
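A minimal sketch of that loop might look like this (it assumes Links.CSV has one URL per row in the first column; results.csv is just an example output name):
import csv
import requests
from BeautifulSoup import BeautifulSoup

rows = []
with open('Links.CSV', 'rb') as f:  # 'rb' mode for the Python 2 csv module
    for row in csv.reader(f):
        url = row[0]
        soup = BeautifulSoup(requests.get(url).content)
        title = soup.find(id="productTitle")
        price = soup.find(id="priceblock_ourprice")
        # Guard against pages where either element is missing.
        rows.append([url,
                     title.text if title else '',
                     price.text if price else ''])

with open('results.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(['url', 'title', 'price'])
    writer.writerows(rows)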
I am trying to create a simple web application using Google App Engine. I use jinja2 to render a frontend HTML file. The user enters their AWS credentials and gets back the regions and the virtual machines connected with them.
I have a controller file, into which I import a model file, and it looks like this:
import webapp2
import jinja2
import os
import model

jinja_environment = jinja2.Environment(loader=jinja2.FileSystemLoader(os.path.dirname(__file__)))

class MainPage(webapp2.RequestHandler):
    def get(self):
        template = jinja_environment.get_template('index.html')
        self.response.out.write(template.render())

    def request_a(self):
        a = self.request.get('a')
        return a

    def request_b(self):
        b = self.request.get('b')
        return b

class Responce(webapp2.RequestHandler):
    def get(self):
        self.response.headers['Content-Type'] = 'text/plain'
        self.response.write(model.me.get_machines())

app = webapp2.WSGIApplication([('/2', MainPage), ('/responce', Responce)], debug=True)
Then I have a model file, into which I import the controller file, and it looks like this:
import boto.ec2
import controller
import sys

if not boto.config.has_section('Boto'):
    boto.config.add_section('Boto')
    boto.config.set('Boto', 'https_validate_certificates', 'False')

a = controller.MainPage.get()
b = controller.MainPage.get()

class VM(object):
    def __init__(self, a, b):
        self.a = a
        self.b = b
        self.regions = boto.ec2.regions(aws_access_key_id=a, aws_secret_access_key=b)

    def get_machines(self):
        self.region_to_instance = {}  # dictionary which matches regions and the instances for each region
        for region in self.regions:
            conn = region.connect(aws_access_key_id=self.a, aws_secret_access_key=self.b)
            reservations = conn.get_all_instances()  # get reservations (need to understand them better)
            if len(reservations) > 0:  # if there are reservations in this region
                # create a list of instances for that region
                self.instances = [i for r in reservations for i in r.instances]
                self.region_to_instance[region.name] = self.instances
        return self.region_to_instance

me = VM(a, b)
me.get_machines()
When I run this, it throws an error: type object 'MainPage' has no attribute 'request_a'.
I assume that this happens because I do not call an instance of the MainPage class and instead call the class itself. What is an instance of the MainPage class (and its parent webapp2.RequestHandler)? How do I call it inside another module?
Your code looks very strange to me; I don't follow your coding practice.
The general answer is: if you'd like to use methods of your MainPage, you can use inheritance.
But if I understand the goal of your code, why not call boto from your Responce class? Note, though, that you use a get where you should use a post, because you are posting a form with AWS credentials.
So I suggest:
create a MainPage with get and post methods to handle the form
in the post method, make the boto requests and send the result to the user with jinja (see the sketch after the link below)
See also Getting started with GAE Python27 and handling forms:
https://developers.google.com/appengine/docs/python/gettingstartedpython27/handlingforms?hl=nl
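A hedged sketch of that shape (the form field names a and b are carried over from the question; it assumes model.py keeps the VM class but no longer imports controller or builds me at import time):
import os
import webapp2
import jinja2
import model  # assumption: model.py defines VM and does not import controller

jinja_environment = jinja2.Environment(
    loader=jinja2.FileSystemLoader(os.path.dirname(__file__)))

class MainPage(webapp2.RequestHandler):
    def get(self):
        # Show the AWS credentials form.
        template = jinja_environment.get_template('index.html')
        self.response.out.write(template.render())

    def post(self):
        # Read the credentials from the submitted form.
        a = self.request.get('a')
        b = self.request.get('b')
        # Build the VM object per request instead of at import time.
        vm = model.VM(a, b)
        self.response.headers['Content-Type'] = 'text/plain'
        self.response.write(vm.get_machines())

app = webapp2.WSGIApplication([('/2', MainPage)], debug=True)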
I'd like to be able to load a blob (an image) into the Python Imaging Library (PIL), or into a numpy array for analysis (such as mean, median, standard deviation), without using the serving url.
Here is my image model; the t_imageUrl property contains the serving url for the blob:
from google.appengine.ext import db, blobstore

class ImageModel(db.Model):
    t_image = blobstore.BlobReferenceProperty(required=True)
    t_imageUrl = db.StringProperty(required=True)
Here is a segment of what I tried:
import numpy as np
import Image
import ImageOps

class ImageAnalysisHandler(BaseHandler):
    def get(self, imageModel_id):
        if self.user:
            i = ImageModel.get_by_id(int(imageModel_id))
            OpenedImage = Image.open(i.t_image)
            self.render('imageAnalysis.html', imageD=i)
        else:
            self.redirect('login')
This obviously didn't work, since the Image module (from the Python Imaging Library) doesn't know how to read blobs. I was wondering if anyone knew how to accurately read a blob into PIL or a numpy array.
Take a look at the BlobReader class. It lets you read a file stored in the blobstore with a file-like interface.
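Applied to the handler above, a minimal sketch might look like this (t_image comes from the question's ImageModel; the statistics at the end are just examples):
import numpy as np
import Image  # PIL
from google.appengine.ext import blobstore

def analyze_blob_image(image_model):
    # BlobReader gives a file-like view of the blob, no serving url needed.
    reader = blobstore.BlobReader(image_model.t_image.key())
    img = Image.open(reader)
    arr = np.asarray(img)  # numpy array of pixel values
    return arr.mean(), np.median(arr), arr.std()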
After a lot of struggle trying to crawl an AngularJS page behind single sign-on, I have put together this code. The code runs fine: it logs in, opens the desired page and scrapes it, but I am not getting all the links and text present in the website loaded by Angular. My xpath seems to be correct.
Also, it is not crawling the links that are getting extracted. What do I need to change in my code to extract all text present in the website and the subsequent webpages?
import scrapy
from scrapy import signals
from scrapy.http import TextResponse
from scrapy.xlib.pydispatch import dispatcher
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from ps_crawler.items import PsCrawlerItem
import time
from selenium.webdriver.common.keys import Keys

class SISSpider(scrapy.Spider):
    name = "SIS"
    allowed_domains = ["domain.com"]
    start_urls = ["https://domain.com/login?"]

    def __init__(self):
        self.driver = webdriver.Chrome()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        self.driver.close()

    def parse(self, response):
        # selenium part of the job
        self.driver.get("https://domain.com/login?")
        time.sleep(5)
        self.driver.find_element_by_xpath('//*[@id="Login"]/div[2]/div[1]/div[2]/form/div[1]/input').send_keys("ssasdad")
        self.driver.find_element_by_xpath('//*[@id="Login"]/div[2]/div[1]/div[2]/form/div[2]/input').send_keys("")
        #self.driver.find_element_by_xpath('//*[@id="login"]').click()
        more_btn = WebDriverWait(self.driver, 10).until(
            EC.visibility_of_element_located((By.XPATH, '//*[@id="login"]'))
        )
        time.sleep(5)
        more_btn.click()
        time.sleep(5)
        self.driver.execute_script("window.open('https://domain.com/#/admin', '_blank');")
        time.sleep(10)
        window_now = self.driver.window_handles[1]
        self.driver.switch_to_window(window_now)
        ## stop when we reach the desired page
        #if self.driver.current_url.endswith('page=20'):
        #    break
        # now scrapy should do the job
        time.sleep(10)
        response = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        time.sleep(10)
        for post in response.xpath('//div'):
            item = PsCrawlerItem()
            print post.xpath('a/span/text()').extract(), post.xpath('a/@href').extract(), post.xpath('a/@ng-href').extract()
You just need to tweak your xpath a little bit as follows. Hope this solves the problem.
for post in response.xpath('//body'):
    print post.xpath('//text()').extract(), post.xpath('//a//@href').extract()
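For the second part of the question (the extracted links not being crawled), one hedged option is to yield new scrapy requests for each href pulled out of the selenium-rendered response; whether each target page also needs the selenium login step is an open assumption:
# inside parse(), after building the TextResponse:
for href in response.xpath('//a//@href').extract():
    # Follow each extracted link; urljoin resolves relative urls.
    yield scrapy.Request(response.urljoin(href), callback=self.parse)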