Find() takes no keyword arguments # web scraping - webscarab

Please help me find the error, as I didn't understand the for loop correctly:
from bs4 import BeautifulSoup
import requests
import pandas as pd
url = 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'
response = requests.get(url)
with open("imdb_top_250_movies.html", mode='wb') as file:
file.write(response.content)
soup = BeautifulSoup(response.content, 'lxml')
df_list = []
for movie in soup:
title = movie.find('td' , class_="titleColumn").find('a').contents[0]
year = movie.find('td' , class_="titleColumn").find('span').contents[0][1:-1]
user_rating = movie.find('td' , class_="ratingColumn imdbRating").find('strong').contents[0]
df_list.append({'title': title,
'year': int(year),
'user_ratings': float(user_rating)})
df = pd.DataFrame(df_list, columns = ['title', 'year', 'user_ratings'])
df
This is the error I got
TypeError                                 Traceback (most recent call last)
Input In [125], in <cell line: 8>()
      9 soup = BeautifulSoup(response.content, 'lxml')
     10 df_list = []
---> 11 title = movie.find('td' , class_="titleColumn").find('a').contents[0]
     12 year = soup.find('td' , class_="titleColumn").find('span').contents[0][1:-1]
     13 user_rating = soup.find('td' , class_="ratingColumn imdbRating").find('strong').contents[0]

TypeError: find() takes no keyword arguments

Someone helped me with this answer, as I had written the for loop incorrectly:
from bs4 import BeautifulSoup
import requests
import pandas as pd
url = 'https://www.imdb.com/chart/top'
response = requests.get(url)
with open("imdb_top_250_movies.html", mode='wb') as file:
file.write(response.content)
soup = BeautifulSoup(response.content, 'lxml')
df_list = []
for movie in soup.find('tbody' , class_="lister-list").find_all('tr'):
Place = movie.find('td' , class_="titleColumn").contents[0][1:-len('.\n ')]
title = movie.find('td' , class_="titleColumn").find('a').contents[0]
year = movie.find('td' , class_="titleColumn").find('span').contents[0][1:-1]
user_rating = movie.find('td' , class_="ratingColumn imdbRating").find('strong').contents[0]
df_list.append({'place': Place,
'title': title,
'year': int(year),
'user_ratings': float(user_rating)})
df = pd.DataFrame(df_list, columns = ['place','title', 'year', 'user_ratings'])
df.style.hide(axis='index')
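For context on why the original loop failed: iterating over the soup object directly (for movie in soup:) walks its top-level children, and the first of those is the page's Doctype, a NavigableString/str subclass whose .find is the plain str.find; str.find rejects the class_= keyword argument, which is exactly the TypeError above. Looping over the rows of the ranking table, as in the corrected code, yields only Tag objects. A minimal standalone sketch of the difference (not the IMDb page):

from bs4 import BeautifulSoup

demo = BeautifulSoup("<!DOCTYPE html><html><body><p>x</p></body></html>", "lxml")
for node in demo:
    # direct children of the soup: a Doctype (a str subclass), then the <html> Tag;
    # str.find() takes no keyword arguments, hence the TypeError above
    print(type(node).__name__)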

Related

How to use a TensorFlow AI-trained text model in React

I have created a simple chatbot model in Python from a video tutorial.
Now I have read that I can use this model in React with the TensorFlow.js library, but I can't get it to run. I searched around for a while but couldn't find a real working example.
1. First, the code for creating (training) the model:
training.py
import random
import json
import pickle
import numpy as np
import nltk
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
import tensorflowjs as tfjs
lemmatizer = WordNetLemmatizer()
intents = json.loads(open("./model/Chatbot/intents.json").read())
words = []
classes = []
documents = []
ignore_letters = ["?", "!", ".", ","]
for intent in intents["intents"]:
    for pattern in intent["patterns"]:
        word_list = nltk.word_tokenize(pattern)
        words.extend(word_list)
        documents.append((word_list, intent["tag"]))
        if intent["tag"] not in classes:
            classes.append(intent["tag"])
words = [lemmatizer.lemmatize(word) for word in words if word not in ignore_letters]
words = sorted(set(words))
classes = sorted(set(classes))
pickle.dump(words, open("./model/Chatbot/words.pkl", "wb"))
pickle.dump(classes, open("./model/Chatbot/classes.pkl", "wb"))
training = []
output_empty = [0] * len(classes)
for document in documents:
    bag = []
    word_patterns = document[0]
    word_patterns = [lemmatizer.lemmatize(word.lower()) for word in word_patterns]
    for word in words:
        bag.append(1) if word in word_patterns else bag.append(0)
    output_row = list(output_empty)
    output_row[classes.index(document[1])] = 1
    training.append([bag, output_row])
random.shuffle(training)
training = np.array(training)
train_x = list(training[:, 0])
train_y = list(training[:, 1])
model = Sequential()
model.add(Dense(128, input_shape=(len(train_x[0]),), activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]), activation="softmax"))
sgd = SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
model.compile(loss="categorical_crossentropy", optimizer=sgd, metrics=["accuracy"])
hist = model.fit(np.array(train_x), np.array(train_y), epochs=1000, batch_size=5, verbose=1)
model.save("./model/Chatbot/chatbotmodel.h5", hist)
tfjs.converters.save_keras_model(model, "./model/Chatbot/")
print("Done")
In the second-to-last line the model is exported to model.json and one group1-shard1of1.bin file.
intents.json (example):
{
  "intents": [
    {
      "tag": "greeting",
      "patterns": [
        "Hey",
        "Hola",
        "Hello",
        "Hi",
        "Ist da jemand?",
        "Hallo",
        "Guten Tag",
        "Hey",
        "Moin"
      ],
      "responses": [
        "Hallo, schön das du hier bist",
        "Schoen dich wiederzusehen",
        "Hallo, wie kann ich helfen?"
      ],
      "context_set": "greeting"
    }
  ]
}
In Python I can now run chatbot.py, which works:
import random
import json
import pickle
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from tensorflow import keras
from keras.models import load_model
lemmatizer = WordNetLemmatizer()
intents = json.loads(open("./model/Chatbot/intents.json").read())
words = pickle.load(open("./model/Chatbot/words.pkl", "rb"))
classes = pickle.load(open("./model/Chatbot/classes.pkl", "rb"), fix_imports=True, encoding="ASCII")
model = load_model("./model/Chatbot/chatbotmodel.h5")
context = ""
def clean_up_sentence(sentence):
    sentence_words = nltk.word_tokenize(sentence)
    sentence_words = [lemmatizer.lemmatize(word) for word in sentence_words]
    return sentence_words

def bag_of_words(sentence):
    sentence_words = clean_up_sentence(sentence)
    bag = [0] * len(words)
    for w in sentence_words:
        for i, word in enumerate(words):
            if word == w:
                bag[i] = 1
    return np.array(bag)

def predict_class(sentence):
    bow = bag_of_words(sentence)  # [0 0 0 0 0 0 0 0 0]?
    print(np.array([bow]))
    res = model.predict(np.array([bow]))[0]  # [8.58373418e-02 3.18233818e-02 9.12701711e-02 3.93254980e-02...
    print(res)
    ERROR_TRESHOLD = 0.25
    results = [[i, r] for i, r in enumerate(res) if r > ERROR_TRESHOLD]  # Hallo => [[21, 0.35744026]]
    results.sort(key=lambda x: x[1], reverse=True)  # moin => [[21, 0.35744026]]
    return_list = []
    for r in results:
        return_list.append({"intent": classes[r[0]], "probability": str(r[1])})
    return return_list  # hallo [{'intent': 'greeting', 'probability': '0.35744026'}]

def get_response(intents_list, intents_json):
    tag = intents_list[0]["intent"]  # hallo [{'intent': 'greeting', 'probability': '0.35744026'}] ===> 'greeting'
    list_of_intents = intents_json["intents"]  # ==> all intents from the file
    print(intents_list)
    for i in list_of_intents:
        if "context_set" in i:
            context = i["context_set"]
            print(context)
        if i["tag"] == tag:
            result = random.choice(i["responses"])
            break
    return result

print("Go! Bot is running")
while True:
    message = input("")
    ints = predict_class(message)  # hallo [{'intent': 'greeting', 'probability': '0.35744026'}]
    res = get_response(ints, intents)
    print(res)
2. Trying to get it to run in React:
import { useEffect, useState } from 'react';
import * as tf from '@tensorflow/tfjs';

const url = {
  model: 'https://example.com/model.json',
};

function App() {
  async function loadModel(url) {
    try {
      let message = "Hallo";
      //const inputTensor = tf.tensor([parseInt(message)]);
      const model = await tf.loadLayersModel(url.model);
      setModel(model);
      let result = model.predict(message); // make prediction like in Python
      //let bow = bag_of_words(message) // [0 0 0 0 0 0 0 0 0]?
    }
    catch (err) {
      console.log(err);
    }
  }
  useEffect(() => {
    tf.ready().then(() => {
      loadModel(url)
    });
  }, [])
}
At this point, model.json and group1-shard1of1.bin are both imported correctly, but when I try model.predict('hallo') I get the following error:
Error when checking model: the Array of Tensors that you are passing to your model is not the size the model expected. Expected to see 1 Tensor(s), but instead got 5 Tensors(s).
Maybe you have an idea how to solve it? Thanks.
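One hedged reading of the error: the exported Keras model expects a single input of shape [1, len(words)] (the bag-of-words vector that chatbot.py builds), so passing the raw string "hallo" makes tfjs treat the characters as several tensors. A minimal Python-side sketch of the shape the model expects (the vocabulary below is made up; the real one comes from words.pkl):

import numpy as np

words = ["guten", "hallo", "hey", "hi", "tag"]  # hypothetical vocabulary (really loaded from words.pkl)

def bag_of_words(sentence, words):
    # Same idea as in chatbot.py: one row, one 0/1 entry per vocabulary word.
    tokens = sentence.lower().split()
    return np.array([[1 if w in tokens else 0 for w in words]], dtype=np.float32)

x = bag_of_words("Hallo", words)
print(x.shape)  # (1, 5) -- the tfjs predict() call would need a tensor of this shape, not a string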

Using AWS Lambda functions in Amazon Neptune

I created an AWS Lambda function using the example code in the AWS Neptune documentation. I followed the documentation properly and set all the necessary Lambda environment variables as well. But when I tested the function, it raised an error. Could you please help me solve this issue? I wonder why the code in the AWS Neptune documentation does not work.
code:
import os, sys, backoff, math
from random import randint
from gremlin_python import statics
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.driver.protocol import GremlinServerError
from gremlin_python.driver import serializer
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.strategies import *
from gremlin_python.process.traversal import T
from tornado.websocket import WebSocketClosedError
from tornado import httpclient
from botocore.auth import SigV4Auth
from botocore.awsrequest import AWSRequest
from botocore.credentials import ReadOnlyCredentials
from types import SimpleNamespace
reconnectable_err_msgs = [
    'ReadOnlyViolationException',
    'Server disconnected',
    'Connection refused'
]
retriable_err_msgs = ['ConcurrentModificationException'] + reconnectable_err_msgs
network_errors = [WebSocketClosedError, OSError]
retriable_errors = [GremlinServerError] + network_errors
def prepare_iamdb_request(database_url):
    service = 'neptune-db'
    method = 'GET'
    access_key = os.environ['AWS_ACCESS_KEY_ID']
    secret_key = os.environ['AWS_SECRET_ACCESS_KEY']
    region = os.environ['AWS_REGION']
    session_token = os.environ['AWS_SESSION_TOKEN']
    creds = SimpleNamespace(
        access_key=access_key, secret_key=secret_key, token=session_token, region=region,
    )
    request = AWSRequest(method=method, url=database_url, data=None)
    SigV4Auth(creds, service, region).add_auth(request)
    return httpclient.HTTPRequest(database_url, headers=request.headers.items())

def is_retriable_error(e):
    is_retriable = False
    err_msg = str(e)
    if isinstance(e, tuple(network_errors)):
        is_retriable = True
    else:
        is_retriable = any(retriable_err_msg in err_msg for retriable_err_msg in retriable_err_msgs)
    print('error: [{}] {}'.format(type(e), err_msg))
    print('is_retriable: {}'.format(is_retriable))
    return is_retriable

def is_non_retriable_error(e):
    return not is_retriable_error(e)

def reset_connection_if_connection_issue(params):
    is_reconnectable = False
    e = sys.exc_info()[1]
    err_msg = str(e)
    if isinstance(e, tuple(network_errors)):
        is_reconnectable = True
    else:
        is_reconnectable = any(reconnectable_err_msg in err_msg for reconnectable_err_msg in reconnectable_err_msgs)
    print('is_reconnectable: {}'.format(is_reconnectable))
    if is_reconnectable:
        global conn
        global g
        conn.close()
        conn = create_remote_connection()
        g = create_graph_traversal_source(conn)
@backoff.on_exception(backoff.constant,
                      tuple(retriable_errors),
                      max_tries=5,
                      jitter=None,
                      giveup=is_non_retriable_error,
                      on_backoff=reset_connection_if_connection_issue,
                      interval=1)
def query(**kwargs):
    id = kwargs['id']
    return (g.V(id)
            .fold()
            .coalesce(
                __.unfold(),
                __.addV('User').property(T.id, id)
            )
            .id().next())

def doQuery(event):
    return query(id=str(randint(0, 10000)))

def lambda_handler(event, context):
    return doQuery(event)

def create_graph_traversal_source(conn):
    return traversal().withRemote(conn)

def create_remote_connection():
    print('Creating remote connection')
    return DriverRemoteConnection(
        connection_string(),
        'g',
        pool_size=1,
        message_serializer=serializer.GraphSONSerializersV2d0())

def connection_string():
    database_url = 'wss://{}:{}/gremlin'.format(os.environ['neptuneEndpoint'], os.environ['neptunePort'])
    if 'USE_IAM' in os.environ and os.environ['USE_IAM'] == 'true':
        return prepare_iamdb_request(database_url)
    else:
        return database_url

conn = create_remote_connection()
g = create_graph_traversal_source(conn)
error message:
{
  "errorMessage": "'GraphTraversal' object is not callable",
  "errorType": "TypeError",
  "requestId": "69e6ecd3-1291-4d21-a8fa-1fc910525fc1",
  "stackTrace": [
    "  File \"/var/task/lambda_function.py\", line 111, in lambda_handler\n    return doQuery(event)\n",
    "  File \"/var/task/lambda_function.py\", line 108, in doQuery\n    return query(id=str(randint(0, 10000)))\n",
    "  File \"/var/task/backoff/_sync.py\", line 105, in retry\n    ret = target(*args, **kwargs)\n",
    "  File \"/var/task/lambda_function.py\", line 99, in query\n    return (g.V(id)\n"
  ]
}
log output:
LOGS Name: cloudwatch_lambda_agent State: Subscribed Types: [Platform]
Creating remote connection
EXTENSION Name: cloudwatch_lambda_agent State: Ready Events: [INVOKE,SHUTDOWN]
START RequestId: 69e6ecd3-1291-4d21-a8fa-1fc910525fc1 Version: $LATEST
[ERROR] TypeError: 'GraphTraversal' object is not callable
Traceback (most recent call last):
  File "/var/task/lambda_function.py", line 111, in lambda_handler
    return doQuery(event)
  File "/var/task/lambda_function.py", line 108, in doQuery
    return query(id=str(randint(0, 10000)))
  File "/var/task/backoff/_sync.py", line 105, in retry
    ret = target(*args, **kwargs)
  File "/var/task/lambda_function.py", line 99, in query
    return (g.V(id)
END RequestId: 69e6ecd3-1291-4d21-a8fa-1fc910525fc1
REPORT RequestId: 69e6ecd3-1291-4d21-a8fa-1fc910525fc1 Duration: 108.64 ms Billed Duration: 109 ms Memory Size: 128 MB Max Memory Used: 88 MB Init Duration: 1025.72 ms
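For orientation only (not a fix): the backoff/_sync.py frame in the stack trace is just the retry wrapper that the @backoff.on_exception decorator puts around query(), so the exception is being raised inside the g.V(id) traversal itself and then re-surfaced by the wrapper. A toy sketch of that mechanism, under the assumption that backoff is installed:

import backoff

# Illustration of the decorator's retry wrapper (names here are made up):
# the wrapped function is called as target(*args, **kwargs) inside retry(),
# which is why backoff/_sync.py appears between the caller and query() above.
@backoff.on_exception(backoff.constant, ValueError, max_tries=3, jitter=None, interval=1)
def flaky(x):
    if x < 0:
        raise ValueError("negative")
    return x

print(flaky(5))  # retried up to 3 times on ValueError, then the error is re-raised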

Inserting data from CSV too slow in Django

My problem is that inserting data into the database is too slow. I have written a correct algorithm that loads the data properly; however, with this method it would take 700 hours to load the database. There are almost 30 million records in the CSV file.
Here is my models.py
from django.db import models

class Region(models.Model):
    region = models.CharField(max_length=20)

class Rank(models.Model):
    rank = models.IntegerField()

class Chart(models.Model):
    chart = models.CharField(max_length=8)

class Artist(models.Model):
    artist = models.CharField(max_length=60)

class Title(models.Model):
    title = models.CharField(max_length=60)

class ArtistTitle(models.Model):
    artist = models.ForeignKey(Artist, on_delete=models.CASCADE)
    title = models.ForeignKey(Title, on_delete=models.CASCADE)

class SpotifyData(models.Model):
    title = models.ForeignKey(ArtistTitle, related_name='re_title', on_delete=models.CASCADE)
    rank = models.ForeignKey(Rank, on_delete=models.CASCADE)
    date = models.DateField()
    artist = models.ForeignKey(ArtistTitle, related_name='re_artist', on_delete=models.CASCADE)
    region = models.ForeignKey(Region, on_delete=models.CASCADE)
    chart = models.ForeignKey(Chart, on_delete=models.CASCADE)
    streams = models.IntegerField()
My upload script looks like this:
def load_to_db(self, df):
    bad = 0
    good = 0
    start = datetime.datetime.now()
    for _, row in df.iterrows():
        try:
            region_obj, _ = Region.objects.get_or_create(
                region=row["region"],
            )
            rank_obj, _ = Rank.objects.get_or_create(
                rank=row["rank"],
            )
            chart_obj, _ = Chart.objects.get_or_create(
                chart=row["chart"],
            )
            artist_obj, _ = Artist.objects.get_or_create(
                artist=row["artist"],
            )
            title_obj, _ = Title.objects.get_or_create(
                title=row["title"],
            )
            arttit_obj, _ = ArtistTitle.objects.update_or_create(
                artist=artist_obj,
                title=title_obj,
            )
            spotifydata_obj, _ = SpotifyData.objects.update_or_create(
                title=arttit_obj,
                rank=rank_obj,
                date=row["date"],
                artist=arttit_obj,
                region=region_obj,
                chart=chart_obj,
                streams=row["streams"],
            )
            good += 1
            now = datetime.datetime.now()
            print(f"goods: {good}, loading time: {now - start}")
        except Exception as e:
            bad += 1
            current_time = datetime.datetime.now()
            with open("data_load_logging.txt", "w") as bad_row:
                bad_row.write(
                    f"Error message: {e} \n"
                    + f"time: {current_time}, \n"
                    + f"title: {row['title']}, type: {row['title']} \n"
                    + f"rank: {int(row['rank'])}, type: {int(row['rank'])} \n"
                    + f"date: {row['date']}, type: {row['date']} \n"
                    + f"artist: {row['artist']}, type: {row['artist']} \n"
                    + f"region: {row['region']}, type: {row['region']} \n"
                    + f"chart: {row['chart']}, type: {row['chart']} \n"
                    + f"streams: {int(row['streams'])}, type: {int(row['streams'])} \n"
                    + "-" * 30
                    + "\n"
                )
I know it would probably help to use bulk_create/bulk_update, but I can't figure out how to write the correct script:
def load_to_db(self, path):
    start_time = timezone.now()
    try:
        with open(path, "r") as csv_file:
            data = csv.reader(csv_file)
            next(data)
            packet_region = []
            packet_rank = []
            packet_chart = []
            packet_artist = []
            packet_title = []
            packet_artist_title = []
            packet_spotify_data = []
            bad = -1  # first row is a header
            for row in data:
                region = Region(
                    region=row[4]
                )
                rank = Rank(
                    rank=row[1]
                )
                chart = Chart(
                    chart=row[5]
                )
                artist = Artist(
                    artist=row[3]
                )
                title = Title(
                    title=row[0]
                )
                artist_title = ArtistTitle(
                    artist=artist,
                    title=title
                )
                spotify_data = SpotifyData(
                    title=artist_title,
                    rank=rank,
                    date=row[3],
                    artist=artist_title,
                    region=region,
                    chart=chart,
                    streams=int(row[6])
                )
                packet_region.append(region)
                packet_rank.append(rank)
                packet_chart.append(chart)
                packet_artist.append(artist)
                packet_title.append(title)
                packet_artist_title.append(artist_title)
                packet_spotify_data.append(spotify_data)
                if len(packet_spotify_data) > 1000:
                    print(datetime.datetime.now())
                    Region.objects.bulk_create(packet_region)
                    Rank.objects.bulk_create(packet_rank)
                    Chart.objects.bulk_create(packet_chart)
                    Artist.objects.bulk_create(packet_artist)
                    Title.objects.bulk_create(packet_title)
                    ArtistTitle.objects.bulk_update(packet_artist_title)
                    SpotifyData.objects.bulk_update(packet_spotify_data)
                    packet_region = []
                    packet_rank = []
                    packet_chart = []
                    packet_artist = []
                    packet_title = []
                    packet_artist_title = []
                    packet_spotify_data = []
            logging.info(f"Failure numbers: {bad}")
            if packet_spotify_data:
                Region.objects.bulk_create(packet_region)
                Rank.objects.bulk_create(packet_rank)
                Chart.objects.bulk_create(packet_chart)
                Artist.objects.bulk_create(packet_artist)
                Title.objects.bulk_create(packet_title)
                ArtistTitle.objects.bulk_update(packet_artist_title)
                SpotifyData.objects.bulk_update(packet_spotify_data)
    except FileNotFoundError as e:
        raise NoFilesException("No such file or directory") from e
    end_time = timezone.now()
    self.stdout.write(
        self.style.SUCCESS(
            f"Loading CSV took: {(end_time-start_time).total_seconds()} seconds."
        )
    )
I tried to use bulk operations this way, but unfortunately it doesn't work.
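For comparison, a rough sketch of how batched loading is often structured. This is an outline under assumptions, not the asker's code: it keeps the small lookup tables deduplicated with get_or_create (so they get real primary keys) and only bulk-creates the large SpotifyData table in batches; the column names follow the DataFrame used in the first script.

def load_to_db(self, df, batch_size=5000):
    # Small lookup tables: create each distinct value once and cache the instance.
    regions = {r: Region.objects.get_or_create(region=r)[0] for r in df["region"].unique()}
    ranks = {r: Rank.objects.get_or_create(rank=int(r))[0] for r in df["rank"].unique()}
    charts = {c: Chart.objects.get_or_create(chart=c)[0] for c in df["chart"].unique()}
    artists = {a: Artist.objects.get_or_create(artist=a)[0] for a in df["artist"].unique()}
    titles = {t: Title.objects.get_or_create(title=t)[0] for t in df["title"].unique()}

    # One ArtistTitle per distinct (artist, title) pair.
    pairs = {}
    for artist, title in df[["artist", "title"]].drop_duplicates().itertuples(index=False):
        pairs[(artist, title)] = ArtistTitle.objects.get_or_create(
            artist=artists[artist], title=titles[title]
        )[0]

    # The big table is written in batches with bulk_create (one INSERT per batch).
    batch = []
    for row in df.itertuples(index=False):
        at = pairs[(row.artist, row.title)]
        batch.append(SpotifyData(
            title=at, artist=at,
            rank=ranks[row.rank], region=regions[row.region],
            chart=charts[row.chart], date=row.date, streams=int(row.streams),
        ))
        if len(batch) >= batch_size:
            SpotifyData.objects.bulk_create(batch)
            batch = []
    if batch:
        SpotifyData.objects.bulk_create(batch)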

Display Dash DataTable through callback

I want to be able to return a populated Dash DataTable based on the results from an input search. I've tried two methods so far: returning the entire DataTable in the callback output, and returning the data and columns separately in the callback. Neither option has been working for me. I've included the relevant code for each option and the error message that results from each:
Returning the data and columns separately:
@app.callback(
    [Output('table', 'data'),
     Output('table', 'columns')],
    [Input("button", "n_clicks")], state=[State('url', 'value')])
def update_table(n_click: int, url):
    if n_click > 1:
        summary, table = summarizer(url)
        columns = [{"name": i, "id": i, "deletable": True, "selectable": True} for i in table.columns]
        table = table.to_dict('records')
        return table, columns
    else:
        return [], []
The app.layout contains the following line:
html.Div(dt.DataTable(id='table'))
The error message that results from this is:
Objects are not valid as a React child
The second approach was to pass the entire DataTable through the callback and display it using just the html.Div in the layout, like this:
@app.callback(
    Output('table', 'children'),
    [Input("button", "n_clicks")], state=[State('url', 'value')])
def update_table(n_click: int, url):
    if n_click > 1:
        summary, table = summarizer(url)
        columns = [{"name": i, "id": i, "deletable": True, "selectable": True} for i in table.columns]
        table = table.to_dict('records')
        return dt.DataTable(data=table, columns=columns)
    else:
        return []
html.Div(id='table')
The corresponding error was:
Objects are not valid as a React child
This error is confusing to me, since it seems to be about the column definition; however, I can't pass in an array, and the documentation asks for a dictionary.
Full code sample:
import dash
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc
import dash_table as dt
from dash.dependencies import Input, Output, State
import sd_material_ui
from newspaper import Article
import gensim
from gensim.summarization import summarize
from dash.exceptions import PreventUpdate
from newspaper import fulltext
import requests
import pandas as pd
import yake
import nltk
from newsapi import NewsApiClient
leftSources = ["cnn", "buzzfeed", "the-washington-post", "bbc-news", "vice-news", "newsweek", "techcrunch", "reuters", "politico", "newsweek", "msnbc"]
rightSources = ["fox-news", "national-review", "new-york-magazine", "breitbart-news", "business-insider", "the-wall-street-journal", "bloomberg", "the-washington-times", "the-hill", "the-american-conservative"]
# importing CSS
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
# similarArticleURL
getSimilarArticlesURL = "https://us-central1-secure-site-266302.cloudfunctions.net/getSimilarArticles?keywords="
getKeywordsURL = "https://us-central1-secure-site-266302.cloudfunctions.net/getKeyword?text="
getArticleTextURL = "https://us-central1-secure-site-266302.cloudfunctions.net/getArticleText?url="
allData = pd.DataFrame()
# instantiating dash application
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
server = app.server # the flask app
# helper functions
def generate_table(dataframe, max_rows=10):
    return html.Table([
        html.Thead(
            html.Tr([html.Th(col) for col in dataframe.columns])
        ),
        html.Tbody([
            html.Tr([
                html.Td(dataframe.iloc[i][col]) for col in dataframe.columns
            ]) for i in range(min(len(dataframe), max_rows))
        ])
    ])
app.layout = html.Div([
    html.Div(html.H3("Brief.Me"), style={'font-weight': 'bold', 'background-color': 'darkorange', 'color': 'white', 'text-align': 'center'}),
    html.Br(),
    html.Br(),
    dbc.Row([
        dbc.Col(dbc.Input(id='url', type='url', size=30, placeholder="Type or copy/paste an URL"), width={'size': 6, 'order': 1, 'offset': 3}),
        dbc.Col(dbc.Button("Summarize", id='button', n_clicks=1, color="primary", className="mr-1"), width={'order': 2})
    ]),
    html.Br(),
    # dbc.Row([
    #     dbc.Col(dcc.Loading(html.Div(html.Div(id="summary"), style={'font-weight':'bold'})), width={'size':6, 'offset':3})
    # ]),
    html.Div(id='table')
],
)
def fetch_similar_articles(keyword):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    newsapi = NewsApiClient(api_key='ce7482cbd40f4d90a8eea404e7702db6')
    top_headlines = newsapi.get_top_headlines(q=keyword,
                                              sources='bbc-news,the-wall-street-journal,the-washington-post,fox-news,bloomberg, vice-news, politico, reuters, the-hill',
                                              language='en')
    return top_headlines["articles"]
def fetch_article_text(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    except:
        return None
def summarizer(url):
    global allData
    leftSummaries, rightSummaries = {}, {}
    text = fetch_article_text(url)
    main_summary = summarize(text)
    keywords = extract_keywords(text)
    urls = []
    rightData, leftData, allData = get_articles_content(keywords)
    rightDf, leftDf = pd.DataFrame(rightData), pd.DataFrame(leftData)
    allSources = pd.concat([rightDf, leftDf], axis=1)
    return main_summary, allData
def get_articles_content(keywords):
    '''
    This function will return a row of the dataframe where there is a title, source, url and summary.
    '''
    allResults, leftRows, rightRows = [], [], []
    for keyword in keywords:
        articleList = fetch_similar_articles(keyword)
        for elem in articleList:
            source = elem['source']
            url = elem['url']
            title = elem['title']
            text = fetch_article_text(url)
            if text is not None and len(text) > 1:
                summary = summarize(text)
                allResults.append({'title': title, 'url': url, 'source': source, 'summary': summary})
                if source in leftSources:
                    leftRows.append(pd.DataFrame({'title': title, 'url': url, 'source': source, 'summary': summary}))
                elif source in rightSources:
                    rightRows.append(pd.DataFrame({'title': title, 'url': url, 'source': source, 'summary': summary}))
    allResults = pd.DataFrame(allResults)
    return leftRows, rightRows, allResults
def extract_keywords_yake(text, phrase_length, num_keywords):
    custom_kw_extractor = yake.KeywordExtractor(n=phrase_length, top=num_keywords)
    keywords = custom_kw_extractor.extract_keywords(text)
    return keywords

def extract_keywords(text):
    '''
    Returns a list of keywords given the article text.
    '''
    global getKeywordsURL
    getKeywordsURL += text
    keywordRes = extract_keywords_yake(text, 2, 5)
    keywords = []
    for pair in keywordRes:
        keywords.append(pair[1])
    return keywords
@app.callback(  # Output('summary', 'children')
    Output('table', 'children'),
    [Input("button", "n_clicks")], state=[State('url', 'value')])
def update_table(n_click: int, url):
    if n_click > 1:
        summary, table = summarizer(url)
        columns = [{"name": i, "id": i, "deletable": True, "selectable": True} for i in table.columns]
        table = table.to_dict('records')
        return dt.DataTable(data=table, columns=columns)
    else:
        return [], []
if __name__ == '__main__':
    app.run_server(debug=True, host='0.0.0.0', port=8080)
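For context, a minimal runnable sketch of the second pattern: an empty html.Div in the layout whose children the callback fills with a DataTable. The component IDs and the placeholder DataFrame here are illustrative stand-ins, not the asker's summarizer.

import dash
import dash_html_components as html
import dash_core_components as dcc
import dash_table as dt
from dash.dependencies import Input, Output, State
import pandas as pd

app = dash.Dash(__name__)
app.layout = html.Div([
    dcc.Input(id='url', type='text', placeholder='URL'),
    html.Button('Summarize', id='button', n_clicks=0),
    html.Div(id='table'),          # the callback fills this Div with a DataTable
])

@app.callback(Output('table', 'children'),
              [Input('button', 'n_clicks')],
              [State('url', 'value')])
def update_table(n_clicks, url):
    if not n_clicks:
        return []                  # children must be a component or list, not (data, columns)
    df = pd.DataFrame({'title': ['example'], 'summary': ['...']})  # placeholder for summarizer(url)
    columns = [{'name': c, 'id': c} for c in df.columns]
    return dt.DataTable(data=df.to_dict('records'), columns=columns)

if __name__ == '__main__':
    app.run_server(debug=True)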

Creating a dataset

I am trying to extract tweets with a specific hashtag and save them in a CSV file. The code below works well, but I would like to split the data. How can I split it?
Any advice will be highly appreciated,
Niddal
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import time
import json
import codecs
import sys
ckey = ''
csecret = ''
atoken = ''
asecret = ''
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
class StdOutListener(StreamListener):
    def on_data(self, data):
        try:
            tweet = json.loads(data)['text']
            #tweet = data.split(',"text":"')[1].split('","source')[0]
            print(tweet.translate(non_bmp_map))
            saveThis = str(time.time()) + '::' + tweet
            SaveFile = codecs.open('d:\\StremHash.csv', 'a', "utf-8")
            SaveFile.write(saveThis)
            SaveFile.write('\n')
            SaveFile.close()
            return True
        except BaseException, e:
            print('failed on data,', str(e))
            time.sleep(5)

    def on_error(self, status):
        print(status)

if __name__ == '__main__':
    l = StdOutListener()
    auth = OAuthHandler(ckey, csecret)
    auth.set_access_token(atoken, asecret)
    twitterStream = Stream(auth, l)
    twitterStream.filter(track=[unicode("#عيدكم_مبارك", "utf-8")])
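If "splitting" means storing the timestamp and the tweet text as separate CSV columns rather than joining them with '::', one possible sketch (Python 3 style; the file path is the same one used above, and the function name is just illustrative):

import csv
import time

def save_tweet(tweet_text, path='d:\\StremHash.csv'):
    # Write timestamp and tweet as two columns; csv.writer handles quoting,
    # so commas or newlines inside the tweet do not break the file.
    with open(path, 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerow([time.time(), tweet_text])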
