ValueError: Unknown protobuf attr type <type 'datetime.date'> - google-app-engine

I am getting an error when executing the code below. I have a Datastore entity kind with a property of type Date; an example value stored for one row is 2016-01-03 (19:00:00.000) EDT.
The code filters the entity values on date greater than 2016-01-01. Any idea what is wrong with the code?
Error
ValueError: Unknown protobuf attr type <type 'datetime.date'>
Code
import pandas as pd
import numpy as np
from datetime import datetime
from google.cloud import datastore
from flask import Flask,Blueprint
app = Flask(__name__)
computation_cron= Blueprint('cron.stock_data_transformation', __name__)
#computation_cron.route('/cron/stock_data_transformation')
def cron():
    ds = datastore.Client(project="earningspredictor-173913")

    query = ds.query(kind='StockPrice')
    query.add_filter('date', '>', datetime.strptime("2016-01-01", '%Y-%m-%d').date())
    dataframe_data = []
    temp_dict = {}
    for q in query.fetch():
        temp_dict["stock_code"] = q["stock_code"]
        temp_dict["date"] = q["date"]
        temp_dict["ex_dividend"] = q["ex_dividend"]
        temp_dict["split_ratio"] = q["split_ratio"]
        temp_dict["adj_open"] = q["adj_open"]
        temp_dict["adj_high"] = q["adj_high"]
        temp_dict["adj_low"] = q["adj_low"]
        temp_dict["adj_close"] = q["adj_close"]
        temp_dict["adj_volume"] = q["adj_volume"]
        dataframe_data.append(temp_dict)
    sph = pd.DataFrame(data=dataframe_data, columns=temp_dict.keys())
    # print sph.to_string()

    query = ds.query(kind='EarningsSurprise')
    query.add_filter('act_rpt_date', '>', datetime.strptime("2016-01-01", '%Y-%m-%d').date())
    dataframe_data = []
    temp_dict = {}
    for q in query.fetch():
        temp_dict["stock_code"] = q["stock_code"]
        temp_dict["eps_amount_diff"] = q["eps_amount_diff"]
        temp_dict["eps_actual"] = q["eps_actual"]
        temp_dict["act_rpt_date"] = q["act_rpt_date"]
        temp_dict["act_rpt_code"] = q["act_rpt_code"]
        temp_dict["eps_percent_diff"] = q["eps_percent_diff"]
        dataframe_data.append(temp_dict)
    es = pd.DataFrame(data=dataframe_data, columns=temp_dict.keys())

You seem to be using the generic google-cloud-datastore client library, not the NDB Client Library.
For google-cloud-datastore all date and/or time properties have the same format. From Date and time:
JSON
field name: timestampValue
type: string (RFC 3339 formatted, with milliseconds, for instance 2013-05-14T00:01:00.234Z)
Protocol buffer
field name: timestamp_value
type: Timestamp
Sort order: Chronological
Notes: When stored in Cloud Datastore, precise only to microseconds; any additional precision is rounded down.
So when setting or comparing such properties, try to use strings formatted as specified (or integers for the protobuf Timestamp?), not objects from the datetime module directly (those work with the NDB library). The same may be true for queries as well.
Note: this is based on documentation only; I haven't used the generic library myself.
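In practice the error comes from passing a datetime.date where the client expects a timestamp-compatible value. Below is a minimal sketch of the query, under the assumption that the generic client accepts full datetime.datetime objects (serializing them as protobuf Timestamps); verify this against the library version you use:

from datetime import datetime
from google.cloud import datastore

ds = datastore.Client(project="earningspredictor-173913")

query = ds.query(kind='StockPrice')
# Assumption: pass a datetime.datetime (not a datetime.date), so midnight on
# 2016-01-01 stands in for the date; a timezone-aware value may be needed
# depending on how the property was stored.
cutoff = datetime(2016, 1, 1)
query.add_filter('date', '>', cutoff)

for entity in query.fetch():
    print(entity["stock_code"], entity["date"])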

Related

Sentiment140 Preprocessing

I have been trying to do some preprocessing on the Sentiment140 database on Kaggle: https://www.kaggle.com/kazanova/sentiment140
The code I'm using is this:
import os
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import RegexpTokenizer

Base_location = ''
dataset_location = os.path.join(Base_location, 'Sentiment140.csv')
corpus = []
labels = []

# Parse tweets and sentiments
with open(dataset_location, 'r', encoding='latin-1') as df:
    for i, line in enumerate(df):
        parts = line.strip().split(',')
        # Sentiment (0 = Negative, 1 = Positive)
        labels.append(str(parts[0].strip()))
        # Tweet
        tweet = parts[5].strip()
        if tweet.startswith('"'):
            tweet = tweet[1:]
        if tweet.endswith('"'):
            tweet = tweet[::-1]
        corpus.append(tweet.strip().lower())

print('Corpus size: {}'.format(len(corpus)))

# Tokenize and stem
tkr = RegexpTokenizer('[a-zA-Z0-9#]+')
stemmer = LancasterStemmer()
tokenized_corpus = []
for i, tweet in enumerate(corpus):
    tokens = [stemmer.stem(t) for t in tkr.tokenize(tweet) if not t.startswith('#')]
    tokenized_corpus.append(tokens)
print(tokenized_corpus)
However, I keep getting this error:
TypeError: '_io.TextIOWrapper' object is not subscriptable
Can anyone help me understand how to solve the issue?
Thanks in advance
TL;DR
To read .csv or structured datasets, use pandas https://pandas.pydata.org/ or any other dataframe libraries.
In Long:
Instead of doing:
Base_location = ''
dataset_location = os.path.join(Base_location, 'Sentiment140.csv')
corpus = []
labels = []

# Parse tweets and sentiments
with open(dataset_location, 'r', encoding='latin-1') as df:
    for i, line in enumerate(df):
        parts = line.strip().split(',')
        # Sentiment (0 = Negative, 1 = Positive)
        labels.append(str(parts[0].strip()))
        # Tweet
        tweet = parts[5].strip()
        if tweet.startswith('"'):
            tweet = tweet[1:]
        if tweet.endswith('"'):
            tweet = tweet[::-1]
        corpus.append(tweet.strip().lower())
You could simply read the .csv file with pandas, e.g.
import pandas as pd
corpus = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1')
Then use the .apply() function to process the tweets:
"""
Columns
====
target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
ids: The id of the tweet ( 2087)
date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
flag: The query (lyx). If there is no query, then this value is NO_QUERY.
user: the user that tweeted (robotickilldozr)
text: the text of the tweet (Lyx is cool)
"""
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import RegexpTokenizer
import pandas as pd
df = pd.read_csv('training.1600000.processed.noemoticon.csv',
header=None,
names=['target', 'ids', 'date', 'flag', 'user', 'text'],
encoding='latin-1')
tokenizer = RegexpTokenizer('[a-zA-Z0-9#]+')
stemmer = LancasterStemmer()
def process_tweet(tweet):
return [stemmer.stem(token) if not token.startswith('#') else token
for token in tokenizer.tokenize(tweet)]
# 1. Cast the column type to string
# 2. Lowercase it
# 3. Iterate throw each row and get the output from process_tweet()
# 4. # 3. Keep in a new column call `tokenized_text`
df['tokenized_text']= df['text'].str.lower().apply(process_tweet)
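If you still want plain Python lists like the original corpus and labels, a small follow-up sketch (note that in this dataset the sentiment column uses 0/2/4 rather than 0/1):

labels = df['target'].astype(str).tolist()
corpus = df['text'].str.lower().tolist()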

WatsonApiException: Error: invalid-api-key, Code: 401

I can't find the Alchemy Language API in IBM Watson.
Can I do this with the Natural Language Understanding service, and how?
When I add
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 \
import Features, EntitiesOptions, KeywordsOptions
it shows an error related to the combined keyword.
# In[]:
import tweepy
import re
import time
import math
import pandas as pd
from watson_developer_cloud import AlchemyLanguageV1

def initAlchemy():
    al = AlchemyLanguageV1(api_key='GRYVUMdBbOtJXxNOIs1aopjjaiyOmLG7xJBzkAnvvwLh')
    return al

def initTwitterApi():
    consumer_key = 'OmK1RrZCVJSRmKxIuQqkBExvw'
    consumer_secret = 'VWn6OR4rRgSi7qGnZHCblJMhrSvj1QbJmf0f62uX6ZQWZUUx5q'
    access_token = '4852231552-adGooMpTB3EJYPHvs6oGZ40qlo3d2JbVjqUUWkJ'
    access_token_secret = 'm9hgeM9p0r1nn8IoQWJYBs5qUQu56XmrAhsDSYKjuiVA4'
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)
    return api

'''This function is implemented to handle tweepy exception errors
because search is rate limited at 180 queries per 15 minute window by twitter'''
def limit(cursor):
    while True:
        try:
            yield cursor.next()
        except tweepy.TweepError as error:
            print(repr(error))
            print("Twitter Request limit error reached sleeping for 15 minutes")
            time.sleep(16*60)
        except tweepy.RateLimitError:
            print("Rate Limit Error occurred Sleeping for 16 minutes")
            time.sleep(16*60)

def retrieveTweets(api, search, lim):
    if(lim == ""):
        lim = math.inf
    else:
        lim = int(lim)
    text = []
    for tweet in limit(tweepy.Cursor(api.search, q=search).items(limit=lim)):
        t = re.sub('\s+', ' ', tweet.text)
        text.append(t)
    data = {"Tweet": text,
            "Sentiment": "",
            "Score": ""}
    dataFrame = pd.DataFrame(data, columns=["Tweet", "Sentiment", "Score"])
    return dataFrame

def analyze(al, dataFrame):
    sentiment = []
    score = []
    for i in range(0, dataFrame["Tweet"].__len__()):
        res = al.combined(text=dataFrame["Tweet"][i],
                          extract="doc-sentiment",
                          sentiment=1)
        sentiment.append(res["docSentiment"]["type"])
        if(res["docSentiment"]["type"] == "neutral"):
            score.append(0)
        else:
            score.append(res["docSentiment"]["score"])
    dataFrame["Sentiment"] = sentiment
    dataFrame["Score"] = score
    return dataFrame

def main():
    # Initialise Twitter Api
    api = initTwitterApi()
    # Retrieve tweets
    dataFrame = retrieveTweets(api, input("Enter the search query (e.g. #hillaryclinton ) : "), input("Enter limit for number of tweets to be searched or else just hit enter : "))
    # Initialise IBM Watson Alchemy Language Api
    al = initAlchemy()
    # Do Document Sentiment analysis
    dataFrame = analyze(al, dataFrame)
    # Save tweets, sentiment, and score data frame in csv file
    dataFrame.to_csv(input("Enter the name of the file (with .csv extension) : "))

if __name__ == '__main__':
    main()
# -*- coding: utf-8 -*-
Watson Natural Language Understanding only has a combined call, but since it is the only call, it isn't called combined; it's actually analyze. The best place to go for details is the API documentation - https://www.ibm.com/watson/developercloud/natural-language-understanding/api/v1/?python#post-analyze
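For illustration, a minimal sketch of a sentiment-only call with the Natural Language Understanding service; the version string and credentials are placeholders, and depending on the SDK release the response may be a plain dict or need .get_result():

from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 import Features, SentimentOptions

# Placeholder credentials/version; use the values from your own service instance.
nlu = NaturalLanguageUnderstandingV1(
    version='2018-03-16',
    username='YOUR_USERNAME',
    password='YOUR_PASSWORD')

response = nlu.analyze(
    text='Lyx is cool',
    features=Features(sentiment=SentimentOptions()))

# Older SDK releases return a dict directly; newer ones may require response.get_result().
print(response['sentiment']['document']['label'],
      response['sentiment']['document']['score'])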

Error when Importing keras in embedded python in C

I'm trying to embed Python in my C application. I downloaded the package from the official Python website and managed to do a simple Hello World.
Now I want to go deeper and use some Python libraries like numpy, keras, tensorflow...
I'm working with Python 3.5.4 and installed all the needed packages on my PC with pip3:
pip3 install keras
pip3 install tensorflow
...
Then I created my script and launched it in the Python environment; it works fine:
Python:
# Importing the libraries
#
import numpy as np
import pandas as pd
dataset2 = pd.read_csv('I:\RNA\dataset19.csv')
X_test = dataset2.iloc[:, 0:228].values
y_test = dataset2.iloc[:, 228].values
# 2.
import pickle
sc = pickle.load(open('I:\RNA\isVerb_sc', 'rb'))
X_test = sc.transform(X_test)
# 3.
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
classifier = Sequential()
classifier.add(Dense(units = 114, kernel_initializer = 'uniform', activation = 'relu', input_dim = 228))
classifier.add(Dropout(p = 0.3))
classifier.add(Dense(units = 114, kernel_initializer = 'uniform', activation = 'relu'))
classifier.add(Dropout(p = 0.3))
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
classifier.load_weights('I:\RNA\isVerb_weights.h5')
y_pred = classifier.predict(X_test)
y_pred1 = (y_pred > 0.5)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred1)
But when I execute the same script in a C environment with embedded Python, it doesn't work:
At first I executed my script directly with PyRun_SimpleFile with no luck, so I sliced it into multiple instructions with PyRun_SimpleString to pinpoint the problem:
C:
result = PyRun_SimpleString("import numpy as np"); // result = 0 (ok)
result = PyRun_SimpleString("import pandas as pd"); // result = 0 (ok)
...
result = PyRun_SimpleString("import pickle"); // result = 0 (ok)
... (all insctruction above works)
result = PyRun_SimpleString("import keras"); // result = -1 !!
... (all under this failed)
but there is not a single stack trace for this error. I tried this, but I just got:
"Here's the output: (null)"
My initialization of Python in C seems correct, since other libraries import fine:
// Python
wchar_t *stdProgramName = L"I:\\LIBs\\cpython354";
Py_SetProgramName(stdProgramName);
wchar_t *stdPythonHome = L"I:\\LIBs\\cpython354";
Py_SetPythonHome(stdPythonHome);
wchar_t *stdlib = L"I:\\LIBs\\cpython354;I:\\LIBs\\cpython354\\Lib\\python35.zip;I:\\LIBs\\cpython354\\Lib;I:\\LIBs\\cpython354\\DLLs;I:\\LIBs\\cpython354\\Lib\\site-packages";
Py_SetPath(stdlib);
// Initialize Python
Py_Initialize();
From inside a Python cmd, the line import keras takes some time (3 sec) but works (with a warning, but I found no harm in it):
>>> import keras
I:\LIBs\cpython354\lib\site-packages\h5py\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
from ._conv import register_converters as _register_converters
Using TensorFlow backend.
>>>
I'm at a loss now; I don't know where to look since there is no stack trace.
It seems that when you import keras, it executes this line:
sys.stderr.write('Using TensorFlow backend.\n')
but sys.stderr is not defined in Python embedded on Windows.
A simple correction is to define sys.stderr, for example:
import sys

class CatchOutErr:
    def __init__(self):
        self.value = ''
    def write(self, txt):
        self.value += txt

catchOutErr = CatchOutErr()
sys.stderr = catchOutErr
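To see why the keras import itself fails once the catcher is installed, a small usage sketch (an assumption on top of the answer, not part of it):

import sys
import traceback

try:
    import keras
except Exception:
    # Route the full traceback through the replacement stderr so it is captured too.
    traceback.print_exc(file=sys.stderr)

# sys.stderr is the CatchOutErr instance installed above, so its .value now holds
# everything keras (or the traceback) wrote; fetch it from the C side to debug.
error_text = sys.stderr.value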

In Django how can I run a custom clean function on fixture data during import and validation?

In a ModelForm I can write a clean_<field_name> member function to automatically validate and clean up data entered by a user, but what can I do about dirty json or csv files (fixtures) during a manage.py loaddata?
Fixtures loaded with loaddata are assumed to contain clean data that doesn't need validation (usually as the inverse of a prior dumpdata), so the short answer is that loaddata isn't the approach you want if you need to clean your inputs.
However, you can probably use some of the underpinnings of loaddata while implementing your custom data-cleaning code: you can easily script something with the Django serialization libs that reads your existing data files in and saves the resulting objects normally once the data has been cleaned up.
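For example, a minimal sketch of that approach using the serialization framework (the fixture path and the clean_zip_code hook are illustrative assumptions, not from the question):

from django.core import serializers

# Assumed fixture path; adjust to your project layout.
with open('fixtures/Entity.json') as f:
    for deserialized in serializers.deserialize('json', f):
        obj = deserialized.object
        # Run whatever cleaning hook the model defines before saving normally.
        if hasattr(obj, 'clean_zip_code'):
            obj.zip_code = obj.clean_zip_code(obj.zip_code)
        deserialized.save()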
In case others want to do something similar, I defined a model method to do the cleaning (so it can be called from ModelForms)
MAX_ZIPCODE_DIGITS = 9
MIN_ZIPCODE_DIGITS = 5

def clean_zip_code(self, s=None):
    #s = str(s or self.zip_code)
    if not s:
        return None
    s = re.sub("\D", "", s)
    if len(s) > self.MAX_ZIPCODE_DIGITS:
        s = s[:self.MAX_ZIPCODE_DIGITS]
    if len(s) in (self.MIN_ZIPCODE_DIGITS-1, self.MAX_ZIPCODE_DIGITS-1):
        s = '0'+s  # FIXME: deal with other intermediate lengths
    if len(s) >= self.MAX_ZIPCODE_DIGITS:
        s = s[:self.MIN_ZIPCODE_DIGITS]+'-'+s[self.MIN_ZIPCODE_DIGITS:]
    return s
Then I wrote a standalone Python script to clean up my legacy JSON files using any clean_ methods found among the models.
import os, json

def clean_json(app='XYZapp', model='Entity', fields='zip_code', cleaner_prefix='clean_'):
    # Set the DJANGO_SETTINGS_MODULE environment variable.
    os.environ['DJANGO_SETTINGS_MODULE'] = app + ".settings"
    settings = __import__(app + '.settings').settings
    models = __import__(app + '.models').models
    fpath = os.path.join(settings.SITE_PROJECT_PATH, 'fixtures', model + '.json')
    if isinstance(fields, (str, unicode)):
        fields = [fields]
    Ns = []
    for field in fields:
        try:
            instance = getattr(models, model)()
        except AttributeError:
            print 'No model named %s could be found' % (model,)
            continue
        try:
            cleaner = getattr(instance, cleaner_prefix + field)
        except AttributeError:
            print 'No cleaner method named %s.%s could be found' % (model, cleaner_prefix + field)
            continue
        print 'Cleaning %s using %s.%s...' % (fpath, model, cleaner.__name__)
        fin = open(fpath, 'r')
        if fin:
            l = json.load(fin)
            before = len(l)
            cleans = 0
            for i in range(len(l)):
                if 'fields' in l[i] and field in l[i]['fields']:
                    l[i]['fields'][field] = cleaner(l[i]['fields'][field])  # cleaner returns None to delete records
                    cleans += 1
            fin.close()
            after = len(l)
            assert after > .5*before
            Ns += [(before, after, cleans)]
            print 'Writing %d/%d (new/old) records after %d cleanups...' % Ns[-1]
            with open(fpath, 'w') as fout:
                fout.write(json.dumps(l, indent=2, sort_keys=True))
    return Ns

if __name__ == '__main__':
    clean_json()

Google Datastore - Blob or Text

Two possible ways of persisting large strings in the Google Datastore are the Text and Blob data types.
From a storage-consumption perspective, which of the two is recommended? Same question from a protobuf serialization and deserialization perspective.
There is no significant performance difference between the two - just use whichever one best fits your data. BlobProperty should be used to store binary data (e.g., str objects) while TextProperty should be used to store any textual data (e.g., unicode or str objects). Note that if you store a str in a TextProperty, it must only contain ASCII bytes (less than hex 80 or decimal 128) (unlike BlobProperty).
Both of these properties are derived from UnindexedProperty as you can see in the source.
Here is a sample app which demonstrates that there is no difference in storage overhead for these ASCII or UTF-8 strings:
import struct
from google.appengine.ext import db, webapp
from google.appengine.ext.webapp.util import run_wsgi_app

class TestB(db.Model):
    v = db.BlobProperty(required=False)

class TestT(db.Model):
    v = db.TextProperty(required=False)

class MainPage(webapp.RequestHandler):
    def get(self):
        self.response.headers['Content-Type'] = 'text/plain'
        # try simple ASCII data and a bytestring with non-ASCII bytes
        ascii_str = ''.join([struct.pack('>B', i) for i in xrange(128)])
        arbitrary_str = ''.join([struct.pack('>2B', 0xC2, 0x80+i) for i in xrange(64)])
        u = unicode(arbitrary_str, 'utf-8')
        t = [TestT(v=ascii_str), TestT(v=ascii_str*1000), TestT(v=u*1000)]
        b = [TestB(v=ascii_str), TestB(v=ascii_str*1000), TestB(v=arbitrary_str*1000)]
        # demonstrate error cases
        try:
            err = TestT(v=arbitrary_str)
            assert False, "should have caused an error: can't store non-ascii bytes in a Text"
        except UnicodeDecodeError:
            pass
        try:
            err = TestB(v=u)
            assert False, "should have caused an error: can't store unicode in a Blob"
        except db.BadValueError:
            pass
        # determine the serialized size of each model (note: no keys assigned)
        fEncodedSz = lambda o: len(db.model_to_protobuf(o).Encode())
        sz_t = tuple([fEncodedSz(x) for x in t])
        sz_b = tuple([fEncodedSz(x) for x in b])
        # output the results
        self.response.out.write("text: 1=>%dB 2=>%dB 3=>%dB\n" % sz_t)
        self.response.out.write("blob: 1=>%dB 2=>%dB 3=>%dB\n" % sz_b)

application = webapp.WSGIApplication([('/', MainPage)])

def main():
    run_wsgi_app(application)

if __name__ == '__main__':
    main()
And here is the output:
text: 1=>172B 2=>128047B 3=>128047B
blob: 1=>172B 2=>128047B 3=>128047B
