I'm using Google App Engine and need the keys of an entity to be between 1000 and 2^31. I'm considering 2 ways of doing this:
1) Keep a counter of the created keys, as detailed here: https://cloud.google.com/appengine/articles/sharding_counters. But this requires several datastore reads/writes for every key, and I'm not sure it is guaranteed to be consistent.
2) Generate a random int in my range and check if that key is already in the database. To make it cheap, I'd like a keys_only query, but I can't find a way to do this except by saving the key also as a separate field:
MyEntity.query(MyEntity.key_field==new_random_number).fetch(keys_only=True)
Is there a better way to achieve this?
How many writes per second are you expecting in production? Both of your proposals are good, but for our application I decided to go with a sharded counter approach. You can also set the id of an entity before you put it to avoid the query altogether:
MyModel(id="foo")
then you can look it up:
MyModel.get_by_id("foo")
The id doesn't have to be a string; it can also be a number:
MyModel(id=123)
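If you go with the random-id approach from option 2, you can also skip the key_field query entirely: get_by_id is a plain key lookup, so it is a cheap existence check. A minimal sketch, assuming the MyModel class above and the 1000 to 2^31 range from the question (allocate_random_id and max_tries are illustrative names, not part of any API):
import random

from google.appengine.ext import ndb


class MyModel(ndb.Model):
    data = ndb.StringProperty()


def allocate_random_id(max_tries=10):
    for _ in range(max_tries):
        # Candidate id in [1000, 2**31), the range from the question.
        candidate = random.randrange(1000, 2 ** 31)
        # get_by_id is a key lookup, so no extra indexed field is needed.
        if MyModel.get_by_id(candidate) is None:
            return candidate
    raise RuntimeError('no free id found after %d tries' % max_tries)


entity = MyModel(id=allocate_random_id())
entity.put()
Note that the check-then-put here is not atomic; two requests could race on the same id. The get_or_insert example further down closes that gap.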
If you decide to go with the sharded counter, here's our production-level code, which is darn close to what you read in that article ;o) Memcache adds the level of consistency we needed to be able to get the right count.
# Imports needed by the snippet below; BaseModel and constants.SHORT_MEMCACHE_TTL
# are defined elsewhere in our codebase.
import random

from google.appengine.api import memcache
from google.appengine.ext import ndb


class GeneralShardedCounterConfig(ndb.Model):
    SHARD_KEY_TEMPLATE = 'gen-count-{}-{:d}'
    num_shards = ndb.IntegerProperty(default=200)

    @classmethod
    def all_keys(cls, name):
        config = cls.get_or_insert(name)
        shard_key_strings = [
            GeneralShardedCounterConfig.SHARD_KEY_TEMPLATE.format(name, index)
            for index in range(config.num_shards)]
        return [ndb.Key(GeneralShardedCounter, shard_key_string)
                for shard_key_string in shard_key_strings]


class GeneralShardedCounter(BaseModel):
    count = ndb.IntegerProperty(default=0)

    @classmethod
    def get_count(cls, name):
        total = memcache.get(name)
        if total is None:
            total = 0
            all_keys = GeneralShardedCounterConfig.all_keys(name)
            for counter in ndb.get_multi(all_keys):
                if counter is not None:
                    total += counter.count
            memcache.set(name, total, constants.SHORT_MEMCACHE_TTL)
        return total

    @classmethod
    @ndb.transactional(retries=5)
    def increase_shards(cls, name, num_shards):
        config = GeneralShardedCounterConfig.get_or_insert(name)
        if config.num_shards < num_shards:
            config.num_shards = num_shards
            config.put()

    @classmethod
    @ndb.transactional(xg=True)
    def _increment(cls, name, num_shards):
        index = random.randint(0, num_shards - 1)
        shard_key_string = GeneralShardedCounterConfig.SHARD_KEY_TEMPLATE.format(name, index)
        counter = cls.get_by_id(shard_key_string)
        if counter is None:
            counter = cls(id=shard_key_string)
        counter.count += 1
        counter.put()
        # Memcache increment does nothing if the name is not a key in memcache
        memcache.incr(name)

    @classmethod
    def increment(cls, name):
        config = GeneralShardedCounterConfig.get_or_insert(name)
        cls._increment(name, config.num_shards)

    @classmethod
    def _add(cls, name, value, num_shards):
        index = random.randint(0, num_shards - 1)
        shard_key_string = GeneralShardedCounterConfig.SHARD_KEY_TEMPLATE.format(name, index)
        counter = cls.get_by_id(shard_key_string)
        if counter is None:
            counter = cls(id=shard_key_string)
        counter.count += value
        counter.put()
        # Memcache increment does nothing if the name is not a key in memcache
        memcache.incr(name, value)

    @classmethod
    def add(cls, name, value):
        config = GeneralShardedCounterConfig.get_or_insert(name)
        cls._add(name, value, config.num_shards)
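A brief usage sketch of the counter above (the 'entity-ids' counter name is just an example, not anything from our code):
# One shard is picked at random and incremented inside a transaction.
GeneralShardedCounter.increment('entity-ids')
GeneralShardedCounter.add('entity-ids', 5)

# Reads hit memcache first and fall back to summing all shards.
total = GeneralShardedCounter.get_count('entity-ids')

# If allocated ids must start at 1000 (as in the question), offset the count.
next_id = 1000 + total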
An example of get_or_insert, inserting 7 unique keys:
import webapp2
from google.appengine.ext import ndb
from datetime import datetime
import random
import logging


class Examples(ndb.Model):
    data = ndb.StringProperty()
    modified = ndb.DateTimeProperty(auto_now=True)
    created = ndb.DateTimeProperty()  # NOT auto_now_add HERE !!


class MainHandler(webapp2.RequestHandler):
    def get(self):
        count = 0
        while count < 7:
            random_key = str(random.randrange(1, 9))
            dt_created = datetime.now()
            example = Examples.get_or_insert(random_key, created=dt_created,
                                             data='some data for ' + random_key)
            if example.created != dt_created:
                logging.warning('Random key %s not unique' % random_key)
                continue
            count += 1
        self.response.write('Keys inserted')


app = webapp2.WSGIApplication([
    ('/', MainHandler)
], debug=True)
Related
I have been trying to build a SagemakerUser from the base User class in the Locust library. The issue is that when I use it with a timed shape test and the test ends (you can see the message "Shape test stopping"), the load test shrugs it off and continues. Below is the script I have written to this end. My question is: how is this behaviour explained?
import pandas as pd
from locust import HttpUser, User, task, TaskSet, events, LoadTestShape
from sagemaker.serializers import JSONSerializer
from sagemaker.session import Session
import sagemaker
import time
import sys
import math
import pdb

df = "some df to load samples from"
endpoint = "sage maker end point name"


class SagemakerClient(sagemaker.predictor.Predictor):
    def predictEx(self, data):
        start_time = time.time()
        start_perf_counter = time.perf_counter()
        name = 'predictEx'
        try:
            result = self.predict(data)
        except:
            total_time = int((time.perf_counter() - start_perf_counter) * 1000)
            events.request_failure.fire(request_type="sagemaker", name=name, response_time=total_time, exception=sys.exc_info(), response_length=0)
        else:
            total_time = int((time.perf_counter() - start_perf_counter) * 1000)
            events.request_success.fire(request_type="sagemaker", name=name, response_time=total_time, response_length=sys.getsizeof(result))


class SagemakerLocust(User):
    abstract = True

    def __init__(self, *args, **kwargs):
        super(SagemakerLocust, self).__init__(*args, **kwargs)
        self.client = SagemakerClient(
            sagemaker_session=Session(),
            endpoint_name="sagemaker-test",
            serializer=JSONSerializer())


class APIUser(SagemakerLocust):
    @task
    def call(self):
        request = df.text.sample(1, weights=df.length).iloc[0]
        self.client.predictEx(request)


class StepLoadShape(LoadTestShape):
    """
    A step load shape

    Keyword arguments:

        step_time -- Time between steps
        step_load -- User increase amount at each step
        spawn_rate -- Users to stop/start per second at every step
        time_limit -- Time limit in seconds
    """
    step_time = 30  # 3600
    step_load = 1
    spawn_rate = 1
    time_limit = 2  # 3600*6
    # pdb.set_trace()

    def tick(self):
        run_time = self.get_run_time()

        if run_time > self.time_limit:
            return None

        current_step = math.floor(run_time / self.step_time) + 1
        return (current_step * self.step_load, self.spawn_rate)
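Not an answer to the question above, but a small way to observe what the runner actually does when the shape ends: a sketch assuming a Locust version (1.2+) that has both LoadTestShape and the test_stop event hook, with the listener body being purely illustrative. It can be dropped into the same locustfile.
from locust import events


@events.test_stop.add_listener
def on_test_stop(environment, **kwargs):
    # Fires when the runner stops the test, e.g. after tick() returns None.
    print("test_stop fired; runner state:", environment.runner.state)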
I have a Beam pipeline written in Python that doesn't make use of the parallelism correctly when deployed to a Flink runner.
There is unbounded data coming in through a Kafka connector, and I want the data to be read in parallel when split.
My understanding is that it should split up the tasks, but as shown in the image only one parallel instance is used; the other 5 subtasks finished instantly, leaving the one running subtask to do all the work.
The pipeline settings are:
options = PipelineOptions([
    "--runner=PortableRunner",
    "--sdk_worker_parallelism=3",
    "--artifact_endpoint=localhost:8098",
    "--job_endpoint=localhost:8099",
    "--environment_type=EXTERNAL",
    "--environment_config=localhost:50000",
    "--checkpointing_interval=30000",
])
options._all_options['parallelism'] = 3
Is this a missing config on the Flink runner, or something that can be configured in the Beam pipeline?
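As an aside (and an assumption about the Beam/Flink versions in use): the Flink runner defines a parallelism pipeline option, so it can usually be passed as a regular flag instead of writing into the private _all_options dict. A minimal sketch:
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions([
    "--runner=PortableRunner",
    "--job_endpoint=localhost:8099",
    "--environment_type=EXTERNAL",
    "--environment_config=localhost:50000",
    # Requested default parallelism for the Flink job, instead of
    # options._all_options['parallelism'] = 3
    "--parallelism=3",
])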
The full pipeline:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions([
    "--runner=PortableRunner",
    "--sdk_worker_parallelism=3",
    "--artifact_endpoint=localhost:8098",
    "--job_endpoint=localhost:8099",
    "--environment_type=EXTERNAL",
    "--environment_config=localhost:50000",
    "--checkpointing_interval=30000",
])
options._all_options['parallelism'] = 3


class CountProvider(beam.RestrictionProvider):
    def __init__(self, initial_split_size=5):
        self._initial_split_size = initial_split_size
        self.OffsetRestrictionTracker = None

    def imports(self):
        if self.OffsetRestrictionTracker is not None:
            return
        from apache_beam.io.restriction_trackers import OffsetRestrictionTracker, OffsetRange
        self.OffsetRestrictionTracker = OffsetRestrictionTracker
        self.OffsetRange = OffsetRange

    def initial_restriction(self, element):
        self.imports()
        return self.OffsetRange(0, 10)

    def create_tracker(self, restriction):
        self.imports()
        return self.OffsetRestrictionTracker(restriction)

    def restriction_size(self, element, restriction):
        return restriction.size() * 100_000

    def split(self, element, restriction):
        self.imports()
        if restriction.start + 1 >= restriction.stop:
            yield self.OffsetRange(restriction.start, restriction.stop)
        else:
            last_val = restriction.start
            for i in range(1, self._initial_split_size):
                next_stop = i * (restriction.start + restriction.stop) // self._initial_split_size
                yield self.OffsetRange(last_val, next_stop)
                last_val = next_stop
            yield self.OffsetRange(last_val, restriction.stop)


class CountFn(beam.DoFn):
    def setup(self):
        print("setup")

    def process(self, element, tracker=beam.DoFn.RestrictionParam(CountProvider())):
        res = tracker.current_restriction()
        print(f"Current Restriction {res.start}, {res.stop}")
        for i in range(res.start, res.stop):
            if not tracker.try_claim(i):
                return
            for j in range(10_000):
                yield i, j

    def get_initial_restriction(self, filename):
        return (0, 10)

    def teardown(self):
        print("Teardown")


p = beam.Pipeline(options=options)
out = (p | f'Create' >> beam.Create([tuple()])
         | f'Gen Data' >> beam.ParDo(CountFn())
         | beam.Map(print)
       )

result = p.run()
result.wait_until_finish()
I'm learning Django and I'm trying to make a Cart, where the customer can take an item and add it to their order row, and then the order will be submitted. My teacher said to use def initiate(customer), and I don't understand how to use it. Can someone please explain it to me? Thank you.
Here is the code I'm working on:
User = get_user_model()


class Customer(models.Model):
    user = models.OneToOneField(User, on_delete=Product, related_name="User")
    phone = models.CharField(max_length=20)
    address = models.TextField()
    balance = models.IntegerField(default=20000)

    def deposit(self, amount):
        self.balance += amount
        self.save()

    def spend(self, amount):
        if amount > self.balance:
            raise ValueError
        self.balance -= amount
        self.save()


class OrderRow(models.Model):
    product = models.ManyToManyField(Product)
    order = models.ForeignKey('Order', on_delete=models.CASCADE)
    amount = models.IntegerField()


class Order(models.Model):
    # Status values. DO NOT EDIT
    STATUS_SHOPPING = 1
    STATUS_SUBMITTED = 2
    STATUS_CANCELED = 3
    STATUS_SENT = 4

    customer = models.ForeignKey('Customer', on_delete=models.SET_NULL)
    order_time = models.DateTimeField(auto_now=True)
    total_price = Sum(F('amount') * F('product__price'))
    status = models.IntegerField(choices=status_choices)

    @staticmethod
    def initiate(customer):
        Order.initiate(User)

    def add_product(self, product, amount):
        Order.status = 1
        OrderRow.product = Product.objects.get(id=product.id)
        print(product.id)
        if OrderRow.objects.filter(product=product).exists():
            preexisting_order = OrderRow.objects.get(product=product, order=self)
            preexisting_order.amount += 1
            preexisting_order.save()
        else:
            new_order = OrderRow.objects.create(
                product=product,
                cart=self,
                amount=1,
            )
            new_order.save()
You are probably supposed to create a new Order associated with this customer. Something along the following lines:
@classmethod
def initiate(cls, customer):
    return cls.objects.create(customer=customer, status=cls.STATUS_SHOPPING)
There are some other issues with your code. You cannot use SET_NULL if the fk is not nullable:
customer = models.ForeignKey('Customer', on_delete=models.SET_NULL, null=True)
There should not be multiple products per row:
class OrderRow(models.Model):
    product = models.ForeignKey(Product, on_delete=models.CASCADE)  # not many2many!
    # ...
Also, your add_product needs quite a bit of fixing:
def add_product(self, product, amount):
    self.status = self.STATUS_SHOPPING  # the instance is self + use your descriptive variables
    print(product.id)
    # filter only rows in the current order!
    if self.orderrow_set.filter(product=product).exists():
        # fix naming: this is a row, not an order
        preexisting_order_row = self.orderrow_set.get(product=product)
        preexisting_order_row.amount += amount  # why +1, you are adding amount
        preexisting_order_row.save()
    else:
        new_order_row = OrderRow.objects.create(
            product=product,
            order=self,
            amount=amount,
        )  # create saves already
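A short usage sketch of the corrected pieces, assuming a view or shell context (the customer and product lookups here are placeholders):
# e.g. in a view or the Django shell
customer = Customer.objects.get(user=request.user)

# initiate() returns a fresh Order in the shopping state.
order = Order.initiate(customer)

# Adds (or tops up) an OrderRow for two units of this product.
product = Product.objects.get(id=1)
order.add_product(product, amount=2)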
I can't find the AlchemyLanguage API in IBM Watson.
Can I do this with the natural-language-understanding service, and how?
When I add
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 \
import Features, EntitiesOptions, KeywordsOptions
it shows an error with the combined keyword.
# -*- coding: utf-8 -*-
# In[]:
import tweepy
import re
import time
import math
import pandas as pd
from watson_developer_cloud import AlchemyLanguageV1


def initAlchemy():
    al = AlchemyLanguageV1(api_key='GRYVUMdBbOtJXxNOIs1aopjjaiyOmLG7xJBzkAnvvwLh')
    return al


def initTwitterApi():
    consumer_key = 'OmK1RrZCVJSRmKxIuQqkBExvw'
    consumer_secret = 'VWn6OR4rRgSi7qGnZHCblJMhrSvj1QbJmf0f62uX6ZQWZUUx5q'
    access_token = '4852231552-adGooMpTB3EJYPHvs6oGZ40qlo3d2JbVjqUUWkJ'
    access_token_secret = 'm9hgeM9p0r1nn8IoQWJYBs5qUQu56XmrAhsDSYKjuiVA4'
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)
    return api


'''This function is implemented to handle tweepy exception errors
because search is rate limited at 180 queries per 15 minute window by twitter'''


def limit(cursor):
    while True:
        try:
            yield cursor.next()
        except tweepy.TweepError as error:
            print(repr(error))
            print("Twitter Request limit error reached sleeping for 15 minutes")
            time.sleep(16 * 60)
        except tweepy.RateLimitError:
            print("Rate Limit Error occurred Sleeping for 16 minutes")
            time.sleep(16 * 60)


def retrieveTweets(api, search, lim):
    if lim == "":
        lim = math.inf
    else:
        lim = int(lim)
    text = []
    for tweet in limit(tweepy.Cursor(api.search, q=search).items(limit=lim)):
        t = re.sub('\s+', ' ', tweet.text)
        text.append(t)
    data = {"Tweet": text,
            "Sentiment": "",
            "Score": ""}
    dataFrame = pd.DataFrame(data, columns=["Tweet", "Sentiment", "Score"])
    return dataFrame


def analyze(al, dataFrame):
    sentiment = []
    score = []
    for i in range(0, dataFrame["Tweet"].__len__()):
        res = al.combined(text=dataFrame["Tweet"][i],
                          extract="doc-sentiment",
                          sentiment=1)
        sentiment.append(res["docSentiment"]["type"])
        if res["docSentiment"]["type"] == "neutral":
            score.append(0)
        else:
            score.append(res["docSentiment"]["score"])
    dataFrame["Sentiment"] = sentiment
    dataFrame["Score"] = score
    return dataFrame


def main():
    # Initialise Twitter Api
    api = initTwitterApi()
    # Retrieve tweets
    dataFrame = retrieveTweets(api, input("Enter the search query (e.g. #hillaryclinton ) : "), input("Enter limit for number of tweets to be searched or else just hit enter : "))
    # Initialise IBM Watson Alchemy Language Api
    al = initAlchemy()
    # Do Document Sentiment analysis
    dataFrame = analyze(al, dataFrame)
    # Save tweets, sentiment, and score data frame in csv file
    dataFrame.to_csv(input("Enter the name of the file (with .csv extension) : "))


if __name__ == '__main__':
    main()
Watson Natural Language Understanding only has a combined call, but since it is the only call, it isn't called combined; it's actually called analyze. The best place to go for details is the API documentation: https://www.ibm.com/watson/developercloud/natural-language-understanding/api/v1/?python#post-analyze
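A minimal sketch of the equivalent call with Natural Language Understanding, assuming the watson_developer_cloud SDK; the version string, URL and credentials are placeholders, and depending on the SDK version analyze may return a dict directly or a response object you call get_result() on:
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 import Features, SentimentOptions

nlu = NaturalLanguageUnderstandingV1(
    version='2018-03-16',          # placeholder version date
    iam_apikey='YOUR_API_KEY',     # placeholder credentials
    url='https://gateway.watsonplatform.net/natural-language-understanding/api')

response = nlu.analyze(
    text="I love this product, but shipping was slow.",
    features=Features(sentiment=SentimentOptions()))

# Document-level sentiment replaces AlchemyLanguage's docSentiment block.
doc_sentiment = response['sentiment']['document']
print(doc_sentiment['label'], doc_sentiment['score'])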
I'm building a web app with Django. I use PostgreSQL for the db. The app code is getting really messy (my beginner skills being a big factor) and slow, even when I run the app locally.
This is an excerpt of my models.py file:
REPEATS_CHOICES = (
    (NEVER, 'Never'),
    (DAILY, 'Daily'),
    (WEEKLY, 'Weekly'),
    (MONTHLY, 'Monthly'),
    ...some more...
)


class Transaction(models.Model):
    name = models.CharField(max_length=30)
    type = models.IntegerField(max_length=1, choices=TYPE_CHOICES)  # 0 = 'Income' , 1 = 'Expense'
    amount = models.DecimalField(max_digits=12, decimal_places=2)
    date = models.DateField(default=date.today)
    frequency = models.IntegerField(max_length=2, choices=REPEATS_CHOICES)
    ends = models.DateField(blank=True, null=True)
    active = models.BooleanField(default=True)
    category = models.ForeignKey(Category, related_name='transactions', blank=True, null=True)
    account = models.ForeignKey(Account, related_name='transactions')
The problem is with date, frequency and ends. With this info I can know all the dates on which transactions occur and use them to fill a cashflow table. Doing things this way involves creating a lot of structures (dictionaries, lists and tuples) and iterating over them a lot. Maybe there is a very simple way of solving this with the current schema, but I couldn't figure out how.
I think that the app would be easier to code if, at the creation of a transaction, I could save all the dates in the db. I don't know if it's possible or if it's a good idea.
I'm reading a book about Google App Engine and the datastore's multivalued properties. What do you think about this for solving my problem?
Edit: I didn't know about the PickleField. I'm now reading about it; maybe I could use it to store all the transaction's datetime objects.
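If the PickleField route is taken, here is a minimal sketch with the third-party django-picklefield package; the occurrence_dates field and compute_occurrences method are made-up names, and this assumes the occurrence dates are precomputed when the transaction is saved (only the monthly case is sketched):
from django.db import models
from dateutil import relativedelta
from picklefield.fields import PickledObjectField  # third-party: django-picklefield


class Transaction(models.Model):
    # ... the fields from the excerpt above (date, frequency, ends, ...) ...

    # Hypothetical extra field holding the precomputed occurrence dates.
    occurrence_dates = PickledObjectField(default=list, blank=True)

    def compute_occurrences(self):
        """Fill occurrence_dates from date/frequency/ends (monthly case only, as a sketch)."""
        dates = []
        current = self.date
        if self.frequency == MONTHLY and self.ends:
            while current <= self.ends:
                dates.append(current)
                current = current + relativedelta.relativedelta(months=1)
        else:
            dates.append(current)
        self.occurrence_dates = dates
Calling compute_occurrences() from save() (or a signal) would let the view read the dates back instead of rebuilding them on every request.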
Edit2: This is an excerpt of my cashflow2 view (sorry for the horrible code):
def cashflow2(request, account_name="Initial"):
    if account_name == "Initial":
        uri = "/cashflow/new_account"
        return HttpResponseRedirect(uri)

    month_info = {}
    cat_info = {}
    m_y_list = []  # [(month, year),]
    trans = []
    min, max = [], []

    account = Account.objects.get(name=account_name, user=request.user)
    categories = account.categories.all()

    for year in range(2006, 2017):
        for month in range(1, 13):
            month_info[(month, year)] = [0, 0, 0]
            for cat in categories:
                cat_info[(cat, month, year)] = 0

    previous_months = 1  # previous months from actual
    next_months = 5
    dates_list = month_year_list(previous_months, next_months)  # Returns [(month, year)] from the requested range
    m_y_list = [(date.month, date.year) for date in month_year_list(1, 5)]
    min, max = dates_list[0], dates_list[-1]

    INCOME = 0
    EXPENSE = 1
    ONHAND = 2

    transacs_in_dates = []
    txs = account.transactions.order_by('date')
    for tx in txs:
        monthyear = ()
        monthyear = (tx.date.month, tx.date.year)
        if tx.frequency == 0:
            if tx.type == 0:
                month_info[monthyear][INCOME] += tx.amount
                if tx.category:
                    cat_info[(tx.category, monthyear[0], monthyear[1])] += tx.amount
            else:
                month_info[monthyear][EXPENSE] += tx.amount
                if tx.category:
                    cat_info[(tx.category, monthyear[0], monthyear[1])] += tx.amount
            if monthyear in m_y_list:
                if tx not in transacs_in_dates:
                    transacs_in_dates.append(tx)
        elif tx.frequency == 4:  # frequency = 'Monthly'
            months_dif = relativedelta.relativedelta(tx.ends, tx.date).months
            if tx.ends.day < tx.date.day:
                months_dif += 1
            years_dif = relativedelta.relativedelta(tx.ends, tx.date).years
            dif = months_dif + (years_dif * 12)
            dates_range = dif + 1
            for i in range(dates_range):
                dt = tx.date + relativedelta.relativedelta(months=+i)
                if (dt.month, dt.year) in m_y_list:
                    if tx not in transacs_in_dates:
                        transacs_in_dates.append(tx)
                if tx.type == 0:
                    month_info[(dt.month, dt.year)][INCOME] += tx.amount
                    if tx.category:
                        cat_info[(tx.category, dt.month, dt.year)] += tx.amount
                else:
                    month_info[(dt.month, dt.year)][EXPENSE] += tx.amount
                    if tx.category:
                        cat_info[(tx.category, dt.month, dt.year)] += tx.amount

    import operator
    thelist = []
    thelist = sorted((my + tuple(v) for my, v in month_info.iteritems()),
                     key=operator.itemgetter(1, 0))
    thelistlist = []
    for atuple in thelist:
        thelistlist.append(list(atuple))
    for i in range(len(thelistlist)):
        if i != 0:
            thelistlist[i][4] = thelistlist[i-1][2] - thelistlist[i-1][3] + thelistlist[i-1][4]

    list = []
    for el in thelistlist:
        if (el[0], el[1]) in m_y_list:
            list.append(el)

    transactions = account.transactions.all()
    cats_in_dates_income = []
    cats_in_dates_expense = []
    for t in transacs_in_dates:
        if t.category and t.type == 0:
            if t.category not in cats_in_dates_income:
                cats_in_dates_income.append(t.category)
        elif t.category and t.type == 1:
            if t.category not in cats_in_dates_expense:
                cats_in_dates_expense.append(t.category)

    cat_infos = []
    for k, v in cat_info.items():
        cat_infos.append((k[0], k[1], k[2], v))
Depends on how relevant App Engine is here. P.S. If you'd like to store pickled objects as well as JSON objects in the Google Datastore, check out these two code snippets:
http://kovshenin.com/archives/app-engine-json-objects-google-datastore/
http://kovshenin.com/archives/app-engine-python-objects-in-the-google-datastore/
Also note that the Google Datastore is a non-relational database, so you might have other trouble refactoring your code to switch to that.
Cheers and good luck!