I want to know when the merge() method on AggregateFunction gets called. From what I've understood from the answers here and here, it is applicable to session windows only and occurs on every event that can be merged with the previous window, since every event for a session window creates a new window. I'm using PyFlink and would appreciate any help by providing an example.
Let's take an example that I put together from the documentation for the AverageAggregate function and some custom code:
from typing import Tuple

# imports (assumed for a recent PyFlink release)
from pyflink.common.time import Time
from pyflink.common.typeinfo import Types
from pyflink.common.watermark_strategy import TimestampAssigner, WatermarkStrategy
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.functions import AggregateFunction
from pyflink.datastream.window import EventTimeSessionWindows

class MyTimestampAssigner(TimestampAssigner):
    def extract_timestamp(self, value, record_timestamp) -> int:
        return int(value[1])

class AverageAggregate(AggregateFunction):
    def create_accumulator(self) -> Tuple[int, int]:
        return 0, 0
    def add(self, value: Tuple[str, int], accumulator: Tuple[int, int]) -> Tuple[int, int]:
        return accumulator[0] + value[1], accumulator[1] + 1
    def get_result(self, accumulator: Tuple[int, int]) -> float:
        return accumulator[0] / accumulator[1]
    def merge(self, a: Tuple[int, int], b: Tuple[int, int]) -> Tuple[int, int]:
        return a[0] + b[0], a[1] + b[1]
if __name__ == '__main__':
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)

    # define the source
    data_stream = env.from_collection([
        ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 8), ('hi', 9), ('hi', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

    # define the watermark strategy
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(MyTimestampAssigner())

    ds = (
        data_stream
        .assign_timestamps_and_watermarks(watermark_strategy)
        .key_by(lambda x: x[0], key_type=Types.STRING())
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(3)))
        .aggregate(AverageAggregate())
    )

    # print the results
    ds.print()

    # submit for execution
    env.execute()
From my understanding, the merge() method should have run on the second event ('hi', 2), since that is within the window gap of 3 ms, and then again for the input ('hi', 4), and so on. But while executing the code, the merge() method doesn't fire even once. So it would be greatly appreciated if anyone could modify the sample code above to show merge() being executed and explain how it works.
While it's not a direct PyFlink example, you can have a look at the DataStream API recipe at https://docs.immerok.cloud/docs/how-to-guides/development/using-session-windows/#merging-data-in-one-session-window for info on the merge() method.
Disclaimer: I work for Immerok
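If you want to see for yourself whether merge() is invoked, one option is to instrument the callbacks. This is only a sketch: LoggingAverageAggregate is a made-up name, and it assumes that print output from Python UDFs ends up in the TaskManager/console logs of your setup.
# Sketch only: a variant of the question's AverageAggregate that logs each
# callback, so any invocation of merge() becomes visible in the logs.
class LoggingAverageAggregate(AverageAggregate):
    def add(self, value, accumulator):
        print('add() called with', value)
        return super().add(value, accumulator)

    def merge(self, a, b):
        print('merge() called with accumulators', a, b)
        return super().merge(a, b)
Passing LoggingAverageAggregate() to .aggregate() instead of AverageAggregate() should then show which callbacks actually run for the session windows.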
Tensorflow: convert PrefetchDataset to BatchDataset
With the latest TensorFlow version 2.3.1, I am trying to follow the basic text classification example at: https://www.tensorflow.org/tutorials/keras/text_classification. Instead of creating the dataset from a directory as the example does, I am using a CSV file:
SELECT_COLUMNS = ['SentimentText','Sentiment']
LABEL_COLUMN = 'Sentiment'
LABELS = [0, 1]
def get_dataset(file_path, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=3,  # Artificially small to make examples easier to show.
        label_name=LABEL_COLUMN,
        na_value="?",
        num_epochs=1,
        ignore_errors=True,
        **kwargs)
    return dataset
all_data = get_dataset(data_path, select_columns=SELECT_COLUMNS)
As a result I get:
type(all_data)
tensorflow.python.data.ops.dataset_ops.PrefetchDataset
The example loads data from a directory with:
batch_size = 32
seed = 42
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)
And gets a dataset of another type:
type(raw_train_ds)
tensorflow.python.data.ops.dataset_ops.BatchDataset
Now, when I try to standardise and vectorise the data with the functions from the example:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html,
                                    '[%s]' % re.escape(string.punctuation),
                                    '')
max_features = 10000
sequence_length = 250
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)
and apply them to my dataset, I get an error:
# Make a text-only dataset (without labels), then call adapt
train_text = all_data.map(lambda x, y: x)
vectorize_layer.adapt(train_text)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-20-1f1fc445912d> in <module>
1 # Make a text-only dataset (without labels), then call adapt
2 train_text = all_data.map(lambda x, y: x)
----> 3 vectorize_layer.adapt(train_text)
/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/layers/preprocessing/text_vectorization.py in adapt(self, data, reset_state)
378 shape = dataset_ops.get_legacy_output_shapes(data)
379 if not isinstance(shape, tensor_shape.TensorShape):
--> 380 raise ValueError("The dataset passed to 'adapt' must contain a single "
381 "tensor value.")
382 if shape.rank == 0:
ValueError: The dataset passed to 'adapt' must contain a single tensor value.
How can I convert a PrefetchDataset to a BatchDataset?
You could use the tf.stack method to pack the features into a single array. The function below is from the Custom training: walkthrough in TensorFlow.
def pack_features_vector(features, labels):
    features = tf.stack(list(features.values()), axis=1)
    return features, labels
all_data = get_dataset(data_path, select_columns=SELECT_COLUMNS)
train_dataset = all_data.map(pack_features_vector)
train_text = train_dataset.map(lambda x, y: x)
vectorize_layer.adapt(train_text)
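Since SELECT_COLUMNS only contains one text column besides the label, an alternative sketch (assuming the column is named 'SentimentText', as in the question) is to map straight to that column instead of stacking:
# Alternative sketch: skip the stacking step and pull out the single text column
# returned by make_csv_dataset; this assumes 'SentimentText' is the only feature.
train_text = all_data.map(lambda features, label: features['SentimentText'])
vectorize_layer.adapt(train_text)
That yields a dataset of plain string tensors, which is the single-tensor shape adapt expects.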
This example is a little contrived. The goal is to create a macro that loops over some values and programmatically generates some code.
A common pattern in Python is to initialize the properties of an object at instantiation time, as follows:
(defclass hair [foo bar]
  (defn __init__ [self]
    (setv self.foo foo)
    (setv self.bar bar)))
This correctly translates with hy2py to
class hair(foo, bar):

    def __init__(self):
        self.foo = foo
        self.bar = bar
        return None
I know there are Python approaches to this problem including attr.ib and dataclasses. But as a simplified learning exercise I wanted to approach this with a macro.
This is my non-working example:
(defmacro self-set [&rest args]
  (for [[name val] args]
    `(setv (. self (read-str ~name)) ~val)))

(defn fur [foo bar]
  (defn __init__ [self]
    (self-set [["foo" foo] ["bar" bar]])))
But this doesn't expand to the original pattern. hy2py shows:
from hy.core.language import name
from hy import HyExpression, HySymbol
import hy
def _hy_anon_var_1(hyx_XampersandXname, *args):
    for [name, val] in args:
        HyExpression([] + [HySymbol('setv')] + [HyExpression([] + [HySymbol('.')] + [HySymbol('self')] + [HyExpression([] + [HySymbol('read-str')] + [name])])] + [val])


hy.macros.macro('self-set')(_hy_anon_var_1)


def fur(foo, bar):

    def __init__(self, foo, bar):
        return None
What am I doing wrong?
for forms always return None. So, your loop is constructing the (setv ...) forms you request and then throwing them away. Instead, try lfor, which returns a list of results, or gfor, which returns a generator. Note also in the below example that I use do to group the generated forms together, and I've moved a ~ so that the read-str happens at compile-time, as it must in order for . to work.
(defmacro self-set [&rest args]
  `(do ~@(gfor
           [name val] args
           `(setv (. self ~(read-str name)) ~val))))

(defclass hair []
  (defn __init__ [self]
    (self-set ["foo" 1] ["bar" 2])))
(setv h (hair))
(print h.bar) ; 2
I have used the example described here (http://openmdao.readthedocs.org/en/1.5.0/usr-guide/tutorials/doe-drivers.html?highlight=driver) to show my problem. I want to use the same approach for one component where "params" are arrays and no longer floats. See the example below:
from openmdao.api import IndepVarComp, Group, Problem, ScipyOptimizer, ExecComp, DumpRecorder, Component
from openmdao.drivers.latinhypercube_driver import LatinHypercubeDriver, OptimizedLatinHypercubeDriver
import numpy as np
class Paraboloid(Component):
    """ Evaluates the equation f(x,y) = (x-3)^2 + xy + (y+4)^2 - 3 """

    def __init__(self):
        super(Paraboloid, self).__init__()
        self.add_param('x', val=0.0)
        self.add_param('y', val=0.0)
        self.add_output('f_xy', val=0.0)

    def solve_nonlinear(self, params, unknowns, resids):
        """f(x,y) = (x-3)^2 + xy + (y+4)^2 - 3
        """
        x = params['x']
        y = params['y']
        unknowns['f_xy'] = (x-3.0)**2 + x*y + (y+4.0)**2 - 3.0

    def linearize(self, params, unknowns, resids):
        #""" Jacobian for our paraboloid."""
        x = params['x']
        y = params['y']
        J = {}
        J['f_xy', 'x'] = 2.0*x - 6.0 + y
        J['f_xy', 'y'] = 2.0*y + 8.0 + x
        return J
class ParaboloidArray(Component):
    """ Evaluates the equation f(x,y) = (x-3)^2 + xy + (y+4)^2 - 3 """

    def __init__(self):
        super(ParaboloidArray, self).__init__()
        self.add_param('X', val=np.array([0., 0.]))
        self.add_output('f_xy', val=0.0)

    def solve_nonlinear(self, params, unknowns, resids):
        """f(x,y) = (x-3)^2 + xy + (y+4)^2 - 3
        """
        x = params['X'][0]
        y = params['X'][1]
        unknowns['f_xy'] = (x-3.0)**2 + x*y + (y+4.0)**2 - 3.0
top = Problem()
root = top.root = Group()
root.add('p1', IndepVarComp('x', 50.0), promotes=['*'])
root.add('p2', IndepVarComp('y', 50.0), promotes=['*'])
root.add('comp', Paraboloid(), promotes=['*'])
top.driver = OptimizedLatinHypercubeDriver(num_samples=4, seed=0, population=20, generations=4, norm_method=2)
top.driver.add_desvar('x', lower=-50.0, upper=50.0)
top.driver.add_desvar('y', lower=-50.0, upper=50.0)
top.driver.add_objective('f_xy')
top.setup()
top.run()
top.cleanup()
###########################
print("case float ok")
top = Problem()
root = top.root = Group()
root.add('p1', IndepVarComp('X', np.array([50., 50.])), promotes=['*'])
root.add('comp', ParaboloidArray(), promotes=['*'])
top.driver = OptimizedLatinHypercubeDriver(num_samples=4, seed=0, population=20, generations=4, norm_method=2)
top.driver.add_desvar('X', lower=np.array([-50., -50.]), upper=np.array([50., 50.]))
top.driver.add_objective('f_xy')
top.setup()
top.run()
top.cleanup()
I obtain the following error:
Traceback (most recent call last):
File "C:\Program Files (x86)\Wing IDE 101 5.0\src\debug\tserver\_sandbox.py", line 102, in <module>
File "D:\tlefeb\Anaconda2\Lib\site-packages\openmdao\core\problem.py", line 1038, in run
self.driver.run(self)
File "D:\tlefeb\Anaconda2\Lib\site-packages\openmdao\drivers\predeterminedruns_driver.py", line 108, in run
for run in runlist:
File "D:\tlefeb\Anaconda2\Lib\site-packages\openmdao\drivers\latinhypercube_driver.py", line 57, in _build_runlist
design_var_buckets = self._get_buckets(bounds['lower'], bounds['upper'])
File "D:\tlefeb\Anaconda2\Lib\site-packages\openmdao\drivers\latinhypercube_driver.py", line 101, in _get_buckets
bucket_walls = np.linspace(low, high, self.num_samples + 1)
File "D:\tlefeb\Anaconda2\Lib\site-packages\numpy\core\function_base.py", line 102, in linspace
if step == 0:
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Did I misunderstand something in my way of coding?
I get a different error than you using the latest OpenMDAO master, but I get an error nonetheless. There isn't anything wrong with your model; rather, there are some bugs with using array variables for DOEs. I've added a bug-fix story to the OpenMDAO backlog, which we'll hopefully be able to deal with in the next couple of weeks. We'd gladly accept a pull request if you develop a fix before we get to it, though.
Is it possible to create an Array from another Array?
Lang: Ruby on Rails
Case
Workers are entitled to fill in their own work hours. Sometimes they forget to do it. This is what I want to tackle. In the end, I want an Array with the time codes of the periods for which the worker forgot to register his hours.
timecodes = [201201, 201202, 201203, 201204, 201205, 201206, 201207, 201208, 201209, 201210, 201211, 201212, 201213, 201301, 201302, 201304, 201305, 201306, ...]
The worker works with us from 201203 to 201209.
timecards = [201203, 201204, 201205, 201207, 201208, 201209]
As you see, he forgot to register 201206.
What I want to do
# Create Array from timecode on start to timecode on end
worked_with_us = [201203, 201204, 201205, 201206, 201207, 201208, 201209]
#=> This is the actual problem, how can I automate this?
forgot_to_register = worked_with_us.?????(timecards)
forgot_to_register = worked_with_us - timecards # Thanks Zwippie
#=> [201206]
Now I know which period the worker forgot to register his hours.
All together
How can I create an Array from another Array, giving a start and end value?
You can just subtract arrays with - (minus):
[1, 2, 3] - [1, 3] = [2]
Building an array with years/months can be done with a Range, but this only works if you build an array for each year, something like:
months = (2012..2013).map do |year|
  ("#{year}01".."#{year}12").to_a.collect(&:to_i)
end.flatten
=> [201201, 201202, 201203, 201204, 201205, 201206, 201207, 201208, 201209, 201210, 201211, 201212, 201301, 201302, 201303, 201304, 201305, 201306, 201307, 201308, 201309, 201310, 201311, 201312]
And for the function to create those ranges dynamically:
def month_array(year_from, year_to, month_from=1, month_to=12)
  (year_from..year_to).map do |year|
    # Correct from/to months
    mf = year_from == year ? month_from : 1
    mt = year_to == year ? month_to : 12
    (mf..mt).map do |month|
      ("%d%02d" % [year, month]).to_i
    end
  end.flatten
end
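For example, month_array(2012, 2012, 3, 9) should return [201203, 201204, 201205, 201206, 201207, 201208, 201209] for the worker's period from the question, and worked_with_us - timecards then leaves [201206].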
Update: You wanted other input parameters for this method, but I hope you can work that out yourself. :)
In a ModelForm I can write a clean_<field_name> member function to automatically validate and clean up data entered by a user, but what can I do about dirty JSON or CSV files (fixtures) during a manage.py loaddata?
Fixtures loaded with loaddata are assumed to contain clean data that doesn't need validation (usually as an inverse operation to a prior dumpdata), so the short answer is that loaddata isn't the approach you want if you need to clean your inputs.
However, you can probably use some of the underpinnings of loaddata while implementing your custom data cleaning code. I'm sure you can easily script something using the Django serialization libs to read your existing data files in and then save the resulting objects normally after the data has been cleaned up.
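For example, a rough sketch of that idea could look like the following; the fixture path and the clean_zip_code helper are assumptions for illustration, not part of loaddata:
# Sketch: read a fixture with Django's serialization framework, clean each
# object, and save it through the ORM instead of running loaddata.
from django.core import serializers

with open('fixtures/Entity.json') as f:              # path is an assumption
    for deserialized in serializers.deserialize('json', f):
        obj = deserialized.object
        obj.zip_code = clean_zip_code(obj.zip_code)   # hypothetical cleaner
        deserialized.save()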
In case others want to do something similar, I defined a model method to do the cleaning (so it can be called from ModelForms):
MAX_ZIPCODE_DIGITS = 9
MIN_ZIPCODE_DIGITS = 5
def clean_zip_code(self, s=None):
    #s = str(s or self.zip_code)
    if not s: return None
    s = re.sub("\D", "", s)
    if len(s) > self.MAX_ZIPCODE_DIGITS:
        s = s[:self.MAX_ZIPCODE_DIGITS]
    if len(s) in (self.MIN_ZIPCODE_DIGITS-1, self.MAX_ZIPCODE_DIGITS-1):
        s = '0' + s  # FIXME: deal with other intermediate lengths
    if len(s) >= self.MAX_ZIPCODE_DIGITS:
        s = s[:self.MIN_ZIPCODE_DIGITS] + '-' + s[self.MIN_ZIPCODE_DIGITS:]
    return s
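For instance (based on tracing the method above), a raw value like '021391234' would come back as '02139-1234', a 4-digit value like '2139' would be left-padded to '02139', and an empty value returns None.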
Then I wrote a standalone Python script to clean up my legacy JSON files using any clean_ methods found among the models.
import os, json

def clean_json(app='XYZapp', model='Entity', fields='zip_code', cleaner_prefix='clean_'):
    # Set the DJANGO_SETTINGS_MODULE environment variable.
    os.environ['DJANGO_SETTINGS_MODULE'] = app + ".settings"
    settings = __import__(app + '.settings').settings
    models = __import__(app + '.models').models
    fpath = os.path.join(settings.SITE_PROJECT_PATH, 'fixtures', model + '.json')
    if isinstance(fields, (str, unicode)):
        fields = [fields]
    Ns = []
    for field in fields:
        try:
            instance = getattr(models, model)()
        except AttributeError:
            print 'No model named %s could be found' % (model,)
            continue
        try:
            cleaner = getattr(instance, cleaner_prefix + field)
        except AttributeError:
            print 'No cleaner method named %s.%s could be found' % (model, cleaner_prefix + field)
            continue
        print 'Cleaning %s using %s.%s...' % (fpath, model, cleaner.__name__)
        fin = open(fpath, 'r')
        if fin:
            l = json.load(fin)
            before = len(l)
            cleans = 0
            for i in range(len(l)):
                if 'fields' in l[i] and field in l[i]['fields']:
                    l[i]['fields'][field] = cleaner(l[i]['fields'][field])  # cleaner returns None to delete records
                    cleans += 1
            fin.close()
            after = len(l)
            assert after > .5 * before
            Ns += [(before, after, cleans)]
            print 'Writing %d/%d (new/old) records after %d cleanups...' % Ns[-1]
            with open(fpath, 'w') as fout:
                fout.write(json.dumps(l, indent=2, sort_keys=True))
    return Ns

if __name__ == '__main__':
    clean_json()