Flink ValueState "Error while adding data to RocksDB" - apache-flink

When I try to update the value state (queueState.update(queue)), I get this exception:
org.apache.flink.util.FlinkRuntimeException: Error while adding data to RocksDB
at org.apache.flink.contrib.streaming.state.RocksDBValueState.update(RocksDBValueState.java:108)
at xxx.xxx.xxx.CleanTimedOutPartialMatches.processElement(CleanTimedOutPartialMatches.java:37)
at xxx.xxx.xxx.CleanTimedOutPartialMatches.processElement(CleanTimedOutPartialMatches.java:22)
at org.apache.flink.streaming.api.operators.KeyedProcessOperator.processElement(KeyedProcessOperator.java:85)
at org.apache.flink.streaming.runtime.io.StreamOneInputProcessor.processElement(StreamOneInputProcessor.java:164)
at org.apache.flink.streaming.runtime.io.StreamOneInputProcessor.processInput(StreamOneInputProcessor.java:143)
at org.apache.flink.streaming.runtime.tasks.StreamTask.performDefaultAction(StreamTask.java:276)
at org.apache.flink.streaming.runtime.tasks.StreamTask.run(StreamTask.java:298)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:403)
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:705)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:530)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalStateException: The Kryo Output still contains data from a previous serialize call. It has to be flushed or cleared at the end of the serialize call.
at org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer.serialize(KryoSerializer.java:300)
at org.apache.flink.contrib.streaming.state.AbstractRocksDBState.serializeValueInternal(AbstractRocksDBState.java:158)
at org.apache.flink.contrib.streaming.state.AbstractRocksDBState.serializeValue(AbstractRocksDBState.java:178)
at org.apache.flink.contrib.streaming.state.AbstractRocksDBState.serializeValue(AbstractRocksDBState.java:167)
at org.apache.flink.contrib.streaming.state.RocksDBValueState.update(RocksDBValueState.java:106)
... 11 more
2019-10-13 11:06:29,311 WARN org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor - Timestamp monotony violated: 1570948458514 < 1570948663062
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.JsonNode;
import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode;
import org.apache.flink.streaming.api.TimerService;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Comparator;
import java.util.PriorityQueue;
public class CleanTimedOutPartialMatches extends KeyedProcessFunction<String, ObjectNode, ObjectNode> {
private static Logger LOGGER = LoggerFactory.getLogger(CleanTimedOutPartialMatches.class);
private ValueState<PriorityQueue<JsonNode>> queueState = null;
@Override
public void processElement(ObjectNode log, Context context, Collector<ObjectNode> collector) throws Exception {
try {
if (context.timestamp() > context.timerService().currentWatermark()) {
PriorityQueue<JsonNode> queue = queueState.value();
if (queue == null) {
queue = new PriorityQueue<JsonNode>(Comparator.comparingLong(o -> o.get(TS).longValue()));
}
queue.add(log);
queueState.update(queue);
context.timerService().registerEventTimeTimer(log.get(TS).longValue());
}
} catch (Exception e){
e.printStackTrace();
}
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<ObjectNode> out) throws Exception {
try {
sendToSink(queueState.value(), ctx, out);
} catch (Exception e){
for(StackTraceElement el : e.getStackTrace()){
LOGGER.info("{}.{}:{}", el.getClassName(), el.getMethodName(), el.getLineNumber());
}
}
}
private void sendToSink(PriorityQueue<JsonNode> queue, OnTimerContext context, Collector<ObjectNode> out){
long watermark = context.timerService().currentWatermark();
JsonNode lastSentLog = null;
JsonNode log = queue.peek();
while (log != null && log.get(TS).longValue() <= watermark) {
if(lastSentLog != null && extractLogEndpoint(log).equals(extractLogEndpoint(lastSentLog)) && log.get(TS).longValue() == lastSentLog.get(TS).longValue()){
LOGGER.info("duplicated log removed");
} else {
if(lastSentLog != null){
long gapTime = Math.abs(log.get(TS).longValue() - lastSentLog.get(TS).longValue()) / 1000;
boolean isSameAttempt = (extractLogEndpoint(lastSentLog).equals(AUTOCOMPLETE) && extractLogEndpoint(log).equals(LOG))
|| (extractLogEndpoint(log).equals(extractLogEndpoint(lastSentLog)) && gapTime < MAX_TIME_GAP);
if(isSameAttempt){
((ObjectNode)log).put(ATTEMPT_ID, lastSentLog.get(ATTEMPT_ID).textValue());
}
}
lastSentLog = log;
out.collect((ObjectNode)log);
}
queue.remove(log);
log = queue.peek();
}
}
@Override
public void open(Configuration parameters) throws Exception {
ValueStateDescriptor<PriorityQueue<JsonNode>> descriptor = new ValueStateDescriptor<>(
// state name
"sort-partial-matches",
// type information of state
TypeInformation.of(new TypeHint<PriorityQueue<JsonNode>>() {
}));
queueState = getRuntimeContext().getState(descriptor);
}
}

One problem: it looks like you forgot to call queueState.update(queue) after you are done removing things from the queue.
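For example, a sketch of the corrected onTimer, based on the code above (the try/catch is omitted for brevity):
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<ObjectNode> out) throws Exception {
    PriorityQueue<JsonNode> queue = queueState.value();
    sendToSink(queue, ctx, out);
    // persist the drained queue, otherwise the removed elements reappear on the next timer
    queueState.update(queue);
}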
Even if you do get this working, sorting based on a PriorityQueue with RocksDB as the state backend is going to perform very poorly, as it has to go through ser/de of the entire queue on every access and update. Unless you are using one of the heap-based state backends, it's recommended to use MapState for sorting, because MapState only has to do ser/de on individual entries rather than the entire map. You can use the timestamps as keys for the MapState and a List of objects as the values, and use timers just as you are doing now to trigger flushing out the contents of the List (see the sketch below).
Or you could use SQL to do the sorting -- see the answer to this question for an example.
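If you go the MapState route, a rough sketch could look like the following. This is only an illustration, not the original code: it reuses the TS field and the ObjectNode/JsonNode types from the question, needs MapState, MapStateDescriptor, List, and ArrayList imports, and omits the de-duplication and attempt-id logic from sendToSink.
// Sketch of MapState-based buffering: one ser/de per entry instead of the whole queue.
private MapState<Long, List<JsonNode>> logsByTimestamp;

@Override
public void open(Configuration parameters) {
    logsByTimestamp = getRuntimeContext().getMapState(new MapStateDescriptor<>(
            "logs-by-timestamp",
            TypeInformation.of(Long.class),
            TypeInformation.of(new TypeHint<List<JsonNode>>() {})));
}

@Override
public void processElement(ObjectNode log, Context context, Collector<ObjectNode> collector) throws Exception {
    long ts = log.get(TS).longValue();
    if (ts > context.timerService().currentWatermark()) {
        List<JsonNode> bucket = logsByTimestamp.get(ts);
        if (bucket == null) {
            bucket = new ArrayList<>();
        }
        bucket.add(log);
        logsByTimestamp.put(ts, bucket);               // only this entry is (de)serialized
        context.timerService().registerEventTimeTimer(ts);
    }
}

@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<ObjectNode> out) throws Exception {
    // timers fire in timestamp order as the watermark advances, so output is sorted per key
    List<JsonNode> bucket = logsByTimestamp.get(timestamp);
    if (bucket != null) {
        for (JsonNode log : bucket) {
            out.collect((ObjectNode) log);
        }
        logsByTimestamp.remove(timestamp);
    }
}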

Related

Checkpointing is not working with DynamoDB Streams records in Flink

How do I checkpoint the processed records in Apache Flink? The same messages are being consumed at regular intervals.
Do I need to explicitly checkpoint each message after consumption?
I can see that the eventId and sequenceNumber match for multiple messages being consumed.
It seems the checkpointing is not being done, so the same messages are retrieved from the streams at regular intervals.
Here is the code
package com.flink.basics;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.kinesis.shaded.com.amazonaws.services.dynamodbv2.model.AttributeValue;
import org.apache.flink.kinesis.shaded.com.amazonaws.services.dynamodbv2.model.Record;
import org.apache.flink.kinesis.shaded.com.amazonaws.services.kinesis.clientlibrary.lib.worker.PreparedCheckpointer;
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.sink.DiscardingSink;
import org.apache.flink.streaming.connectors.kinesis.FlinkDynamoDBStreamsConsumer;
import org.apache.flink.streaming.connectors.kinesis.config.AWSConfigConstants;
import org.apache.flink.streaming.connectors.kinesis.config.ConsumerConfigConstants;
import org.apache.flink.streaming.connectors.kinesis.serialization.DynamoDBStreamsSchema;
import org.apache.flink.util.Collector;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.Properties;
public class DynamoDbConsumer {
public static void main(String[] args) throws Exception {
Properties consumerConfig = new Properties();
consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1");
consumerConfig.put(AWSConfigConstants.AWS_ACCESS_KEY_ID, "aws_access_key_id");
consumerConfig.put(AWSConfigConstants.AWS_SECRET_ACCESS_KEY, "aws_secret_access_key");
consumerConfig.put(AWSConfigConstants.AWS_ENDPOINT, "http://localhost:4566");
consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "LATEST");
System.setProperty("com.amazonaws.sdk.disableCbor", "true");
System.setProperty("org.apache.flink.kinesis.shaded.com.amazonaws.sdk.disableCbor", "true");
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(1000, CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
// File based Backend
env.setStateBackend(new FsStateBackend(Paths.get("/Users/polimea/flink-basics/stbackend").toUri(), false));
FlinkDynamoDBStreamsConsumer<Record> flinkConsumer = new FlinkDynamoDBStreamsConsumer<Record>(
Collections.singletonList("arn:aws:dynamodb:us-east-1:000000000000:table/FDXTable/stream/2022-05-24T00:18:12.500"),
new DynamoDBStreamsSchema(), consumerConfig);
DataStream<Record> kinesisDBStream = env.addSource(flinkConsumer);
KeyedStream<Record, String> snapshotKeyedStream = kinesisDBStream.keyBy((KeySelector<Record, String>)
record -> record.getDynamodb().getNewImage().get("SNP").getS());
SingleOutputStreamOperator<Tuple2<String, Record>> records = snapshotKeyedStream.process(new StatefulReduceFunc());
records.print();
records.addSink(new DiscardingSink<>());
snapshotKeyedStream.process(new KeyedProcessFunction<String, Record, Object>() {
@Override
public void processElement(Record record, KeyedProcessFunction<String, Record, Object>.Context context,
Collector<Object> collector) throws Exception {
}
});
// kinesisDBStream.print();
env.execute("Stream for buffering dynamodb records till snapshot is committed");
}
private static class StatefulReduceFunc extends KeyedProcessFunction<String, Record, Tuple2<String, Record>> {
private transient ListState<Record> records;
public void open(Configuration parameters) {
ListStateDescriptor<Record> listStateDescriptor =
new ListStateDescriptor<>("records", Record.class);
records = getRuntimeContext().getListState(listStateDescriptor);
}
@Override
public void processElement(Record record, Context context,
Collector<Tuple2<String, Record>> collector) throws Exception {
Iterable<Record> recordIterator = this.records.get();
AttributeValue snCommitted = record.getDynamodb().getNewImage().get("SNCommitted");
if (snCommitted != null && snCommitted.getBOOL()) {
for (Record recordInList : recordIterator) {
collector.collect(new Tuple2<>(record.getDynamodb().getNewImage().get("SNP").getS(), recordInList));
}
} else {
records.add(record);
}
}
}
}
Not sure if this is related to your issue, but the code you provided will buffer the records forever. I think what you want is to emit the records and clear the state once the commit message comes. Something along these lines:
// ...
if (snCommitted != null && snCommitted.getBOOL()) {
var snp = record.getDynamodb().getNewImage().get("SNP").getS();
for (Record recordInList : recordIterator) {
collector.collect(new Tuple2<>(snp, recordInList));
}
// explicitly clear the buffer not to emit same events over and over again
records.clear();
}
// ...

Flink Get the KeyedState State Value and use in Another Stream

I know that keyed state belongs to its key and that only the current key can access its state value; other keys cannot access a different key's state value.
I tried to access the state with the same key but from a different stream. Is that possible?
If it is not possible, will I end up with two duplicate copies of the data?
Note: I need two streams because each of them has a different time window and a different implementation.
Here is the example (I know that keyBy(something) is the same for both stream operations):
public class Sample{
streamA
.keyBy(something)
.timeWindow(Time.seconds(4))
.process(new CustomMyProcessFunction())
.name("CustomMyProcessFunction")
.print();
streamA
.keyBy(something)
.timeWindow(Time.seconds(1))
.process(new CustomMyAnotherProcessFunction())
.name("CustomMyProcessFunction")
.print();
}
public class CustomMyProcessFunction extends ProcessWindowFunction<..>
{
private Logger logger = LoggerFactory.getLogger(CustomMyProcessFunction.class);
private transient ValueState<SimpleEntity> simpleEntityValueState;
private SimpleEntity simpleEntity;
@Override
public void open(Configuration parameters) throws Exception
{
ValueStateDescriptor<SimpleEntity> simpleEntityValueStateDescriptor = new ValueStateDescriptor<SimpleEntity>(
"sample",
TypeInformation.of(SimpleEntity.class)
);
simpleEntityValueState = getRuntimeContext().getState(simpleEntityValueStateDescriptor);
}
@Override
public void process(...) throws Exception
{
SimpleEntity value = simpleEntityValueState.value();
if (value == null)
{
SimpleEntity newVal = new SimpleEntity("sample");
logger.info("New Value put");
simpleEntityValueState.update(newVal);
}
...
}
...
}
public class CustomMyAnotherProcessFunction extends ProcessWindowFunction<..>
{
private transient ValueState<SimpleEntity> simpleEntityValueState;
@Override
public void open(Configuration parameters) throws Exception
{
ValueStateDescriptor<SimpleEntity> simpleEntityValueStateDescriptor = new ValueStateDescriptor<SimpleEntity>(
"sample",
TypeInformation.of(SimpleEntity.class)
);
simpleEntityValueState = getRuntimeContext().getState(simpleEntityValueStateDescriptor);
}
@Override
public void process(...) throws Exception
{
SimpleEntity value = simpleEntityValueState.value();
if (value != null)
logger.info(value.toString()); // I expect that SimpleEntity("sample")
out.collect(...);
}
...
}
As has been pointed out already, state is always local to a single operator instance. It cannot be shared.
What you can do, however, is stream the state updates from the operator holding the state to other operators that need it. With side outputs you can create complex dataflows without needing to share state.
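For example, a minimal sketch of the side-output idea (hypothetical code, not the poster's classes; it assumes keyBy(something) is a key selector returning a String, and that the first window function publishes its updated SimpleEntity on a side output instead of expecting the second operator to read its state):
// Publish state updates from the first operator on a side output.
final OutputTag<SimpleEntity> stateUpdates = new OutputTag<SimpleEntity>("state-updates") {};

SingleOutputStreamOperator<SimpleEntity> first = streamA
        .keyBy(something)
        .timeWindow(Time.seconds(4))
        .process(new ProcessWindowFunction<SimpleEntity, SimpleEntity, String, TimeWindow>() {
            @Override
            public void process(String key, Context ctx, Iterable<SimpleEntity> elements,
                                Collector<SimpleEntity> out) {
                SimpleEntity updated = new SimpleEntity("sample");
                // ... update this operator's keyed ValueState as before ...
                ctx.output(stateUpdates, updated);  // publish the update for other operators
                out.collect(updated);
            }
        });

// The second pipeline gets the updates as a regular stream and can connect() it
// with streamA (e.g. in a CoProcessFunction) instead of reading foreign state.
DataStream<SimpleEntity> updates = first.getSideOutput(stateUpdates);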
I tried the idea of sharing state between two operators by using the same key and state name.
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.io.IOException;
public class FlinkReuseState {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(3);
DataStream<Integer> stream1 = env.addSource(new SourceFunction<Integer>() {
@Override
public void run(SourceContext<Integer> sourceContext) throws Exception {
int i = 0;
while (true) {
sourceContext.collect(1);
Thread.sleep(1000);
}
}
@Override
public void cancel() {
}
});
DataStream<Integer> stream2 = env.addSource(new SourceFunction<Integer>() {
@Override
public void run(SourceContext<Integer> sourceContext) throws Exception {
while (true) {
sourceContext.collect(1);
Thread.sleep(1000);
}
}
@Override
public void cancel() {
}
});
DataStream<Integer> windowedStream1 = stream1.keyBy(Integer::intValue)
.timeWindow(Time.seconds(3))
.process(new ProcessWindowFunction<Integer, Integer, Integer, TimeWindow>() {
private ValueState<Integer> value;
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
ValueStateDescriptor<Integer> desc = new ValueStateDescriptor<Integer>("value", Integer.class);
value = getRuntimeContext().getState(desc);
}
@Override
public void process(Integer integer, Context context, Iterable<Integer> iterable, Collector<Integer> collector) throws Exception {
iterable.forEach(x -> {
try {
if (value.value() == null) {
value.update(1);
} else {
value.update(value.value() + 1);
}
} catch (IOException e) {
e.printStackTrace();
}
});
collector.collect(value.value());
}
});
DataStream<String> windowedStream2 = stream2.keyBy(Integer::intValue)
.timeWindow(Time.seconds(3))
.process(new ProcessWindowFunction<Integer, String, Integer, TimeWindow>() {
private ValueState<Integer> value;
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
ValueStateDescriptor<Integer> desc = new ValueStateDescriptor<Integer>("value", Integer.class);
value = getRuntimeContext().getState(desc);
}
@Override
public void process(Integer s, Context context, Iterable<Integer> iterable, Collector<String> collector) throws Exception {
iterable.forEach(x -> {
try {
if (value.value() == null) {
value.update(1);
} else {
value.update(value.value() + 1);
}
} catch (IOException e) {
e.printStackTrace();
}
});
collector.collect(String.valueOf(value.value()));
}
});
windowedStream2.print();
windowedStream1.print();
env.execute();
}
}
It doesn't work; each stream only updates its own value state. The output is listed below.
3> 3
3> 3
3> 6
3> 6
3> 9
3> 9
3> 12
3> 12
3> 15
3> 15
3> 18
3> 18
3> 21
3> 21
3> 24
3> 24
keyed state
Based on the official docs: "Each keyed-state is logically bound to a unique composite of <parallel-operator-instance, key>, and since each key 'belongs' to exactly one parallel instance of a keyed operator, we can think of this simply as <operator, key>."
I think it is not possible to share state by giving the same name to states in different operators.
Have you tried a CoProcessFunction? That way you can implement a process function for each stream; the only remaining problem would be the time window. Can you provide more details about your processing logic?
Why can't you emit the state as part of a map operation, and then connect that resulting stream to the other stream?

Flink How to Write DataSet As Parquet files in S3?

How can I write a DataSet as Parquet files in an S3 bucket using Flink? Is there any direct function like Spark's DF.write.parquet("write in parquet")?
Please help me figure out how to write a Flink DataSet in Parquet format.
I am stuck when trying to convert my DataSet to (Void, GenericRecord):
DataSet<Tuple2<Void,GenericRecord>> df = allEvents.flatMap(new FlatMapFunction<Tuple2<LongWritable, Text>, Tuple2<Void, GenericRecord>>() {
@Override
public void flatMap(Tuple2<LongWritable, Text> longWritableTextTuple2, Collector<Tuple2<Void, GenericRecord>> collector) throws Exception {
JsonAvroConverter converter = new JsonAvroConverter();
Schema schema = new Schema.Parser().parse(new File("test.avsc"));
try {
GenericRecord record = converter.convertToGenericDataRecord(longWritableTextTuple2.f1.toString().getBytes(), schema);
collector.collect( new Tuple2<Void,GenericRecord>(null,record));
}
catch (Exception e) {
System.out.println("error in converting to avro");
}
}
});
Job job = Job.getInstance();
HadoopOutputFormat parquetFormat = new HadoopOutputFormat<Void, GenericRecord>(new AvroParquetOutputFormat(), job);
FileOutputFormat.setOutputPath(job, new Path(outputPath));
df.output(parquetFormat);
env.execute();
Please help me with what I am doing wrong; I am getting an exception and this code is not working.
It's a little more complicated in Flink than it is with Spark. The only way I was able to read and write Parquet data in Flink is through the Hadoop & MapReduce compatibility layer. You need hadoop-mapreduce-client-core and flink-hadoop-compatibility in your dependencies.
Then you need to create a proper HadoopOutputFormat. You need to do something like this:
val job = Job.getInstance()
val hadoopOutFormat = new hadoop.mapreduce.HadoopOutputFormat[Void, SomeType](new AvroParquetOutputFormat(), job)
FileOutputFormat.setOutputPath(job, [somePath])
And then you can do:
dataStream.writeUsingOutputFormat(hadoopOutFormat)
You didn't say which exception you are getting, but here is a complete example of how to achieve this.
The main points are:
Use org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat
From dependency org.apache.flink:flink-hadoop-compatibility_2.11:1.11.0
HadoopOutputFormat is an adapter that allows you to use output formats developed for Hadoop
You need a DataSet<Tuple2<Void,IndexedRecord>>, because Hadoop's OutputFormat<K,V> works with key-value pairs; we are not interested in the key, so we use Void for the key type, and the value needs to be Avro's IndexedRecord or GenericRecord.
Use org.apache.parquet.avro.AvroParquetOutputFormat<IndexedRecord>
From dependency org.apache.parquet:parquet-avro:1.11.1
This Hadoop OutputFormat produces Parquet
This inherits from org.apache.parquet.hadoop.FileOutputFormat<Void, IndexedRecord>
Create your own subclass of IndexedRecord
You can't use new GenericData.Record(schema) because such a record is not serializable (java.io.NotSerializableException: org.apache.avro.Schema$Field is not serializable) and Flink requires it to be serializable.
You still need to provide a getSchema() method, but you can either return null or return a Schema that you hold in a static member (so that it doesn't need to be serialized, and you avoid the NotSerializableException above)
The source code
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.parquet.avro.AvroParquetOutputFormat;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
public class MyParquetTest implements Serializable {
public static void main(String[] args) throws Exception {
new MyParquetTest().start();
}
private void start() throws Exception {
final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
Configuration parameters = new Configuration();
Stream<String> stringStream = IntStream.range(1, 100).mapToObj(n -> String.format("Entry %d", n));
DataSet<String> text = env.fromCollection(stringStream.collect(Collectors.toCollection(ArrayList::new)));
Job job = Job.getInstance();
HadoopOutputFormat<Void, IndexedRecord> hadoopOutputFormat = new HadoopOutputFormat<>(new AvroParquetOutputFormat<IndexedRecord>(), job);
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, CompressionCodecName.SNAPPY.getHadoopCompressionCodecClass());
FileOutputFormat.setOutputPath(job, new org.apache.hadoop.fs.Path("./my-parquet"));
final Schema schema = new Schema.Parser().parse(MyRecord.class.getClassLoader().getResourceAsStream("schema.avsc"));
AvroParquetOutputFormat.setSchema(job, schema);
DataSet<Tuple2<Void, IndexedRecord>> text2 = text.map(new MapFunction<String, Tuple2<Void, IndexedRecord>>() {
@Override
public Tuple2<Void, IndexedRecord> map(String value) throws Exception {
return Tuple2.of(null, new MyRecord(value));
// IndexedRecord record = new GenericData.Record(schema); // won't work because Schema$Field is not serializable
// record.put(0, value);
// return Tuple2.of(null, record);
}
});
text2.output(hadoopOutputFormat);
env.execute("Flink Batch Java API Skeleton");
}
public static class MyRecord implements IndexedRecord {
private static Schema schema;
static {
try {
schema = new Schema.Parser().parse(MyRecord.class.getClassLoader().getResourceAsStream("schema.avsc"));
} catch (IOException e) {
e.printStackTrace();
}
}
private final String value;
public MyRecord(String value) {
this.value= value;
}
@Override
public void put(int i, Object v) {
throw new NotImplementedException("You can't update this IndexedRecord");
}
@Override
public Object get(int i) {
return this.value;
}
@Override
public Schema getSchema() {
return schema; // or just return null and remove the schema member
}
}
}
The schema.avsc is simply
{
"name": "aa",
"type": "record",
"fields": [
{"name": "value", "type": "string"}
]
}
and the dependencies:
implementation "org.apache.flink:flink-java:${flinkVersion}"
implementation "org.apache.flink:flink-avro:${flinkVersion}"
implementation "org.apache.flink:flink-streaming-java_${scalaBinaryVersion}:${flinkVersion}"
implementation "org.apache.flink:flink-hadoop-compatibility_${scalaBinaryVersion}:${flinkVersion}"
implementation "org.apache.parquet:parquet-avro:1.11.1"
implementation "org.apache.hadoop:hadoop-client:2.8.3"
You'll create a Flink OutputFormat via new HadoopOutputFormat(parquetOutputFormat, job), and then pass that to DataSet.output(xxx).
The job comes from...
import org.apache.hadoop.mapreduce.Job;
...
Job job = Job.getInstance();
The parquetOutputFormat is created via:
import org.apache.parquet.hadoop.ParquetOutputFormat;
...
ParquetOutputFormat<MyOutputType> parquetOutputFormat = new ParquetOutputFormat<>();
See https://javadoc.io/doc/org.apache.parquet/parquet-hadoop/1.10.1/org/apache/parquet/hadoop/ParquetOutputFormat.html
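Putting those pieces together might look roughly like this. This is only a sketch: MyOutputType, outputPath, env, and the DataSet contents are placeholders, and ParquetOutputFormat still needs a write support/schema configured on the job (e.g. via AvroParquetOutputFormat, as in the answer above).
Job job = Job.getInstance();
ParquetOutputFormat<MyOutputType> parquetOutputFormat = new ParquetOutputFormat<>();
HadoopOutputFormat<Void, MyOutputType> flinkOutputFormat =
        new HadoopOutputFormat<>(parquetOutputFormat, job);
FileOutputFormat.setOutputPath(job, new org.apache.hadoop.fs.Path(outputPath));

// The DataSet must hold Tuple2<Void, MyOutputType> elements, matching the Hadoop key/value types.
DataSet<Tuple2<Void, MyOutputType>> records = ...;
records.output(flinkOutputFormat);
env.execute();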

How to sort the union datastream of flink without watermark

My Flink job has multiple data streams, which I merge with the org.apache.flink.streaming.api.datastream.DataStream#union method.
The problem is that the resulting data stream is out of order, and I cannot set a window to sort the data in the stream.
Sorting union of streams to identify user sessions in Apache Flink
I found that answer, but com.liam.learn.flink.example.union.UnionStreamDemo.SortFunction#onTimer is never invoked.
Environment info: Flink version 1.7.0.
In general, I want to sort the union data stream without watermarks.
You need watermarks so that the sorting function knows when it can safely emit sorted elements. Without watermarks, you could get a record from stream B that has an earlier date than any of the first N records of stream A, right?
But adding watermarks is easy, especially if you know that "event time" is strictly increasing for any one stream. Below is some code I wrote that extends what David Anderson posted in his answer to the other SO issue you referenced above - hopefully this will get you started.
-- Ken
package com.scaleunlimited.flinksnippets;
import java.util.PriorityQueue;
import java.util.Random;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.TimerService;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
import org.apache.flink.util.Collector;
import org.junit.Test;
public class MergeAndSortStreamsTest {
@Test
public void testMergeAndSort() throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(2);
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
DataStream<Event> streamA = env.addSource(new EventSource("A"))
.assignTimestampsAndWatermarks(new EventTSWAssigner());
DataStream<Event> streamB = env.addSource(new EventSource("B"))
.assignTimestampsAndWatermarks(new EventTSWAssigner());
streamA.union(streamB)
.keyBy(r -> r.getKey())
.process(new SortByTimestampFunction())
.print();
env.execute();
}
private static class Event implements Comparable<Event> {
private String _label;
private long _timestamp;
public Event(String label, long timestamp) {
_label = label;
_timestamp = timestamp;
}
public String getLabel() {
return _label;
}
public void setLabel(String label) {
_label = label;
}
public String getKey() {
return "1";
}
public long getTimestamp() {
return _timestamp;
}
public void setTimestamp(long timestamp) {
_timestamp = timestamp;
}
@Override
public String toString() {
return String.format("%s # %d", _label, _timestamp);
}
@Override
public int compareTo(Event o) {
return Long.compare(_timestamp, o._timestamp);
}
}
@SuppressWarnings("serial")
private static class EventTSWAssigner extends AscendingTimestampExtractor<Event> {
@Override
public long extractAscendingTimestamp(Event element) {
return element.getTimestamp();
}
}
@SuppressWarnings("serial")
private static class SortByTimestampFunction extends KeyedProcessFunction<String, Event, Event> {
private ValueState<PriorityQueue<Event>> queueState = null;
@Override
public void open(Configuration config) {
ValueStateDescriptor<PriorityQueue<Event>> descriptor = new ValueStateDescriptor<>(
// state name
"sorted-events",
// type information of state
TypeInformation.of(new TypeHint<PriorityQueue<Event>>() {
}));
queueState = getRuntimeContext().getState(descriptor);
}
@Override
public void processElement(Event event, Context context, Collector<Event> out) throws Exception {
TimerService timerService = context.timerService();
long currentWatermark = timerService.currentWatermark();
System.out.format("processElement called with watermark %d\n", currentWatermark);
if (context.timestamp() > currentWatermark) {
PriorityQueue<Event> queue = queueState.value();
if (queue == null) {
queue = new PriorityQueue<>(10);
}
queue.add(event);
queueState.update(queue);
timerService.registerEventTimeTimer(event.getTimestamp());
}
}
@Override
public void onTimer(long timestamp, OnTimerContext context, Collector<Event> out) throws Exception {
PriorityQueue<Event> queue = queueState.value();
long watermark = context.timerService().currentWatermark();
System.out.format("onTimer called with watermark %d\n", watermark);
Event head = queue.peek();
while (head != null && head.getTimestamp() <= watermark) {
out.collect(head);
queue.remove(head);
head = queue.peek();
}
}
}
@SuppressWarnings("serial")
private static class EventSource extends RichParallelSourceFunction<Event> {
private String _prefix;
private transient Random _rand;
private transient boolean _running;
private transient int _numEvents;
public EventSource(String prefix) {
_prefix = prefix;
}
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
_rand = new Random(_prefix.hashCode() + getRuntimeContext().getIndexOfThisSubtask());
}
@Override
public void cancel() {
_running = false;
}
@Override
public void run(SourceContext<Event> context) throws Exception {
_running = true;
_numEvents = 0;
long timestamp = System.currentTimeMillis() + _rand.nextInt(10);
while (_running && (_numEvents < 100)) {
long deltaTime = timestamp - System.currentTimeMillis();
if (deltaTime > 0) {
Thread.sleep(deltaTime);
}
context.collect(new Event(_prefix, timestamp));
_numEvents++;
// Generate a timestamp every 5...15 ms, average is 10.
timestamp += (5 + _rand.nextInt(10));
}
}
}
}

How to filter Apache flink stream on the basis of other?

I have two streams: one of Int and the other of JSON. In the JSON schema there is one key which is an int, so I need to filter the JSON stream by comparing that key against the other integer stream. Is this possible in Flink?
Yes, you can do this kind of stream processing with Flink. The basic building blocks you need from Flink are connected streams, and stateful functions -- here's an example using a RichCoFlatMap:
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.RichCoFlatMapFunction;
import org.apache.flink.util.Collector;
public class Connect {
public static void main(String[] args) throws Exception {
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
DataStream<Event> control = env.fromElements(
new Event(17),
new Event(42))
.keyBy("key");
DataStream<Event> data = env.fromElements(
new Event(2),
new Event(42),
new Event(6),
new Event(17),
new Event(8),
new Event(42)
)
.keyBy("key");
DataStream<Event> result = control
.connect(data)
.flatMap(new MyConnectedStreams());
result.print();
env.execute();
}
static final class MyConnectedStreams
extends RichCoFlatMapFunction<Event, Event, Event> {
private ValueState<Boolean> seen = null;
@Override
public void open(Configuration config) {
ValueStateDescriptor<Boolean> descriptor = new ValueStateDescriptor<>(
// state name
"have-seen-key",
// type information of state
TypeInformation.of(new TypeHint<Boolean>() {
}));
seen = getRuntimeContext().getState(descriptor);
}
@Override
public void flatMap1(Event control, Collector<Event> out) throws Exception {
seen.update(Boolean.TRUE);
}
@Override
public void flatMap2(Event data, Collector<Event> out) throws Exception {
if (seen.value() == Boolean.TRUE) {
out.collect(data);
}
}
}
public static final class Event {
public Event() {
}
public Event(int key) {
this.key = key;
}
public int key;
public String toString() {
return String.valueOf(key);
}
}
}
In this example, only those keys that have been seen on the control stream are passed through the data stream -- all other events are filtered out. I've taken advantage of Flink's managed keyed state and connected streams.
To keep this simple I've ignored your requirement that the data stream has JSON, but you can find examples of how to work with JSON and Flink elsewhere.
Note that your results will be non-deterministic, since you have no control over the timing of the two streams relative to one another. You could manage this by adding event-time timestamps to the streams, and then using a RichCoProcessFunction instead.
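For instance, one way to make the result independent of arrival order is to buffer data events per key until the matching control event has been seen. This is a rough sketch, not part of the original answer; it uses a KeyedCoProcessFunction and assumes the streams are keyed with a key selector such as e -> e.key (rather than keyBy("key")), reusing the Event class above.
static final class BufferUntilControlSeen
        extends KeyedCoProcessFunction<Integer, Event, Event, Event> {
    private ValueState<Boolean> seen;
    private ListState<Event> buffered;

    @Override
    public void open(Configuration config) {
        seen = getRuntimeContext().getState(
                new ValueStateDescriptor<>("seen", Boolean.class));
        buffered = getRuntimeContext().getListState(
                new ListStateDescriptor<>("buffered", Event.class));
    }

    @Override
    public void processElement1(Event control, Context ctx, Collector<Event> out) throws Exception {
        seen.update(Boolean.TRUE);
        // flush data events that arrived before the control event for this key
        for (Event e : buffered.get()) {
            out.collect(e);
        }
        buffered.clear();
    }

    @Override
    public void processElement2(Event data, Context ctx, Collector<Event> out) throws Exception {
        if (Boolean.TRUE.equals(seen.value())) {
            out.collect(data);
        } else {
            buffered.add(data);  // hold until the control event for this key shows up
        }
    }
}
Because data events are held back until the control event arrives, the output no longer depends on the relative timing of the two streams.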
