Flink AggregateFunction find sum by multiple keys (validation process and testing) - apache-flink

I am using Apache Flink on Kinesis Data Analytics.
Flink version: 1.13.2
Java: 11
I am consuming JSON messages from Kafka. Sample input records look as below:
null {"plateNumber":"506b9910-74a7-4c3e-a885-b5e9717efe3a","vignetteStickerId":"9e69df3f-d728-4fc8-9b09-42104588f772","currentTimestamp":"2022/04/07 16:19:55","timestamp":1649362795.444459000,"vehicleType":"TRUCK","vehicleModelType":"TOYOTA"}
null {"plateNumber":"5ffe0326-571e-4b97-8f7b-4f49aebb6993","vignetteStickerId":"6c2e1342-b096-4cc9-a92c-df61571c2c7d","currentTimestamp":"2022/04/07 16:20:00","timestamp":1649362800.638060000,"vehicleType":"CAR","vehicleModelType":"HONDA"}
null {"plateNumber":"d15f49f9-5550-4780-b260-83f3116ba64a","vignetteStickerId":"1366fbfe-7d0a-475f-9249-261ef1dd6de2","currentTimestamp":"2022/04/07 16:20:05","timestamp":1649362805.643749000,"vehicleType":"TRUCK","vehicleModelType":"TOYOTA"}
null {"plateNumber":"803508fb-9701-438e-9028-01bb8d96a804","vignetteStickerId":"b534369f-533e-4c15-ac3f-fc28cf0f3aba","currentTimestamp":"2022/04/07 16:20:10","timestamp":1649362810.648813000,"vehicleType":"CAR","vehicleModelType":"FORD"}
I want to aggregate (sum) these records over a 20-second window, grouped by vehicleType (CAR or TRUCK) and vehicleModelType (TOYOTA, HONDA or FORD). SQL analogy: sum(), GROUP BY vehicleType, vehicleModelType.
I am using an AggregateFunction to achieve this.
import static java.util.Objects.isNull;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.springframework.stereotype.Component;
import com.helecloud.streams.demo.model.Vehicle;
import com.helecloud.streams.demo.model.VehicleStatistics;
@Component
public class VehicleStatisticsAggregator implements AggregateFunction<Vehicle, VehicleStatistics, VehicleStatistics> {

    private static final long serialVersionUID = 1L;

    @Override
    public VehicleStatistics createAccumulator() {
        System.out.println("Creating Accumulator!!");
        return new VehicleStatistics();
    }

    @Override
    public VehicleStatistics add(Vehicle vehicle, VehicleStatistics vehicleStatistics) {
        System.out.println("vehicle in add method : " + vehicle);
        if (isNull(vehicleStatistics.getVehicleType())) {
            vehicleStatistics.setVehicleType(vehicle.getVehicleType());
        }
        if (isNull(vehicleStatistics.getVehicleModelType())) {
            vehicleStatistics.setVehicleModelType(vehicle.getVehicleModelType());
        }
//        if (isNull(vehicleStatistics.getStart())) {
//            vehicleStatistics.setStart(vehicle.getTimestamp());
//        }
//        if (isNull(vehicleStatistics.getCurrentTimestamp())) {
//            vehicleStatistics.setCurrentTimestamp(vehicle.getCurrentTimestamp());
//        }
        if (isNull(vehicleStatistics.getCount())) {
            vehicleStatistics.setCount(1);
        } else {
            System.out.println("incrementing count for : vehicleStatistics : " + vehicleStatistics);
            vehicleStatistics.setCount(vehicleStatistics.getCount() + 1);
        }
        vehicleStatistics.setEnd(vehicle.getTimestamp());
        System.out.println("vehicleStatistics in add : " + vehicleStatistics);
        return vehicleStatistics;
    }

    @Override
    public VehicleStatistics getResult(VehicleStatistics vehicleStatistics) {
        System.out.println("vehicleStatistics in getResult : " + vehicleStatistics);
        return vehicleStatistics;
    }

    @Override
    public VehicleStatistics merge(VehicleStatistics vehicleStatistics, VehicleStatistics accumulator) {
        System.out.println("Coming to merge!!");
        VehicleStatistics vs = new VehicleStatistics(
                // vehicleStatistics.getStart(),
                accumulator.getEnd(),
                // vehicleStatistics.getCurrentTimestamp(),
                vehicleStatistics.getVehicleType(), vehicleStatistics.getVehicleModelType(),
                vehicleStatistics.getCount() + accumulator.getCount());
        System.out.println("VehicleStatistics in Merge :" + vs);
        return vs;
    }
}
In the above code I also don't see the merge method being called.
Below is the main processing code:
@Service
public class ProcessingService {

    @Value("${kafka.bootstrap-servers}")
    private String kafkaAddress;

    @Value("${kafka.group-id}")
    private String kafkaGroupId;

    public static final String TOPIC = "flink_input";
    public static final String VEHICLE_STATISTICS_TOPIC = "flink_output";

    @Autowired
    private VehicleDeserializationSchema vehicleDeserializationSchema;

    @Autowired
    private VehicleStatisticsSerializationSchema vehicleStatisticsSerializationSchema;

    @PostConstruct
    public void startFlinkStreamProcessing() {
        try {
            processVehicleStatistic();
        } catch (Exception e) {
            // log.error("Cannot process", e);
            e.printStackTrace();
        }
    }

    public void processVehicleStatistic() {
        try {
            StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
            FlinkKafkaConsumer<Vehicle> consumer = createVehicleConsumerForTopic(TOPIC, kafkaAddress, kafkaGroupId);
            consumer.setStartFromLatest();
            System.out.println("Starting to consume!!");
            consumer.assignTimestampsAndWatermarks(WatermarkStrategy.forMonotonousTimestamps());
            FlinkKafkaProducer<VehicleStatistics> producer = createVehicleStatisticsProducer(VEHICLE_STATISTICS_TOPIC, kafkaAddress);
            DataStream<Vehicle> inputMessagesStream = environment.addSource(consumer);
            inputMessagesStream
                    .keyBy(vehicle -> vehicle.getVehicleType().ordinal())
//                    .keyBy(vehicle -> vehicle.getVehicleModelType().ordinal())
//                    .keyBy(new KeySelector<Vehicle, Tuple2<VehicleType, VehicleModelType>>() {
//
//                        private static final long serialVersionUID = 1L;
//
//                        @Override
//                        public Tuple2<VehicleType, VehicleModelType> getKey(Vehicle vehicle) throws Exception {
//                            return Tuple2.of(vehicle.getVehicleType(), vehicle.getVehicleModelType());
//                        }
//                    })
//                    .filter(v -> CAR.equals(v.getVehicleType()))
                    .window(TumblingEventTimeWindows.of(Time.seconds(20)))
//                    .windowAll(TumblingEventTimeWindows.of(Time.seconds(10)))
                    .aggregate(new VehicleStatisticsAggregator())
                    .addSink(producer);
            System.out.println("Adding to Sink!!");
            environment.execute("Car Truck Counts By Model");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private FlinkKafkaConsumer<Vehicle> createVehicleConsumerForTopic(String topic, String kafkaAddress, String kafkaGroup) {
        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", kafkaAddress);
        properties.setProperty("group.id", kafkaGroup);
        return new FlinkKafkaConsumer<>(topic, vehicleDeserializationSchema, properties);
    }

    private FlinkKafkaProducer<VehicleStatistics> createVehicleStatisticsProducer(String topic, String kafkaAddress) {
        return new FlinkKafkaProducer<>(kafkaAddress, topic, vehicleStatisticsSerializationSchema);
    }
}
I am getting the result as below.
null {"end":1649362835.665466000,"vehicleType":"TRUCK","vehicleModelType":"HONDA","count":3}
null {"end":1649362825.656024000,"vehicleType":"CAR","vehicleModelType":"TOYOTA","count":1}
null {"end":1649362850.675786000,"vehicleType":"CAR","vehicleModelType":"TOYOTA","count":3}
null {"end":1649362855.677596000,"vehicleType":"TRUCK","vehicleModelType":"TOYOTA","count":1}
But is there a way to validate this?
Another question: I am trying to aggregate the result based on multiple keys. Is AggregateFunction the correct way to do this?
I am asking because I saw this question: How can I sum multiple fields in Flink?
So if I have to aggregate a sum over multiple fields, can an AggregateFunction accomplish that (the way I wrote the code)?
Kindly let me know. Thanks in advance.

Merge will only be called if you are using windows that merge -- in other words, if you are using session windows, or a custom merging window.
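For illustration (this is a sketch, not part of the original answer): switching the window assigner in the question's pipeline to event-time session windows would exercise merge(), because session windows for the same key can merge as further events arrive.

import org.apache.flink.streaming.api.windowing.assigners.EventTimeSessionWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

// Sketch only: with a merging assigner such as session windows,
// AggregateFunction.merge() is invoked whenever two session windows for the
// same key are merged. With tumbling windows (as in the question) it never is.
inputMessagesStream
        .keyBy(vehicle -> vehicle.getVehicleType().ordinal())
        .window(EventTimeSessionWindows.withGap(Time.seconds(20)))
        .aggregate(new VehicleStatisticsAggregator())
        .addSink(producer);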
The correct way to aggregate based on multiple keys is to use keyBy with a composite type, such as Tuple2<VehicleType, VehicleModelType>. Each time you call keyBy the stream is repartitioned from scratch (and not in addition to any previous partitioning).
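A minimal sketch of that composite key, reusing the KeySelector that is already commented out in the question (illustrative only; VehicleType and VehicleModelType are assumed to be the enums from the question's model):

import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

// Key by both fields at once instead of chaining keyBy calls.
inputMessagesStream
        .keyBy(new KeySelector<Vehicle, Tuple2<VehicleType, VehicleModelType>>() {
            @Override
            public Tuple2<VehicleType, VehicleModelType> getKey(Vehicle vehicle) {
                return Tuple2.of(vehicle.getVehicleType(), vehicle.getVehicleModelType());
            }
        })
        .window(TumblingEventTimeWindows.of(Time.seconds(20)))
        .aggregate(new VehicleStatisticsAggregator())
        .addSink(producer);

With this key, each (vehicleType, vehicleModelType) combination gets its own 20-second window and its own accumulator, which matches the sum() ... GROUP BY vehicleType, vehicleModelType analogy from the question.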

Related

Flink Streaming RichSource early stops

It runs with processing time and uses broadcast state.
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);
BroadcastStream<List<TableOperations>> broadcastOperationsState = env
.addSource(new LoadCassandraOperations(10000L, cassandraHost, cassandraPort)).broadcast(descriptor);
SingleOutputStreamOperator<InternalVariableValue> stream =
env.addSource(new SourceMillisInternalVariableValue(5000L));
SingleOutputStreamOperator<InternalVariableOperation> streamProcessed =
stream
.keyBy(InternalVariableValue::getUuid)
.connect(broadcastOperationsState)
.process(new AddOperationInfo())
;
streamProcessed.print();
SourceMillisInternalVariableValue creates an event every 5 seconds. The events are stored in a static collection. The run method looks like:
public class SourceMillisInternalVariableValue extends RichSourceFunction<InternalVariableValue>{
private boolean running;
long millis;
public SourceMillisInternalVariableValue(long millis) {
super();
this.millis = millis;
}
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
running = true;
}
@Override
public void cancel() {
running = false;
}
@Override
public void run(SourceContext<InternalVariableValue> ctx) throws Exception {
// Initial wait before emitting events
Thread.sleep(1500);
PojoVariableValues[] pojoData =
new PojoVariableValues[]{
new PojoVariableValues("id1", "1"),
new PojoVariableValues("id2", "2"),
....
....
new PojoVariableValues("id21", "21")
};
int cont = 0;
while (cont<pojoData.length) {
System.out.println("Iteration "+cont+" "+pojoData.length);
ctx.collect(generateVar(pojoData[0+cont].getUUID(), pojoData[0+cont].getValue()));
ctx.collect(generateVar(pojoData[1+cont].getUUID(), pojoData[1+cont].getValue()));
ctx.collect(generateVar(pojoData[2+cont].getUUID(), pojoData[2+cont].getValue()));
cont = cont +3;
Thread.sleep(millis);
}
}
private InternalVariableValue generateVar(String uuid, String value)
{
return InternalVariableValueMessage.InternalVariableValue.newBuilder()
.setUuid(uuid)
.setTimestamp(new Date().getTime()).setValue(value).setKeyspace("nest").build();
}
class PojoVariableValues {
private String UUID;
private String Value;
public PojoVariableValues(String uUID, String value) {
super();
UUID = uUID;
Value = value;
}
public String getUUID() {
return UUID;
}
public void setUUID(String uUID) {
UUID = uUID;
}
public String getValue() {
return Value;
}
public void setValue(String value) {
Value = value;
}
}
}
LoadCassandraOperations emits events every 10 seconds. It works fine.
When I run this code, SourceMillisInternalVariableValue stops in the first iteration, emitting only three events. If I comment out the process function, both sources work properly, but if I run the process, the source is cancelled...
I expect the source to emit all events (exactly 21) and all of them to be processed downstream. If I run this code, the while loop in the source only completes one iteration.
Any idea ?
Thank youuu . cheers
EDIT:
Important: this code is for exploring the processing-time and broadcast features. I know that I'm not using best practices in the sources. Thanks
EDIT 2:
The problem starts when I try to run the process function.
Solved !!
The problem was that I was trying to run it with a TestContainer and couldn't watch any logs.
I ran it with a simple main method and could see some code errors (like the ones pointed out in the comments. Thanks!!!).

Flink sink never executes

I have a program that streams cryptocurrency prices into a Flink pipeline and prints the highest bid for a time window.
Main.java
public class Main {
private final static Logger log = LoggerFactory.getLogger(Main.class);
private final static DateFormat dateFormat = new SimpleDateFormat("y-M-d H:m:s");
private final static NumberFormat numberFormat = new DecimalFormat("#0.00");
public static void main(String[] args) throws Exception {
MultipleParameterTool multipleParameterTool = MultipleParameterTool.fromArgs(args);
StreamExecutionEnvironment streamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
streamExecutionEnvironment.getConfig().setGlobalJobParameters(multipleParameterTool);
streamExecutionEnvironment.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
streamExecutionEnvironment.addSource(new GdaxSourceFunction())
.name("Gdax Exchange Price Source")
.assignTimestampsAndWatermarks(new WatermarkStrategy<TickerPrice>() {
@Override
public WatermarkGenerator<TickerPrice> createWatermarkGenerator(WatermarkGeneratorSupplier.Context context) {
return new BoundedOutOfOrdernessGenerator();
}
})
.windowAll(TumblingEventTimeWindows.of(Time.milliseconds(100)))
.trigger(EventTimeTrigger.create())
.reduce((ReduceFunction<TickerPrice>) (value1, value2) ->
value1.getHighestBid() > value2.getHighestBid() ? value1 : value2)
.addSink(new SinkFunction<TickerPrice>() {
@Override
public void invoke(TickerPrice value, Context context) throws Exception {
String dateString = dateFormat.format(context.timestamp());
String valueString = "$" + numberFormat.format(value.getHighestBid());
log.info(dateString + " : " + valueString);
}
}).name("Highest Bid Logger");
streamExecutionEnvironment.execute("Gdax Highest bid window calculator");
}
/**
* This generator generates watermarks assuming that elements arrive out of order,
* but only to a certain degree. The latest elements for a certain timestamp t will arrive
* at most n milliseconds after the earliest elements for timestamp t.
*/
public static class BoundedOutOfOrdernessGenerator implements WatermarkGenerator<TickerPrice> {
private final long maxOutOfOrderness = 3500; // 3.5 seconds
private long currentMaxTimestamp;
@Override
public void onEvent(TickerPrice event, long eventTimestamp, WatermarkOutput output) {
currentMaxTimestamp = Math.max(currentMaxTimestamp, eventTimestamp);
}
@Override
public void onPeriodicEmit(WatermarkOutput output) {
// emit the watermark as current highest timestamp minus the out-of-orderness bound
output.emitWatermark(new Watermark(currentMaxTimestamp - maxOutOfOrderness - 1));
}
}
}
GdaxSourceFunction.java
public class GdaxSourceFunction extends WebSocketClient implements SourceFunction<TickerPrice> {
private static String URL = "wss://ws-feed.gdax.com";
private static Logger log = LoggerFactory.getLogger(GdaxSourceFunction.class);
private static String subscribeMsg = "{\n" +
" \"type\": \"subscribe\",\n" +
" \"product_ids\": [<productIds>],\n" +
" \"channels\": [\n" +
//TODO: uncomment to re-enable order book tracking
//" \"level2\",\n" +
" {\n" +
" \"name\": \"ticker\",\n" +
" \"product_ids\": [<productIds>]\n" +
" }\n"+
" ]\n" +
"}";
SourceContext<TickerPrice> ctx;
@Override
public void run(SourceContext<TickerPrice> ctx) throws Exception {
this.ctx = ctx;
openConnection().get();
while(isOpen()) {
Thread.sleep(10000);
}
}
@Override
public void cancel() {
}
@Override
public void onMessage(String message) {
try {
ObjectNode objectNode = objectMapper.readValue(message, ObjectNode.class);
String type = objectNode.get("type").asText();
if("ticker".equals(type)) {
TickerPrice tickerPrice = new TickerPrice();
String productId = objectNode.get("product_id").asText();
String[] currencies = productId.split("-");
tickerPrice.setFromCurrency(currencies[1]);
tickerPrice.setToCurrency(currencies[0]);
tickerPrice.setHighestBid(objectNode.get("best_bid").asDouble());
tickerPrice.setLowestOffer(objectNode.get("best_ask").asDouble());
tickerPrice.setExchange("gdax");
String time = objectNode.get("time").asText();
Instant instant = Instant.parse(time);
ctx.collectWithTimestamp(tickerPrice, instant.getEpochSecond());
}
//log.info(objectNode.toString());
} catch (JsonProcessingException e) {
e.printStackTrace();
}
}
@Override
public void onOpen(Session session) {
super.onOpen(session);
//Authenticate and ensure we can properly connect to Gdax Websocket
//construct auth message with list of product ids
StringBuilder productIds = new StringBuilder("");
productIds.append("" +
"\"ETH-USD\",\n" +
"\"ETH-USD\",\n" +
"\"BTC-USD\"");
String subMsg = subscribeMsg.replace("<productIds>", productIds.toString());
try {
userSession.getAsyncRemote().sendText(subMsg).get();
} catch (InterruptedException e) {
e.printStackTrace();
} catch (ExecutionException e) {
e.printStackTrace();
}
}
@Override
public String getUrl() {
return URL;
}
}
but the sink function is never called. I have verified that the reducer is executing (very fast, every 100 milliseconds). If I remove the windowing part and just print the bid for every record coming in, the program works. But I've followed all the tutorials on windowing, and I see no difference between what I'm doing here and what's shown in the tutorials. I don't know why the Flink sink would not execute in windowed mode.
I copied the BoundedOutOfOrdernessGenerator class directly from this tutorial. It should work for my use case. Within 3600 milliseconds, I should see my first record in the logs, but I don't. I debugged the program and the sink function never executes. If I remove these lines:
.assignTimestampsAndWatermarks(new WatermarkStrategy<TickerPrice>() {
@Override
public WatermarkGenerator<TickerPrice> createWatermarkGenerator(WatermarkGeneratorSupplier.Context context) {
return new BoundedOutOfOrdernessGenerator();
}
})
.windowAll(TumblingEventTimeWindows.of(Time.milliseconds(100)))
.trigger(EventTimeTrigger.create())
.reduce((ReduceFunction<TickerPrice>) (value1, value2) ->
value1.getHighestBid() > value2.getHighestBid() ? value1 : value2)
so that the stream creation code looks like:
streamExecutionEnvironment.addSource(new GdaxSourceFunction())
.name("Gdax Exchange Price Source")
.addSink(new SinkFunction<TickerPrice>() {
@Override
public void invoke(TickerPrice value, Context context) throws Exception {
String dateString = dateFormat.format(context.timestamp());
String valueString = "$" + numberFormat.format(value.getHighestBid());
log.info(dateString + " : " + valueString);
}
}).name("Highest Bid Logger");
The sink executes, but of course the results aren't windowed, so they're incorrect for my use case. This shows that something is wrong with my windowing logic, but I don't know what it is.
Versions:
JDK 1.8
Flink 1.11.2
I believe the cause of this issue is that the timestamps produced by your custom source are in units of seconds, while window durations are always measured in milliseconds. Try changing
ctx.collectWithTimestamp(tickerPrice, instant.getEpochSecond());
to
ctx.collectWithTimestamp(tickerPrice, instant.getEpochMilli());
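For a quick sanity check of the unit mismatch (illustrative only):

import java.time.Instant;

// Epoch seconds interpreted as milliseconds land in January 1970, so
// 100 ms event-time windows keyed off such timestamps never line up
// with the watermarks.
Instant t = Instant.parse("2021-01-01T00:00:00Z");
System.out.println(t.getEpochSecond()); // 1609459200      (seconds)
System.out.println(t.toEpochMilli());   // 1609459200000   (milliseconds)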
I would also suggest some other (largely unrelated) changes.
streamExecutionEnvironment.addSource(new GdaxSourceFunction())
.name("Gdax Exchange Price Source")
.uid("source")
.assignTimestampsAndWatermarks(
WatermarkStrategy
.<TickerPrice>forBoundedOutOfOrderness(Duration.ofMillis(3500))
)
.windowAll(TumblingEventTimeWindows.of(Time.milliseconds(100)))
.reduce((ReduceFunction<TickerPrice>) (value1, value2) ->
value1.getHighestBid() > value2.getHighestBid() ? value1 : value2)
.uid("window")
.addSink(new SinkFunction<TickerPrice>() { ... })
.uid("sink")
Note the following recommendations:
Remove the BoundedOutOfOrdernessGenerator. There's no need to reimplement the built-in bounded-out-of-orderness watermark generator.
Remove the window trigger. There appears to be no need to override the default trigger, and if you get it wrong, it will cause problems.
Add UIDs to each stateful operator. These will be needed if you ever want to do stateful upgrades of your application after changing the job topology. (Your current sink isn't stateful, but adding a UID to it won't hurt.)

Apache Flink reads at least 2 records to trigger sink

I am writing my Apache Flink (1.10) job to update records in real time like this:
public class WalletConsumeRealtimeHandler {
public static void main(String[] args) throws Exception {
walletConsumeHandler();
}
public static void walletConsumeHandler() throws Exception {
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
FlinkUtil.initMQ();
FlinkUtil.initEnv(env);
DataStream<String> dataStreamSource = env.addSource(FlinkUtil.initDatasource("wallet.consume.report.realtime"));
DataStream<ReportWalletConsumeRecord> consumeRecord =
dataStreamSource.map(new MapFunction<String, ReportWalletConsumeRecord>() {
@Override
public ReportWalletConsumeRecord map(String value) throws Exception {
ObjectMapper mapper = new ObjectMapper();
ReportWalletConsumeRecord consumeRecord = mapper.readValue(value, ReportWalletConsumeRecord.class);
consumeRecord.setMergedRecordCount(1);
return consumeRecord;
}
}).assignTimestampsAndWatermarks(new BoundedOutOfOrdernessGenerator());
consumeRecord.keyBy(
new KeySelector<ReportWalletConsumeRecord, Tuple2<String, Long>>() {
@Override
public Tuple2<String, Long> getKey(ReportWalletConsumeRecord value) throws Exception {
return Tuple2.of(value.getConsumeItem(), value.getTenantId());
}
})
.timeWindow(Time.seconds(5))
.reduce(new SumField(), new CollectionWindow())
.addSink(new SinkFunction<List<ReportWalletConsumeRecord>>() {
@Override
public void invoke(List<ReportWalletConsumeRecord> reportPumps, Context context) throws Exception {
WalletConsumeRealtimeHandler.invoke(reportPumps);
}
});
env.execute(WalletConsumeRealtimeHandler.class.getName());
}
private static class CollectionWindow extends ProcessWindowFunction<ReportWalletConsumeRecord,
List<ReportWalletConsumeRecord>,
Tuple2<String, Long>,
TimeWindow> {
public void process(Tuple2<String, Long> key,
Context context,
Iterable<ReportWalletConsumeRecord> minReadings,
Collector<List<ReportWalletConsumeRecord>> out) throws Exception {
ArrayList<ReportWalletConsumeRecord> employees = Lists.newArrayList(minReadings);
if (employees.size() > 0) {
out.collect(employees);
}
}
}
private static class SumField implements ReduceFunction<ReportWalletConsumeRecord> {
public ReportWalletConsumeRecord reduce(ReportWalletConsumeRecord d1, ReportWalletConsumeRecord d2) {
Integer merged1 = d1.getMergedRecordCount() == null ? 1 : d1.getMergedRecordCount();
Integer merged2 = d2.getMergedRecordCount() == null ? 1 : d2.getMergedRecordCount();
d1.setMergedRecordCount(merged1 + merged2);
d1.setConsumeNum(d1.getConsumeNum() + d2.getConsumeNum());
return d1;
}
}
public static void invoke(List<ReportWalletConsumeRecord> records) {
WalletConsumeService service = FlinkUtil.InitRetrofit().create(WalletConsumeService.class);
Call<ResponseBody> call = service.saveRecords(records);
call.enqueue(new Callback<ResponseBody>() {
@Override
public void onResponse(Call<ResponseBody> call, Response<ResponseBody> response) {
}
@Override
public void onFailure(Call<ResponseBody> call, Throwable t) {
t.printStackTrace();
}
});
}
}
and now I have found the Flink task needs to receive at least 2 records to trigger the sink. Does the reduce action require this?
You need two records to trigger the window. Flink only knows when to close a window (and fire the subsequent calculation) when it receives a watermark that is larger than the end timestamp of the window.
In your case, you use BoundedOutOfOrdernessGenerator, which updates the watermark according to the incoming records. So it generates a second watermark only after having seen the second record.
You can use a different watermark generator. In the troubleshooting training there is a watermark generator that also generates watermarks on timeout.
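For illustration, a rough sketch of that idea on the Flink 1.10 AssignerWithPeriodicWatermarks API (this is not the generator from the training, just the same concept; the getTimestamp() accessor on ReportWalletConsumeRecord is an assumption):

import javax.annotation.Nullable;
import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks;
import org.apache.flink.streaming.api.watermark.Watermark;

// Sketch: keeps advancing the watermark even when no new records arrive, so a
// window containing a single record can still fire after the idle timeout.
public class IdleAwareWatermarkAssigner implements AssignerWithPeriodicWatermarks<ReportWalletConsumeRecord> {

    private final long maxOutOfOrderness = 3500; // event-time slack in ms
    private final long idleTimeout = 5000;       // processing-time timeout in ms

    private long maxTimestampSeen = Long.MIN_VALUE;
    private long lastEventProcessingTime = System.currentTimeMillis();

    @Override
    public long extractTimestamp(ReportWalletConsumeRecord record, long previousElementTimestamp) {
        maxTimestampSeen = Math.max(maxTimestampSeen, record.getTimestamp()); // assumed getter
        lastEventProcessingTime = System.currentTimeMillis();
        return record.getTimestamp();
    }

    @Nullable
    @Override
    public Watermark getCurrentWatermark() {
        if (maxTimestampSeen == Long.MIN_VALUE) {
            return new Watermark(Long.MIN_VALUE); // nothing seen yet
        }
        long idleFor = System.currentTimeMillis() - lastEventProcessingTime;
        long watermark = maxTimestampSeen - maxOutOfOrderness;
        if (idleFor > idleTimeout) {
            // No events for a while: push the watermark forward by the idle time
            // so that windows covering the last records can still fire.
            watermark += idleFor;
        }
        return new Watermark(watermark);
    }
}

It would be plugged in via assignTimestampsAndWatermarks(new IdleAwareWatermarkAssigner()) in place of the BoundedOutOfOrdernessGenerator.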

How to sort the union datastream of Flink without watermarks

My Flink job has multiple data streams, which I merge with the org.apache.flink.streaming.api.datastream.DataStream#union method.
Then I hit a problem: the unioned data stream is out of order, and I cannot set a window to sort the data in the stream.
Sorting union of streams to identify user sessions in Apache Flink
I found that answer, but com.liam.learn.flink.example.union.UnionStreamDemo.SortFunction#onTimer
is never invoked.
Environment info: Flink version 1.7.0
In general, I hope to sort the unioned data stream without watermarks.
You need watermarks so that the sorting function knows when it can safely emit sorted elements. Without watermarks, you could get a record from stream B that has an earlier date than any of the first N records of stream A, right?
But adding watermarks is easy, especially if you know that "event time" is strictly increasing for any one stream. Below is some code I wrote that extends what David Anderson posted in his answer to the other SO issue you referenced above - hopefully this will get you started.
-- Ken
package com.scaleunlimited.flinksnippets;
import java.util.PriorityQueue;
import java.util.Random;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.TimerService;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
import org.apache.flink.util.Collector;
import org.junit.Test;
public class MergeAndSortStreamsTest {
@Test
public void testMergeAndSort() throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(2);
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
DataStream<Event> streamA = env.addSource(new EventSource("A"))
.assignTimestampsAndWatermarks(new EventTSWAssigner());
DataStream<Event> streamB = env.addSource(new EventSource("B"))
.assignTimestampsAndWatermarks(new EventTSWAssigner());
streamA.union(streamB)
.keyBy(r -> r.getKey())
.process(new SortByTimestampFunction())
.print();
env.execute();
}
private static class Event implements Comparable<Event> {
private String _label;
private long _timestamp;
public Event(String label, long timestamp) {
_label = label;
_timestamp = timestamp;
}
public String getLabel() {
return _label;
}
public void setLabel(String label) {
_label = label;
}
public String getKey() {
return "1";
}
public long getTimestamp() {
return _timestamp;
}
public void setTimestamp(long timestamp) {
_timestamp = timestamp;
}
@Override
public String toString() {
return String.format("%s # %d", _label, _timestamp);
}
@Override
public int compareTo(Event o) {
return Long.compare(_timestamp, o._timestamp);
}
}
@SuppressWarnings("serial")
private static class EventTSWAssigner extends AscendingTimestampExtractor<Event> {
@Override
public long extractAscendingTimestamp(Event element) {
return element.getTimestamp();
}
}
@SuppressWarnings("serial")
private static class SortByTimestampFunction extends KeyedProcessFunction<String, Event, Event> {
private ValueState<PriorityQueue<Event>> queueState = null;
@Override
public void open(Configuration config) {
ValueStateDescriptor<PriorityQueue<Event>> descriptor = new ValueStateDescriptor<>(
// state name
"sorted-events",
// type information of state
TypeInformation.of(new TypeHint<PriorityQueue<Event>>() {
}));
queueState = getRuntimeContext().getState(descriptor);
}
@Override
public void processElement(Event event, Context context, Collector<Event> out) throws Exception {
TimerService timerService = context.timerService();
long currentWatermark = timerService.currentWatermark();
System.out.format("processElement called with watermark %d\n", currentWatermark);
if (context.timestamp() > currentWatermark) {
PriorityQueue<Event> queue = queueState.value();
if (queue == null) {
queue = new PriorityQueue<>(10);
}
queue.add(event);
queueState.update(queue);
timerService.registerEventTimeTimer(event.getTimestamp());
}
}
@Override
public void onTimer(long timestamp, OnTimerContext context, Collector<Event> out) throws Exception {
PriorityQueue<Event> queue = queueState.value();
long watermark = context.timerService().currentWatermark();
System.out.format("onTimer called with watermark %d\n", watermark);
Event head = queue.peek();
while (head != null && head.getTimestamp() <= watermark) {
out.collect(head);
queue.remove(head);
head = queue.peek();
}
}
}
@SuppressWarnings("serial")
private static class EventSource extends RichParallelSourceFunction<Event> {
private String _prefix;
private transient Random _rand;
private transient boolean _running;
private transient int _numEvents;
public EventSource(String prefix) {
_prefix = prefix;
}
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
_rand = new Random(_prefix.hashCode() + getRuntimeContext().getIndexOfThisSubtask());
}
@Override
public void cancel() {
_running = false;
}
@Override
public void run(SourceContext<Event> context) throws Exception {
_running = true;
_numEvents = 0;
long timestamp = System.currentTimeMillis() + _rand.nextInt(10);
while (_running && (_numEvents < 100)) {
long deltaTime = timestamp - System.currentTimeMillis();
if (deltaTime > 0) {
Thread.sleep(deltaTime);
}
context.collect(new Event(_prefix, timestamp));
_numEvents++;
// Generate a timestamp every 5...15 ms, average is 10.
timestamp += (5 + _rand.nextInt(10));
}
}
}
}

Why does CEP print the first event only after I input the second event when using ProcessingTime?

I sent one event with isStart true to Kafka and had Flink consume the event from Kafka. I also set the TimeCharacteristic to ProcessingTime and set within(Time.seconds(5)), so I expected that CEP would print the event 5 seconds after I sent the first event. However it didn't, and it printed the first event only after I sent a second event to Kafka. Why did it print the first event only after I sent two events? Shouldn't it print the event just 5 seconds after I sent the first one when using ProcessingTime?
The following is the code:
public class LongRidesWithKafka {
private static final String LOCAL_ZOOKEEPER_HOST = "localhost:2181";
private static final String LOCAL_KAFKA_BROKER = "localhost:9092";
private static final String RIDE_SPEED_GROUP = "rideSpeedGroup";
private static final int MAX_EVENT_DELAY = 60; // rides are at most 60 sec out-of-order.
public static void main(String[] args) throws Exception {
final int popThreshold = 1; // threshold for popular places
// set up streaming execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);
Properties kafkaProps = new Properties();
//kafkaProps.setProperty("zookeeper.connect", LOCAL_ZOOKEEPER_HOST);
kafkaProps.setProperty("bootstrap.servers", LOCAL_KAFKA_BROKER);
kafkaProps.setProperty("group.id", RIDE_SPEED_GROUP);
// always read the Kafka topic from the start
kafkaProps.setProperty("auto.offset.reset", "earliest");
// create a Kafka consumer
FlinkKafkaConsumer011<TaxiRide> consumer = new FlinkKafkaConsumer011<>(
"flinktest",
new TaxiRideSchema(),
kafkaProps);
// assign a timestamp extractor to the consumer
//consumer.assignTimestampsAndWatermarks(new CustomWatermarkExtractor());
DataStream<TaxiRide> rides = env.addSource(consumer);
DataStream<TaxiRide> keyedRides = rides.keyBy("rideId");
// A complete taxi ride has a START event followed by an END event
Pattern<TaxiRide, TaxiRide> completedRides =
Pattern.<TaxiRide>begin("start")
.where(new SimpleCondition<TaxiRide>() {
@Override
public boolean filter(TaxiRide ride) throws Exception {
return ride.isStart;
}
})
.next("end")
.where(new SimpleCondition<TaxiRide>() {
@Override
public boolean filter(TaxiRide ride) throws Exception {
return !ride.isStart;
}
});
// We want to find rides that have NOT been completed within 120 minutes
PatternStream<TaxiRide> patternStream = CEP.pattern(keyedRides, completedRides.within(Time.seconds(5)));
OutputTag<TaxiRide> timedout = new OutputTag<TaxiRide>("timedout") {
};
SingleOutputStreamOperator<TaxiRide> longRides = patternStream.flatSelect(
timedout,
new LongRides.TaxiRideTimedOut<TaxiRide>(),
new LongRides.FlatSelectNothing<TaxiRide>()
);
longRides.getSideOutput(timedout).print();
env.execute("Long Taxi Rides");
}
public static class TaxiRideTimedOut<TaxiRide> implements PatternFlatTimeoutFunction<TaxiRide, TaxiRide> {
@Override
public void timeout(Map<String, List<TaxiRide>> map, long l, Collector<TaxiRide> collector) throws Exception {
TaxiRide rideStarted = map.get("start").get(0);
collector.collect(rideStarted);
}
}
public static class FlatSelectNothing<T> implements PatternFlatSelectFunction<T, T> {
@Override
public void flatSelect(Map<String, List<T>> pattern, Collector<T> collector) {
}
}
private static class TaxiRideTSExtractor extends AscendingTimestampExtractor<TaxiRide> {
private static final long serialVersionUID = 1L;
@Override
public long extractAscendingTimestamp(TaxiRide ride) {
// Watermark Watermark = getCurrentWatermark();
if (ride.isStart) {
return ride.startTime.getMillis();
} else {
return ride.endTime.getMillis();
}
}
}
private static class CustomWatermarkExtractor implements AssignerWithPeriodicWatermarks<TaxiRide> {
private static final long serialVersionUID = -742759155861320823L;
private long currentTimestamp = Long.MIN_VALUE;
@Override
public long extractTimestamp(TaxiRide ride, long previousElementTimestamp) {
// the inputs are assumed to be of format (message,timestamp)
if (ride.isStart) {
this.currentTimestamp = ride.startTime.getMillis();
return ride.startTime.getMillis();
} else {
this.currentTimestamp = ride.endTime.getMillis();
return ride.endTime.getMillis();
}
}
@Nullable
@Override
public Watermark getCurrentWatermark() {
return new Watermark(currentTimestamp == Long.MIN_VALUE ? Long.MIN_VALUE : currentTimestamp - 1);
}
}
}
The reason is that Flink's CEP library currently only checks the timestamps if another element arrives and is processed. The underlying assumption is that you have a steady flow of events.
I think this is a limitation of Flink's CEP library. To work correctly, Flink should register processing time timers with arrivalTime + timeout which trigger the timeout of patterns if no events arrive.
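As a workaround (not part of the original answer), the same timeout can be detected without CEP by using a KeyedProcessFunction with one processing-time timer per ride; a rough sketch, assuming rideId is a long field on TaxiRide as in the training exercises:

import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

// Sketch: emits a ride's START event if no END event arrives within the timeout.
// Keyed by rideId; uses processing-time timers instead of CEP's within().
public class RideTimeoutFunction extends KeyedProcessFunction<Long, TaxiRide, TaxiRide> {

    private static final long TIMEOUT_MS = 5000;

    private transient ValueState<TaxiRide> startEvent;

    @Override
    public void open(Configuration parameters) {
        startEvent = getRuntimeContext().getState(
                new ValueStateDescriptor<>("start-event", TaxiRide.class));
    }

    @Override
    public void processElement(TaxiRide ride, Context ctx, Collector<TaxiRide> out) throws Exception {
        if (ride.isStart) {
            startEvent.update(ride);
            // Fires TIMEOUT_MS after the START arrives, regardless of further input.
            ctx.timerService().registerProcessingTimeTimer(
                    ctx.timerService().currentProcessingTime() + TIMEOUT_MS);
        } else {
            // END arrived in time: the ride completed, forget the START.
            startEvent.clear();
        }
    }

    @Override
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<TaxiRide> out) throws Exception {
        TaxiRide start = startEvent.value();
        if (start != null) {
            out.collect(start); // timed out: no END within TIMEOUT_MS
            startEvent.clear();
        }
    }
}

It would be applied as rides.keyBy(r -> r.rideId).process(new RideTimeoutFunction()), and the output is the START events that never saw a matching END within the timeout.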
