When will Flink expire its TimeWindow result from QueryableState? - apache-flink

I have implemented a total word count example with a tumbling window and queryable state.
I use a 10-second time window, and when I print the result it shows the correct count. But when I query the state with the QueryableStateClient, it keeps returning the last result for a key even after that time window has passed.
For example, the word count for 'Nirav' is 5 for the time window 11:00:01 to 11:00:10.
When I query for 'Nirav' at 11:00:50, it still returns the previous count of 5.
So I have two questions:
Is this the default behaviour of Flink's QueryableStateClient, i.e. it caches the last output for a key until new state arrives for that key?
How can I clear the previous result once the time window has finished?
The queryable state implementation is below:
int sec = 10;
Time seconds = Time.seconds(sec);
text.flatMap(new FlatMapFunction<String, WordWithCount>() {
            public void flatMap(String value, Collector<WordWithCount> out) {
                for (String word : value.split("\\s")) {
                    out.collect(new WordWithCount(word, 1L));
                }
            }
        })
        .keyBy("word")
        .timeWindow(seconds)
        .reduce(new ReduceFunction<WordWithCount>() {
            public WordWithCount reduce(WordWithCount a, WordWithCount b) {
                System.out.println("After time window fun:- a.word:" + a.word + ", a.count:" + a.count + ", b.word:" + b.word + ", b.count:" + b.count);
                return new WordWithCount(a.word, a.count + b.count);
            }
        })
        .keyBy(wordWithCount -> wordWithCount.word)
        .asQueryableState("wordCountQuery", valueStateDescriptor);
Whole implementation
SocketWindowWordCountWithQueryableStateWithTimeWindow.java
package com.nirav.modi;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.state.ReducingStateDescriptor;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
public class SocketWindowWordCountWithQueryableStateWithTimeWindow {
public static void main(String[] args) throws Exception {
// the port to connect to
final int port;
try {
final ParameterTool params = ParameterTool.fromArgs(args);
port = params.getInt("port");
} catch (Exception e) {
System.err.println("No port specified. Please run 'SocketWindowWordCount --port <port>'");
return;
}
// get the execution environment
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(10000, CheckpointingMode.EXACTLY_ONCE);
// get input data by connecting to the socket
DataStream<String> text = env.socketTextStream("localhost", port);
ReduceFunction<WordWithCount> reduceFunction = new ReduceFunction<WordWithCount>() {
public WordWithCount reduce(WordWithCount a, WordWithCount b) {
System.out.println("reduce fun:- a.word:" + a.word + ", a.count:" + a.count + ", b.word:" + b.word + ", b.count:" + b.count);
return new WordWithCount(a.word, a.count + b.count);
}
};
// ReducingStateDescriptor<WordWithCount> descriptor = new ReducingStateDescriptor<WordWithCount>("wordCountQuery", reduceFunction, WordWithCount.class);
ValueStateDescriptor<WordWithCount> valueStateDescriptor = new ValueStateDescriptor<WordWithCount>("wordCountQuery", WordWithCount.class);
int sec = 10;
Time seconds = Time.seconds(sec);
text.flatMap(new FlatMapFunction<String, WordWithCount>() {
public void flatMap(String value, Collector<WordWithCount> out) {
for (String word : value.split("\\s")) {
out.collect(new WordWithCount(word, 1L));
}
}
})
.keyBy("word")
.timeWindow(seconds)
.reduce(new ReduceFunction<WordWithCount>() {
public WordWithCount reduce(WordWithCount a, WordWithCount b) {
System.out.println("After time window fun:- a.word:" + a.word + ", a.count:" + a.count + ", b.word:" + b.word + ", b.count:" + b.count);
return new WordWithCount(a.word, a.count + b.count);
}
}).keyBy(wordWithCount -> wordWithCount.word)
.asQueryableState("wordCountQuery", valueStateDescriptor);
env.getConfig().enableSysoutLogging();
JobGraph jobGraph = env.getStreamGraph().getJobGraph();
System.out.println("[info] Window WordCount with Time Window Job ID: " + jobGraph.getJobID());
System.out.println();
env.execute("Socket Window WordCount with Time Window of " + sec + " seconds");
}
// Data type for words with count
public static class WordWithCount {
public String word;
public long count;
public WordWithCount() {
}
public WordWithCount(String word, long count) {
this.word = word;
this.count = count;
}
@Override
public String toString() {
return word + " : " + count;
}
}
}
QueryStateWithWindowTest.java
package com.nirav.modi;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.queryablestate.client.QueryableStateClient;
import scala.tools.jline_embedded.console.ConsoleReader;
import java.io.PrintWriter;
import java.net.UnknownHostException;
import java.util.concurrent.CompletableFuture;
public class QueryStateWithWindowTest {
public static void main(String[] args) throws Exception {
// the jobId to connect to
final String jobId;
final String queryableStateName;
try {
final ParameterTool params = ParameterTool.fromArgs(args);
jobId = params.get("jobId");
queryableStateName = params.get("queryableStateName");
} catch (Exception e) {
System.err.println("No jobId specified. Please run 'SocketWindowWordCount --jobId <jobId>'");
return;
}
try {
ValueStateDescriptor<SocketWindowWordCountWithQueryableStateWithTimeWindow.WordWithCount> valueStateDescriptor = new ValueStateDescriptor<SocketWindowWordCountWithQueryableStateWithTimeWindow.WordWithCount>("wordCountQuery", SocketWindowWordCountWithQueryableStateWithTimeWindow.WordWithCount.class);
QueryableStateClient client = new QueryableStateClient("truecomtelesoft", 9069);
ExecutionConfig config = new ExecutionConfig();
client.setExecutionConfig(config.enableClosureCleaner());
ConsoleReader reader = new ConsoleReader();
reader.setPrompt("$ ");
PrintWriter out = new PrintWriter(reader.getOutput());
String line;
while ((line = reader.readLine()) != null) {
String key = line.toLowerCase().trim();
out.printf("[info] Querying key '%s'\n", key);
try {
long start = System.currentTimeMillis();
CompletableFuture<ValueState<SocketWindowWordCountWithQueryableStateWithTimeWindow.WordWithCount>> kvState = client.getKvState(JobID.fromHexString(jobId), queryableStateName, key, BasicTypeInfo.STRING_TYPE_INFO, valueStateDescriptor);
try {
SocketWindowWordCountWithQueryableStateWithTimeWindow.WordWithCount wordWithCount = kvState.get().value();
long end = System.currentTimeMillis();
long duration = Math.max(0, end - start);
out.printf("%d (query took %d ms)\n", wordWithCount.count, duration);
} catch (Exception e) {
e.printStackTrace();
}
} catch (Exception e) {
out.println("Query failed because of the following Exception:");
e.printStackTrace(out);
}
}
} catch (UnknownHostException e) {
e.printStackTrace();
}
}
}

The succinct answer to "when will state created by asQueryableState expire?" is never.
asQueryableState gets translated to an operator which uses the incoming records to update a queryable state instance via ValueState.update(value). These values never expire, but are overwritten as new records arrive for a given key. In your test application, this means that the queries are going to return the most recent non-zero count for the given word.
Clearly this isn't what you were trying to accomplish. You could use a ProcessFunction to expire stale entries. To do that, you could explicitly create your own keyed managed state, and store with each count the timestamp of the window that most recently updated the entry. Then you would use a Timer to clear older entries.
See the ProcessFunction example in the Flink documentation. To expire the state (which that example doesn't do), call state.clear().
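As a rough illustration of that approach, here is a minimal sketch of a ProcessFunction applied on the keyed stream after the windowed reduce. It is not from the answer itself; the 10-second retention period, the class name, and the reuse of the question's WordWithCount type and "wordCountQuery" descriptor name are assumptions for illustration.

import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;

public class ExpiringWordCountFunction extends ProcessFunction<WordWithCount, WordWithCount> {

    private transient ValueState<WordWithCount> countState;
    private transient ValueState<Long> cleanupTimeState;

    @Override
    public void open(Configuration parameters) {
        // keyed state holding the latest windowed count, exposed for queryable state
        ValueStateDescriptor<WordWithCount> countDescriptor =
                new ValueStateDescriptor<>("wordCountQuery", WordWithCount.class);
        countDescriptor.setQueryable("wordCountQuery");
        countState = getRuntimeContext().getState(countDescriptor);
        cleanupTimeState = getRuntimeContext().getState(
                new ValueStateDescriptor<>("cleanupTime", Long.class));
    }

    @Override
    public void processElement(WordWithCount value, Context ctx, Collector<WordWithCount> out) throws Exception {
        // store the latest windowed count and (re)schedule a cleanup 10 seconds from now
        countState.update(value);
        long cleanupTime = ctx.timerService().currentProcessingTime() + 10_000L;
        cleanupTimeState.update(cleanupTime);
        ctx.timerService().registerProcessingTimeTimer(cleanupTime);
        out.collect(value);
    }

    @Override
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<WordWithCount> out) throws Exception {
        // only the most recently scheduled cleanup clears the entry
        Long cleanupTime = cleanupTimeState.value();
        if (cleanupTime != null && timestamp >= cleanupTime) {
            countState.clear();
            cleanupTimeState.clear();
        }
    }
}

You would then replace .asQueryableState("wordCountQuery", valueStateDescriptor) with .process(new ExpiringWordCountFunction()) after the second keyBy; the state remains queryable because the descriptor is marked queryable in open(), and each key's entry is cleared once no new window result has arrived for the retention period.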

Related

Flink AggregateFunction find sum by multiple keys (validation process and testing)

I am using Apache Flink on Kinesis Data Analytics.
Flink version: 1.13.2
Java: 1.11
I am consuming JSON messages from Kafka. Sample input records look like the following:
null {"plateNumber":"506b9910-74a7-4c3e-a885-b5e9717efe3a","vignetteStickerId":"9e69df3f-d728-4fc8-9b09-42104588f772","currentTimestamp":"2022/04/07 16:19:55","timestamp":1649362795.444459000,"vehicleType":"TRUCK","vehicleModelType":"TOYOTA"}
null {"plateNumber":"5ffe0326-571e-4b97-8f7b-4f49aebb6993","vignetteStickerId":"6c2e1342-b096-4cc9-a92c-df61571c2c7d","currentTimestamp":"2022/04/07 16:20:00","timestamp":1649362800.638060000,"vehicleType":"CAR","vehicleModelType":"HONDA"}
null {"plateNumber":"d15f49f9-5550-4780-b260-83f3116ba64a","vignetteStickerId":"1366fbfe-7d0a-475f-9249-261ef1dd6de2","currentTimestamp":"2022/04/07 16:20:05","timestamp":1649362805.643749000,"vehicleType":"TRUCK","vehicleModelType":"TOYOTA"}
null {"plateNumber":"803508fb-9701-438e-9028-01bb8d96a804","vignetteStickerId":"b534369f-533e-4c15-ac3f-fc28cf0f3aba","currentTimestamp":"2022/04/07 16:20:10","timestamp":1649362810.648813000,"vehicleType":"CAR","vehicleModelType":"FORD"}
I want to aggregate (sum) these records into a 20-second window, keyed by vehicleType (CAR or TRUCK) and vehicleModelType (TOYOTA, HONDA or FORD). The SQL analogy is sum(...) with GROUP BY vehicleType, vehicleModelType.
I am using an AggregateFunction to achieve this.
import static java.util.Objects.isNull;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.springframework.stereotype.Component;
import com.helecloud.streams.demo.model.Vehicle;
import com.helecloud.streams.demo.model.VehicleStatistics;
@Component
public class VehicleStatisticsAggregator implements AggregateFunction<Vehicle, VehicleStatistics, VehicleStatistics> {
/**
*
*/
private static final long serialVersionUID = 1L;
@Override
public VehicleStatistics createAccumulator() {
System.out.println("Creating Accumulator!!");
return new VehicleStatistics();
}
@Override
public VehicleStatistics add(Vehicle vehicle, VehicleStatistics vehicleStatistics) {
System.out.println("vehicle in add method : " + vehicle);
if (isNull(vehicleStatistics.getVehicleType())) {
vehicleStatistics.setVehicleType(vehicle.getVehicleType());
}
if (isNull(vehicleStatistics.getVehicleModelType())) {
vehicleStatistics.setVehicleModelType(vehicle.getVehicleModelType());
}
// if(isNull(vehicleStatistics.getStart())) {
//
// vehicleStatistics.setStart(vehicle.getTimestamp());
// }
// if(isNull(vehicleStatistics.getCurrentTimestamp())) {
//
// vehicleStatistics.setCurrentTimestamp(vehicle.getCurrentTimestamp());
// }
if (isNull(vehicleStatistics.getCount())) {
vehicleStatistics.setCount(1);
} else {
System.out.println("incrementing count for : vehicleStatistics : " + vehicleStatistics);
vehicleStatistics.setCount(vehicleStatistics.getCount() + 1);
}
vehicleStatistics.setEnd(vehicle.getTimestamp());
System.out.println("vehicleStatistics in add : " + vehicleStatistics);
return vehicleStatistics;
}
@Override
public VehicleStatistics getResult(VehicleStatistics vehicleStatistics) {
System.out.println("vehicleStatistics in getResult : " + vehicleStatistics);
return vehicleStatistics;
}
@Override
public VehicleStatistics merge(VehicleStatistics vehicleStatistics, VehicleStatistics accumulator) {
System.out.println("Coming to merge!!");
VehicleStatistics vs = new VehicleStatistics(
// vehicleStatistics.getStart(),
accumulator.getEnd(),
// vehicleStatistics.getCurrentTimestamp(),
vehicleStatistics.getVehicleType(), vehicleStatistics.getVehicleModelType(),
vehicleStatistics.getCount() + accumulator.getCount());
System.out.println("VehicleStatistics in Merge :" + vs);
return vs;
}
}
In the above code I am also not seeing the merge code being called.
Below is the main processing code
@Service
public class ProcessingService {
@Value("${kafka.bootstrap-servers}")
private String kafkaAddress;
@Value("${kafka.group-id}")
private String kafkaGroupId;
public static final String TOPIC = "flink_input";
public static final String VEHICLE_STATISTICS_TOPIC = "flink_output";
@Autowired
private VehicleDeserializationSchema vehicleDeserializationSchema;
@Autowired
private VehicleStatisticsSerializationSchema vehicleStatisticsSerializationSchema;
@PostConstruct
public void startFlinkStreamProcessing() {
try {
processVehicleStatistic();
} catch (Exception e) {
// log.error("Cannot process", e);
e.printStackTrace();
}
}
public void processVehicleStatistic() {
try {
StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
FlinkKafkaConsumer<Vehicle> consumer = createVehicleConsumerForTopic(TOPIC, kafkaAddress, kafkaGroupId);
consumer.setStartFromLatest();
System.out.println("Starting to consume!!");
consumer.assignTimestampsAndWatermarks(WatermarkStrategy.forMonotonousTimestamps());
FlinkKafkaProducer<VehicleStatistics> producer = createVehicleStatisticsProducer(VEHICLE_STATISTICS_TOPIC, kafkaAddress);
DataStream<Vehicle> inputMessagesStream = environment.addSource(consumer);
inputMessagesStream
.keyBy((vehicle -> vehicle.getVehicleType().ordinal()))
// .keyBy(vehicle -> vehicle.getVehicleModelType().ordinal())
// .keyBy(new KeySelector<Vehicle, Tuple2<VehicleType, VehicleModelType>>() {
//
// /**
// *
// */
// private static final long serialVersionUID = 1L;
//
// @Override
// public Tuple2<VehicleType, VehicleModelType> getKey(Vehicle vehicle) throws Exception {
// return Tuple2.of(vehicle.getVehicleType(), vehicle.getVehicleModelType());
// }
// })
// .filter(v -> CAR.equals(v.getVehicleType()))
.window(TumblingEventTimeWindows.of(Time.seconds(20)))
// .windowAll(TumblingEventTimeWindows.of(Time.seconds(10)))
.aggregate(new VehicleStatisticsAggregator())
.addSink(producer);
System.out.println("Adding to Sink!!");
environment.execute("Car Truck Counts By Model");
} catch(Exception e) {
e.printStackTrace();
}
}
private FlinkKafkaConsumer<Vehicle> createVehicleConsumerForTopic(String topic, String kafkaAddress, String kafkaGroup ) {
Properties properties = new Properties();
properties.setProperty("bootstrap.servers", kafkaAddress);
properties.setProperty("group.id", kafkaGroup);
return new FlinkKafkaConsumer<>(topic, vehicleDeserializationSchema, properties);
}
private FlinkKafkaProducer<VehicleStatistics> createVehicleStatisticsProducer(String topic, String kafkaAddress){
return new FlinkKafkaProducer<>(kafkaAddress, topic, vehicleStatisticsSerializationSchema);
}
}
I am getting the result as below.
null {"end":1649362835.665466000,"vehicleType":"TRUCK","vehicleModelType":"HONDA","count":3}
null {"end":1649362825.656024000,"vehicleType":"CAR","vehicleModelType":"TOYOTA","count":1}
null {"end":1649362850.675786000,"vehicleType":"CAR","vehicleModelType":"TOYOTA","count":3}
null {"end":1649362855.677596000,"vehicleType":"TRUCK","vehicleModelType":"TOYOTA","count":1}
But is there a way to validate this?
My other question: since I am trying to aggregate the result based on multiple keys, is AggregateFunction the correct way to do this?
I am asking this because I saw How can I sum multiple fields in Flink?
So if I have to aggregate a sum over multiple fields, can an AggregateFunction accomplish that (the way I wrote the code)?
Kindly let me know. Thanks in advance.
Merge will only be called if you are using windows that merge -- in other words, if you are using session windows, or a custom merging window.
The correct way to aggregate based on multiple keys is to use keyBy with a composite type, such as Tuple2<VehicleType, VehicleModelType>. Each time you call keyBy the stream is repartitioned from scratch (and not in addition to any previous partitioning).
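As a rough sketch of that composite keyBy, essentially the KeySelector that is commented out in the question; the Vehicle, VehicleType, VehicleModelType and VehicleStatisticsAggregator classes and the inputMessagesStream and producer variables are the question's own and are assumed to be in scope:

// assumes org.apache.flink.api.java.functions.KeySelector and
// org.apache.flink.api.java.tuple.Tuple2 are imported
inputMessagesStream
    .keyBy(new KeySelector<Vehicle, Tuple2<VehicleType, VehicleModelType>>() {
        @Override
        public Tuple2<VehicleType, VehicleModelType> getKey(Vehicle vehicle) {
            // composite key: one group per (vehicleType, vehicleModelType) pair
            return Tuple2.of(vehicle.getVehicleType(), vehicle.getVehicleModelType());
        }
    })
    .window(TumblingEventTimeWindows.of(Time.seconds(20)))
    .aggregate(new VehicleStatisticsAggregator())
    .addSink(producer);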

Flink Table API : AppendStreamTableSink doesn't support consuming update changes which is produced by node GroupAggregate

I am trying to generate aggregates on a streaming source, and when I try to run the Table API queries I get the following error:
AppendStreamTableSink doesn't support consuming update changes which is produced by node GroupAggregate
I am consuming the data from a Kafka topic. Below is the sample input I use to mimic that behavior, followed by the unit test.
msg_type_1,Site_1,09/10/2020,00:00:00.037
msg_type_2,Site_1,09/10/2020,00:00:00.037
msg_type_1,Site_2,09/10/2020,00:00:00.037
msg_type_1,Site_3,09/10/2020,00:00:00.037
msg_type_1,Site_4,09/10/2020,00:00:00.037
msg_type_1,Site_5,09/10/2020,00:00:00.037
msg_type_1,Site_1,09/10/2020,00:00:00.037
msg_type_2,Site_1,09/10/2020,00:00:00.037
msg_type_3,Site_2,09/10/2020,00:00:00.037
msg_type_4,Site_1,09/10/2020,00:10:00.037
msg_type_1,Site_3,09/10/2020,00:10:00.037
msg_type_2,Site_1,09/10/2020,00:10:00.037
msg_type_3,Site_4,09/10/2020,00:10:00.037
msg_type_4,Site_1,09/10/2020,00:10:00.037
msg_type_1,Site_4,09/10/2020,00:10:00.037
msg_type_2,Site_5,09/10/2020,00:10:00.037
msg_type_4,Site_5,09/10/2020,00:10:00.037
msg_type_6,Site_5,09/10/2020,00:10:00.037
And here is the unit test I have for the aggregation.
@Test
public void loadSampleMessageFile() {
System.out.println(".loadSampleMessageFile() : ");
try {
String[] args = {};
StreamExecutionEnvironment streamingExecutionEnv = null;
streamingExecutionEnv = StreamExecutionEnvironment.getExecutionEnvironment();
streamingExecutionEnv.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);
//streamingExecutionEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
ExecutionConfig config = streamingExecutionEnv.getConfig();
final ParameterTool paramTool = ParameterTool.fromArgs(args);
for (int index = 0; index < args.length; index++) {
logger.info("Argument =" + index + " Value" + args[index]);
}
streamingExecutionEnv.getConfig().setGlobalJobParameters(paramTool);
StreamTableEnvironment streamTableEnv = StreamTableEnvironment.create(streamingExecutionEnv);
SingleOutputStreamOperator<SampleMessage> dataStreamSource = streamingExecutionEnv
.readTextFile("C:\\temp\\sample_data.txt")
.map(new MapFunction<String, SampleMessage>() {
@Override
public SampleMessage map(String value) throws Exception {
return sampleMessageParser.parseMessage(value, null);
}
});
streamTableEnv.createTemporaryView("messages", dataStreamSource);
Table messagesTable = streamTableEnv.fromDataStream(dataStreamSource);
System.out.println("No.of Columns in Table =" + messagesTable.getSchema().getFieldCount());
logger.info("No.of Columns in Table =" + messagesTable.getSchema().getFieldCount());
for (int index = 0; index < messagesTable.getSchema().getFieldNames().length; index++) {
System.out.println("Field Name [" + index + "] = " + messagesTable.getSchema().getFieldNames()[index]);
}
TableResult distinctSiteResult = messagesTable.distinct().select($("site")).execute();
CloseableIterator distinctSiteResultIter = distinctSiteResult.collect();
int counter = 0;
List<String> sites = new ArrayList<>();
while (distinctSiteResultIter.hasNext()) {
sites.add((String) distinctSiteResultIter.next());
counter++;
}
System.out.println("Total No.of Distinct Sites =" + counter);
}
catch(Exception e){
e.printStackTrace();
}
}
And the support classes.
public class SampleMessage implements Serializable {
private String msgType;
private String site;
private Long timestamp;
public String getMsgType() {
return msgType;
}
public void setMsgType(String msgType) {
this.msgType = msgType;
}
public String getSite() {
return site;
}
public void setSite(String site) {
this.site = site;
}
public Long getTimestamp() {
return timestamp;
}
public void setTimestamp(Long timestamp) {
this.timestamp = timestamp;
}
public String toString(){
StringBuilder str = new StringBuilder();
str.append("SampleMessage[");
str.append(" msgType=");
str.append(msgType);
str.append(" site=");
str.append(site);
str.append(" timestamp=");
str.append(timestamp);
str.append(" ]");
return str.toString();
}
}
And here is the error I am getting.
.loadSampleMessageFile() :
No.of Columns in Table =3
Field Name [0] = msgType
Field Name [1] = site
Field Name [2] = timestamp
org.apache.flink.table.api.TableException: AppendStreamTableSink doesn't support consuming update changes which is produced by node GroupAggregate(groupBy=[msgType, site, timestamp], select=[msgType, site, timestamp])
First, check which version of Flink you are using.
The result of distinct() changes continuously, so the downstream sink has to be a RetractStreamTableSink.
The error shows that collect() in your Flink version does not support consuming updates (upserts).
The latest versions of Flink's collect() already support upserts.
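One possible workaround, sketched here as an assumption rather than something the answer spells out, is to consume the continuously updating result as a retract stream instead of calling collect(). It reuses streamTableEnv, messagesTable and streamingExecutionEnv from the question's test; distinctSites and retractStream are illustrative names.

// each element is a Tuple2 where f0 = true marks an insert and f0 = false marks a retraction
// assumes org.apache.flink.types.Row and org.apache.flink.api.java.tuple.Tuple2 are imported
Table distinctSites = messagesTable.select($("site")).distinct();

DataStream<Tuple2<Boolean, Row>> retractStream =
        streamTableEnv.toRetractStream(distinctSites, Row.class);

retractStream.print();
streamingExecutionEnv.execute("distinct sites");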

Flink sink never executes

I have a program that streams cryptocurrency prices into a Flink pipeline and prints the highest bid for a time window.
Main.java
public class Main {
private final static Logger log = LoggerFactory.getLogger(Main.class);
private final static DateFormat dateFormat = new SimpleDateFormat("y-M-d H:m:s");
private final static NumberFormat numberFormat = new DecimalFormat("#0.00");
public static void main(String[] args) throws Exception {
MultipleParameterTool multipleParameterTool = MultipleParameterTool.fromArgs(args);
StreamExecutionEnvironment streamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
streamExecutionEnvironment.getConfig().setGlobalJobParameters(multipleParameterTool);
streamExecutionEnvironment.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
streamExecutionEnvironment.addSource(new GdaxSourceFunction())
.name("Gdax Exchange Price Source")
.assignTimestampsAndWatermarks(new WatermarkStrategy<TickerPrice>() {
@Override
public WatermarkGenerator<TickerPrice> createWatermarkGenerator(WatermarkGeneratorSupplier.Context context) {
return new BoundedOutOfOrdernessGenerator();
}
})
.windowAll(TumblingEventTimeWindows.of(Time.milliseconds(100)))
.trigger(EventTimeTrigger.create())
.reduce((ReduceFunction<TickerPrice>) (value1, value2) ->
value1.getHighestBid() > value2.getHighestBid() ? value1 : value2)
.addSink(new SinkFunction<TickerPrice>() {
@Override
public void invoke(TickerPrice value, Context context) throws Exception {
String dateString = dateFormat.format(context.timestamp());
String valueString = "$" + numberFormat.format(value.getHighestBid());
log.info(dateString + " : " + valueString);
}
}).name("Highest Bid Logger");
streamExecutionEnvironment.execute("Gdax Highest bid window calculator");
}
/**
* This generator generates watermarks assuming that elements arrive out of order,
* but only to a certain degree. The latest elements for a certain timestamp t will arrive
* at most n milliseconds after the earliest elements for timestamp t.
*/
public static class BoundedOutOfOrdernessGenerator implements WatermarkGenerator<TickerPrice> {
private final long maxOutOfOrderness = 3500; // 3.5 seconds
private long currentMaxTimestamp;
@Override
public void onEvent(TickerPrice event, long eventTimestamp, WatermarkOutput output) {
currentMaxTimestamp = Math.max(currentMaxTimestamp, eventTimestamp);
}
@Override
public void onPeriodicEmit(WatermarkOutput output) {
// emit the watermark as current highest timestamp minus the out-of-orderness bound
output.emitWatermark(new Watermark(currentMaxTimestamp - maxOutOfOrderness - 1));
}
}
}
GdaxSourceFunction.java
public class GdaxSourceFunction extends WebSocketClient implements SourceFunction<TickerPrice> {
private static String URL = "wss://ws-feed.gdax.com";
private static Logger log = LoggerFactory.getLogger(GdaxSourceFunction.class);
private static String subscribeMsg = "{\n" +
" \"type\": \"subscribe\",\n" +
" \"product_ids\": [<productIds>],\n" +
" \"channels\": [\n" +
//TODO: uncomment to re-enable order book tracking
//" \"level2\",\n" +
" {\n" +
" \"name\": \"ticker\",\n" +
" \"product_ids\": [<productIds>]\n" +
" }\n"+
" ]\n" +
"}";
SourceContext<TickerPrice> ctx;
@Override
public void run(SourceContext<TickerPrice> ctx) throws Exception {
this.ctx = ctx;
openConnection().get();
while(isOpen()) {
Thread.sleep(10000);
}
}
@Override
public void cancel() {
}
@Override
public void onMessage(String message) {
try {
ObjectNode objectNode = objectMapper.readValue(message, ObjectNode.class);
String type = objectNode.get("type").asText();
if("ticker".equals(type)) {
TickerPrice tickerPrice = new TickerPrice();
String productId = objectNode.get("product_id").asText();
String[] currencies = productId.split("-");
tickerPrice.setFromCurrency(currencies[1]);
tickerPrice.setToCurrency(currencies[0]);
tickerPrice.setHighestBid(objectNode.get("best_bid").asDouble());
tickerPrice.setLowestOffer(objectNode.get("best_ask").asDouble());
tickerPrice.setExchange("gdax");
String time = objectNode.get("time").asText();
Instant instant = Instant.parse(time);
ctx.collectWithTimestamp(tickerPrice, instant.getEpochSecond());
}
//log.info(objectNode.toString());
} catch (JsonProcessingException e) {
e.printStackTrace();
}
}
@Override
public void onOpen(Session session) {
super.onOpen(session);
//Authenticate and ensure we can properly connect to Gdax Websocket
//construct auth message with list of product ids
StringBuilder productIds = new StringBuilder("");
productIds.append("" +
"\"ETH-USD\",\n" +
"\"ETH-USD\",\n" +
"\"BTC-USD\"");
String subMsg = subscribeMsg.replace("<productIds>", productIds.toString());
try {
userSession.getAsyncRemote().sendText(subMsg).get();
} catch (InterruptedException e) {
e.printStackTrace();
} catch (ExecutionException e) {
e.printStackTrace();
}
}
@Override
public String getUrl() {
return URL;
}
}
but the sink function is never called. I have verified that the reducer is executing (very fast, every 100 milliseconds). If I remove the windowing part and just print the bid for every record coming in, the program works. But I've followed all the tutorials on windowing, and I see no difference between what I'm doing here and what's shown in the tutorials. I don't know why the Flink sink would not execute in windowed mode.
I copied the BoundedOutOfOrdernessGenerator class directly from the tutorial, so it should work for my use case. Within 3600 milliseconds I should see my first record in the logs, but I don't. I debugged the program and the sink function never executes. If I remove these lines:
.assignTimestampsAndWatermarks(new WatermarkStrategy<TickerPrice>() {
@Override
public WatermarkGenerator<TickerPrice> createWatermarkGenerator(WatermarkGeneratorSupplier.Context context) {
return new BoundedOutOfOrdernessGenerator();
}
})
.windowAll(TumblingEventTimeWindows.of(Time.milliseconds(100)))
.trigger(EventTimeTrigger.create())
.reduce((ReduceFunction<TickerPrice>) (value1, value2) ->
value1.getHighestBid() > value2.getHighestBid() ? value1 : value2)
so that the stream creation code looks like:
streamExecutionEnvironment.addSource(new GdaxSourceFunction())
.name("Gdax Exchange Price Source")
.addSink(new SinkFunction<TickerPrice>() {
@Override
public void invoke(TickerPrice value, Context context) throws Exception {
String dateString = dateFormat.format(context.timestamp());
String valueString = "$" + numberFormat.format(value.getHighestBid());
log.info(dateString + " : " + valueString);
}
}).name("Highest Bid Logger");
The sink executes, but of course the results aren't windowed, so they're incorrect for my use case. That shows something is wrong with my windowing logic, but I don't know what it is.
Versions:
JDK 1.8
Flink 1.11.2
I believe the cause of this issue is that the timestamps produced by your custom source are in units of seconds, while window durations are always measured in milliseconds. Try changing
ctx.collectWithTimestamp(tickerPrice, instant.getEpochSecond());
to
ctx.collectWithTimestamp(tickerPrice, instant.getEpochMilli());
I would also suggest some other (largely unrelated) changes.
streamExecutionEnvironment.addSource(new GdaxSourceFunction())
.name("Gdax Exchange Price Source")
.uid("source")
.assignTimestampsAndWatermarks(
WatermarkStrategy
.<TickerPrice>forBoundedOutOfOrderness(Duration.ofMillis(3500))
)
.windowAll(TumblingEventTimeWindows.of(Time.milliseconds(100)))
.reduce((ReduceFunction<TickerPrice>) (value1, value2) ->
value1.getHighestBid() > value2.getHighestBid() ? value1 : value2)
.uid("window")
.addSink(new SinkFunction<TickerPrice>() { ... })
.uid("sink")
Note the following recommendations:
Remove the BoundedOutOfOrdernessGenerator. There's no need to reimplement the built-in bounded-out-of-orderness watermark generator.
Remove the window trigger. There appears to be no need to override the default trigger, and if you get it wrong, it will cause problems.
Add UIDs to each stateful operator. These will be needed if you ever want to do stateful upgrades of your application after changing the job topology. (Your current sink isn't stateful, but adding a UID to it won't hurt.)

Solr's labelled relationship indexation performance

I want to move from anonymous relationships (childDocuments) to labelled.
During testing, performance degradation was detected when integrating documents into Solr on identical schemas and documents.
Solr (8.1.1) configuration (local, 1 node, default settings): solr -e cloud
Test: start integration of 500 documents several times and calculate the average integration time.
Labelled relationship example:
{
"id": "parent_xxx",
"items": [{"id": "child_xxx"}]
}
Anonymous relationship example:
{
"id": "parent_xxx",
"_childDocuments_": [{"id": "child_xxx"}]
}
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.common.SolrInputDocument;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class Scratch {
private static final int DOC_COUNT = 500;
private static final int ITERATION_COUNT = 5;
private static final boolean ANONYMOUS_CHILDREN = true;
private static final boolean LABELED_CHILDREN = false;
public static void main(String[] args) throws IOException, SolrServerException {
long anonymousTime = 0;
long labelledTime = 0;
for (int i = 0; i < ITERATION_COUNT; i++) {
List<SolrInputDocument> anonymousDocs = createSolrDocuments(ANONYMOUS_CHILDREN);
cleanSolrCollection();
anonymousTime += writeToSolr(anonymousDocs);
List<SolrInputDocument> labeledDocs = createSolrDocuments(LABELED_CHILDREN);
cleanSolrCollection();
labelledTime += writeToSolr(labeledDocs);
}
System.out.println("Avg anonymous time: " + (anonymousTime / ITERATION_COUNT));
System.out.println("Avg labelled time: " + (labelledTime / ITERATION_COUNT));
}
private static List<SolrInputDocument> createSolrDocuments(boolean isAnonymous) {
List<SolrInputDocument> request = new ArrayList<>();
for (int i = 0; i < DOC_COUNT; i++) {
SolrInputDocument parent = new SolrInputDocument();
parent.setField("id", "parent_" + i);
SolrInputDocument child = new SolrInputDocument();
child.setField("id", "child_" + i);
if (isAnonymous) {
parent.addChildDocument(child);
} else {
parent.addField("items", child);
}
request.add(parent);
}
return request;
}
private static void cleanSolrCollection() throws IOException, SolrServerException {
try (SolrClient client = getSolrClient()) {
client.deleteByQuery("main", "*:*");
}
}
private static long writeToSolr(List<SolrInputDocument> documents) throws IOException, SolrServerException {
long startAt = System.currentTimeMillis();
try (SolrClient client = getSolrClient()) {
client.add("main", documents);
}
return System.currentTimeMillis() - startAt;
}
private static SolrClient getSolrClient() {
return new HttpSolrClient.Builder("http://localhost:8983/solr")
.allowCompression(true)
.build();
}
}
Results:
500 docs with anonymous relationship ~ 29ms
500 docs with labelled relationship ~ 981ms
Is it normal behavior for Solr when working with named relationships?
I have not been able to find any information about that.
The performance difference of 20-30 times does look strange.

How is context used in a class usage?

The line of code that is giving me fits is:
this.databaseHandler = new DatabaseHandler(MainActivity.
I have that module in the project; this line comes from another project that I am trying to incorporate. I believe I need this line, but I am having trouble understanding the context parameter as it is used here.
Yes, the line is incomplete because I cannot finish it.
Could my whole structure or thinking be wrong?
import android.app.Activity;
import android.content.Context;
import android.os.Bundle;
import android.os.AsyncTask;
import com.Table.TableMainLayout;
import com.example.tablefreezepane.DatabaseHandler;
public class MainActivity extends Activity {
final String TAG = "MainActivity.java";
@Override
protected void onCreate(Bundle savedInstanceState) {
super.onCreate(savedInstanceState);
/* Loads next module */
setContentView(new TableMainLayout(this));
}
}
public class AsyncInsertData extends AsyncTask<String, String, String> {
DatabaseHandler databaseHandler;
String type;
long timeElapsed;
protected AsyncInsertData(String type){
this.type = type;
this.databaseHandler = new DatabaseHandler(MainActivity.
//(MainActivity.this);
}
// #type - can be 'normal' or 'fast'
//@Override
//protected void onPreExecute() {
// super.onPreExecute();
// tvStatus.setText("Inserting " + editTextRecordNum.getText() + " records...");
//}
@Override
protected String doInBackground(String... aurl) {
try {
// get number of records to be inserted
int insertCount = 20;
// empty the table
databaseHandler.deleteRecords();
// keep track of execution time
long lStartTime = System.nanoTime();
if (type.equals("normal")) {
databaseHandler.insertNormal(insertCount);
} else {
databaseHandler.insertFast(insertCount);
}
// execution finished
long lEndTime = System.nanoTime();
// display execution time
timeElapsed = lEndTime - lStartTime;
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
protected void onPostExecute(String unused) {
//Toast.makeText(getApplicationContext(),"This is an Android Toast Message", Toast.LENGTH_LONG).show();
//tvStatus.setText("Done " + choice + " inserting " + databaseHandler.countRecords() + " records into table: [" + this.databaseHandler.tableName + "]. Time elapsed: " + timeElapsed / 1000000 + " ms.");
}
}
Thank you in advance.
Because this is an AsyncTask, you can't access the context from MainActivity the way you are trying to. Instead, give the task a constructor with a Context parameter, then replace MainActivity.this with that context.
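A minimal sketch of that change, keeping the question's class; the parameter name is just for illustration, and DatabaseHandler is assumed to take a Context as in the original project:

public class AsyncInsertData extends AsyncTask<String, String, String> {
    DatabaseHandler databaseHandler;
    String type;
    long timeElapsed;

    // pass the Context in instead of reaching for MainActivity.this
    protected AsyncInsertData(Context context, String type) {
        this.type = type;
        this.databaseHandler = new DatabaseHandler(context);
    }
    // doInBackground(...) and onPostExecute(...) stay as they are
}

From MainActivity you would then create the task as new AsyncInsertData(this, "normal"), so the Activity itself supplies the Context.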
