Solr's labelled relationship indexing performance

I want to move from anonymous child relationships (childDocuments) to labelled ones.
During testing I noticed a performance degradation when indexing documents into Solr, with identical schemas and documents.
Solr (8.1.1) configuration (local, 1 node, default settings): solr -e cloud
Test: index 500 documents several times and compute the average indexing time.
Labelled relationship example:
{
"id": "parent_xxx",
"items": [{"id": "child_xxx"}]
}
Anonymous relationship example:
{
"id": "parent_xxx",
"_childDocuments_": [{"id": "child_xxx"}]
}
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.common.SolrInputDocument;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class Scratch {
private static final int DOC_COUNT = 500;
private static final int ITERATION_COUNT = 5;
private static final boolean ANONYMOUS_CHILDREN = true;
private static final boolean LABELED_CHILDREN = false;
public static void main(String[] args) throws IOException, SolrServerException {
long anonymousTime = 0;
long labelledTime = 0;
for (int i = 0; i < ITERATION_COUNT; i++) {
List<SolrInputDocument> anonymousDocs = createSolrDocuments(ANONYMOUS_CHILDREN);
cleanSolrCollection();
anonymousTime += writeToSolr(anonymousDocs);
List<SolrInputDocument> labeledDocs = createSolrDocuments(LABELED_CHILDREN);
cleanSolrCollection();
labelledTime += writeToSolr(labeledDocs);
}
System.out.println("Avg anonymous time: " + (anonymousTime / ITERATION_COUNT));
System.out.println("Avg labelled time: " + (labelledTime / ITERATION_COUNT));
}
private static List<SolrInputDocument> createSolrDocuments(boolean isAnonymous) {
List<SolrInputDocument> request = new ArrayList<>();
for (int i = 0; i < DOC_COUNT; i++) {
SolrInputDocument parent = new SolrInputDocument();
parent.setField("id", "parent_" + i);
SolrInputDocument child = new SolrInputDocument();
child.setField("id", "child_" + i);
if (isAnonymous) {
parent.addChildDocument(child);
} else {
parent.addField("items", child);
}
request.add(parent);
}
return request;
}
private static void cleanSolrCollection() throws IOException, SolrServerException {
try (SolrClient client = getSolrClient()) {
client.deleteByQuery("main", "*:*");
}
}
private static long writeToSolr(List<SolrInputDocument> documents) throws IOException, SolrServerException {
long startAt = System.currentTimeMillis();
try (SolrClient client = getSolrClient()) {
client.add("main", documents);
}
return System.currentTimeMillis() - startAt;
}
private static SolrClient getSolrClient() {
return new HttpSolrClient.Builder("http://localhost:8983/solr")
.allowCompression(true)
.build();
}
}
Results:
500 docs with anonymous relationship ~ 29ms
500 docs with labelled relationship ~ 981ms
Is this normal behavior for Solr when working with labelled relationships?
I have not been able to find any information about it.
A 20-30x performance difference does look strange.

Related

Flink AggregateFunction: find sum by multiple keys (validation process and testing)

I am using Apache Flink on Kinesis Data Analytics.
Flink version: 1.13.2
Java: 1.11
I am consuming JSON messages from Kafka. Sample input records look as below:
null {"plateNumber":"506b9910-74a7-4c3e-a885-b5e9717efe3a","vignetteStickerId":"9e69df3f-d728-4fc8-9b09-42104588f772","currentTimestamp":"2022/04/07 16:19:55","timestamp":1649362795.444459000,"vehicleType":"TRUCK","vehicleModelType":"TOYOTA"}
null {"plateNumber":"5ffe0326-571e-4b97-8f7b-4f49aebb6993","vignetteStickerId":"6c2e1342-b096-4cc9-a92c-df61571c2c7d","currentTimestamp":"2022/04/07 16:20:00","timestamp":1649362800.638060000,"vehicleType":"CAR","vehicleModelType":"HONDA"}
null {"plateNumber":"d15f49f9-5550-4780-b260-83f3116ba64a","vignetteStickerId":"1366fbfe-7d0a-475f-9249-261ef1dd6de2","currentTimestamp":"2022/04/07 16:20:05","timestamp":1649362805.643749000,"vehicleType":"TRUCK","vehicleModelType":"TOYOTA"}
null {"plateNumber":"803508fb-9701-438e-9028-01bb8d96a804","vignetteStickerId":"b534369f-533e-4c15-ac3f-fc28cf0f3aba","currentTimestamp":"2022/04/07 16:20:10","timestamp":1649362810.648813000,"vehicleType":"CAR","vehicleModelType":"FORD"}
I want to sum these records over a 20-second window, grouped by vehicleType (CAR or TRUCK) and vehicleModelType (TOYOTA, HONDA or FORD). The SQL analogy is sum() ... GROUP BY vehicleType, vehicleModelType.
I am using an AggregateFunction to achieve this.
import static java.util.Objects.isNull;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.springframework.stereotype.Component;
import com.helecloud.streams.demo.model.Vehicle;
import com.helecloud.streams.demo.model.VehicleStatistics;
@Component
public class VehicleStatisticsAggregator implements AggregateFunction<Vehicle, VehicleStatistics, VehicleStatistics> {
/**
*
*/
private static final long serialVersionUID = 1L;
@Override
public VehicleStatistics createAccumulator() {
System.out.println("Creating Accumulator!!");
return new VehicleStatistics();
}
@Override
public VehicleStatistics add(Vehicle vehicle, VehicleStatistics vehicleStatistics) {
System.out.println("vehicle in add method : " + vehicle);
if (isNull(vehicleStatistics.getVehicleType())) {
vehicleStatistics.setVehicleType(vehicle.getVehicleType());
}
if (isNull(vehicleStatistics.getVehicleModelType())) {
vehicleStatistics.setVehicleModelType(vehicle.getVehicleModelType());
}
// if(isNull(vehicleStatistics.getStart())) {
//
// vehicleStatistics.setStart(vehicle.getTimestamp());
// }
// if(isNull(vehicleStatistics.getCurrentTimestamp())) {
//
// vehicleStatistics.setCurrentTimestamp(vehicle.getCurrentTimestamp());
// }
if (isNull(vehicleStatistics.getCount())) {
vehicleStatistics.setCount(1);
} else {
System.out.println("incrementing count for : vehicleStatistics : " + vehicleStatistics);
vehicleStatistics.setCount(vehicleStatistics.getCount() + 1);
}
vehicleStatistics.setEnd(vehicle.getTimestamp());
System.out.println("vehicleStatistics in add : " + vehicleStatistics);
return vehicleStatistics;
}
@Override
public VehicleStatistics getResult(VehicleStatistics vehicleStatistics) {
System.out.println("vehicleStatistics in getResult : " + vehicleStatistics);
return vehicleStatistics;
}
@Override
public VehicleStatistics merge(VehicleStatistics vehicleStatistics, VehicleStatistics accumulator) {
System.out.println("Coming to merge!!");
VehicleStatistics vs = new VehicleStatistics(
// vehicleStatistics.getStart(),
accumulator.getEnd(),
// vehicleStatistics.getCurrentTimestamp(),
vehicleStatistics.getVehicleType(), vehicleStatistics.getVehicleModelType(),
vehicleStatistics.getCount() + accumulator.getCount());
System.out.println("VehicleStatistics in Merge :" + vs);
return vs;
}
}
In the above code I also do not see the merge method being called.
Below is the main processing code:
@Service
public class ProcessingService {
#Value("${kafka.bootstrap-servers}")
private String kafkaAddress;
#Value("${kafka.group-id}")
private String kafkaGroupId;
public static final String TOPIC = "flink_input";
public static final String VEHICLE_STATISTICS_TOPIC = "flink_output";
@Autowired
private VehicleDeserializationSchema vehicleDeserializationSchema;
@Autowired
private VehicleStatisticsSerializationSchema vehicleStatisticsSerializationSchema;
@PostConstruct
public void startFlinkStreamProcessing() {
try {
processVehicleStatistic();
} catch (Exception e) {
// log.error("Cannot process", e);
e.printStackTrace();
}
}
public void processVehicleStatistic() {
try {
StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
FlinkKafkaConsumer<Vehicle> consumer = createVehicleConsumerForTopic(TOPIC, kafkaAddress, kafkaGroupId);
consumer.setStartFromLatest();
System.out.println("Starting to consume!!");
consumer.assignTimestampsAndWatermarks(WatermarkStrategy.forMonotonousTimestamps());
FlinkKafkaProducer<VehicleStatistics> producer = createVehicleStatisticsProducer(VEHICLE_STATISTICS_TOPIC, kafkaAddress);
DataStream<Vehicle> inputMessagesStream = environment.addSource(consumer);
inputMessagesStream
.keyBy((vehicle -> vehicle.getVehicleType().ordinal()))
// .keyBy(vehicle -> vehicle.getVehicleModelType().ordinal())
// .keyBy(new KeySelector<Vehicle, Tuple2<VehicleType, VehicleModelType>>() {
//
// /**
// *
// */
// private static final long serialVersionUID = 1L;
//
// @Override
// public Tuple2<VehicleType, VehicleModelType> getKey(Vehicle vehicle) throws Exception {
// return Tuple2.of(vehicle.getVehicleType(), vehicle.getVehicleModelType());
// }
// })
// .filter(v -> CAR.equals(v.getVehicleType()))
.window(TumblingEventTimeWindows.of(Time.seconds(20)))
// .windowAll(TumblingEventTimeWindows.of(Time.seconds(10)))
.aggregate(new VehicleStatisticsAggregator())
.addSink(producer);
System.out.println("Adding to Sink!!");
environment.execute("Car Truck Counts By Model");
} catch(Exception e) {
e.printStackTrace();;
}
}
private FlinkKafkaConsumer<Vehicle> createVehicleConsumerForTopic(String topic, String kafkaAddress, String kafkaGroup ) {
Properties properties = new Properties();
properties.setProperty("bootstrap.servers", kafkaAddress);
properties.setProperty("group.id", kafkaGroup);
return new FlinkKafkaConsumer<>(topic, vehicleDeserializationSchema, properties);
}
private FlinkKafkaProducer<VehicleStatistics> createVehicleStatisticsProducer(String topic, String kafkaAddress){
return new FlinkKafkaProducer<>(kafkaAddress, topic, vehicleStatisticsSerializationSchema);
}
}
I am getting the result as below.
null {"end":1649362835.665466000,"vehicleType":"TRUCK","vehicleModelType":"HONDA","count":3}
null {"end":1649362825.656024000,"vehicleType":"CAR","vehicleModelType":"TOYOTA","count":1}
null {"end":1649362850.675786000,"vehicleType":"CAR","vehicleModelType":"TOYOTA","count":3}
null {"end":1649362855.677596000,"vehicleType":"TRUCK","vehicleModelType":"TOYOTA","count":1}
But is there a way to validate this?
Another question: I am trying to aggregate the result based on multiple keys; is AggregateFunction the correct way to do this?
I am asking because I saw How can I sum multiple fields in Flink?
So if I have to sum on multiple fields, can an AggregateFunction accomplish that (the way I wrote the code)?
Kindly let me know. Thanks in advance.
Merge will only be called if you are using windows that merge -- in other words, if you are using session windows, or a custom merging window.
The correct way to aggregate based on multiple keys is to use keyBy with a composite type, such as Tuple2<VehicleType, VehicleModelType>. Each time you call keyBy the stream is repartitioned from scratch (and not in addition to any previous partitioning).
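As a rough sketch of that suggestion, the commented-out KeySelector in the question can be re-enabled to key by both fields at once. It reuses the question's own Vehicle, VehicleType, VehicleModelType, inputMessagesStream, producer and VehicleStatisticsAggregator names, so treat the wiring as illustrative rather than tested:
inputMessagesStream
        .keyBy(new KeySelector<Vehicle, Tuple2<VehicleType, VehicleModelType>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<VehicleType, VehicleModelType> getKey(Vehicle vehicle) throws Exception {
                // composite key: one window per (vehicleType, vehicleModelType) pair
                return Tuple2.of(vehicle.getVehicleType(), vehicle.getVehicleModelType());
            }
        })
        .window(TumblingEventTimeWindows.of(Time.seconds(20)))
        .aggregate(new VehicleStatisticsAggregator())
        .addSink(producer);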

Flink Table API : AppendStreamTableSink doesn't support consuming update changes which is produced by node GroupAggregate

I am trying to generate aggregates on a streaming source, and when I run the Table API queries I get the following error:
AppendStreamTableSink doesn't support consuming update changes which is produced by node GroupAggregate
I am consuming the data from a Kafka topic. Here is the input data for the unit test I use to mimic that behavior:
msg_type_1,Site_1,09/10/2020,00:00:00.037
msg_type_2,Site_1,09/10/2020,00:00:00.037
msg_type_1,Site_2,09/10/2020,00:00:00.037
msg_type_1,Site_3,09/10/2020,00:00:00.037
msg_type_1,Site_4,09/10/2020,00:00:00.037
msg_type_1,Site_5,09/10/2020,00:00:00.037
msg_type_1,Site_1,09/10/2020,00:00:00.037
msg_type_2,Site_1,09/10/2020,00:00:00.037
msg_type_3,Site_2,09/10/2020,00:00:00.037
msg_type_4,Site_1,09/10/2020,00:10:00.037
msg_type_1,Site_3,09/10/2020,00:10:00.037
msg_type_2,Site_1,09/10/2020,00:10:00.037
msg_type_3,Site_4,09/10/2020,00:10:00.037
msg_type_4,Site_1,09/10/2020,00:10:00.037
msg_type_1,Site_4,09/10/2020,00:10:00.037
msg_type_2,Site_5,09/10/2020,00:10:00.037
msg_type_4,Site_5,09/10/2020,00:10:00.037
msg_type_6,Site_5,09/10/2020,00:10:00.037
And here is the unit test I have for the aggregation:
@Test
public void loadSampleMessageFile() {
System.out.println(".loadSampleMessageFile() : ");
try {
String[] args = {};
StreamExecutionEnvironment streamingExecutionEnv = null;
streamingExecutionEnv = StreamExecutionEnvironment.getExecutionEnvironment();
streamingExecutionEnv.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);
//streamingExecutionEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
ExecutionConfig config = streamingExecutionEnv.getConfig();
final ParameterTool paramTool = ParameterTool.fromArgs(args);
for (int index = 0; index < args.length; index++) {
logger.info("Argument =" + index + " Value" + args[index]);
}
streamingExecutionEnv.getConfig().setGlobalJobParameters(paramTool);
StreamTableEnvironment streamTableEnv = StreamTableEnvironment.create(streamingExecutionEnv);
SingleOutputStreamOperator<SampleMessage> dataStreamSource = streamingExecutionEnv
.readTextFile("C:\\temp\\sample_data.txt")
.map(new MapFunction<String, SampleMessage>() {
@Override
public SampleMessage map(String value) throws Exception {
return sampleMessageParser.parseMessage(value, null);
}
});
streamTableEnv.createTemporaryView("messages", dataStreamSource);
Table messagesTable = streamTableEnv.fromDataStream(dataStreamSource);
System.out.println("No.of Columns in Table =" + messagesTable.getSchema().getFieldCount());
logger.info("No.of Columns in Table =" + messagesTable.getSchema().getFieldCount());
for (int index = 0; index < messagesTable.getSchema().getFieldNames().length; index++) {
System.out.println("Field Name [" + index + "] = " + messagesTable.getSchema().getFieldNames()[index]);
}
TableResult distinctSiteResult = messagesTable.distinct().select($("site")).execute();
CloseableIterator distinctSiteResultIter = distinctSiteResult.collect();
int counter = 0;
List<String> sites = new ArrayList<>();
while (distinctSiteResultIter.hasNext()) {
sites.add((String) distinctSiteResultIter.next());
counter++;
}
System.out.println("Total No.of Distinct Sites =" + counter);
}
catch(Exception e){
e.printStackTrace();
}
}
And the support classes.
public class SampleMessage implements Serializable {
private String msgType;
private String site;
private Long timestamp;
public String getMsgType() {
return msgType;
}
public void setMsgType(String msgType) {
this.msgType = msgType;
}
public String getSite() {
return site;
}
public void setSite(String site) {
this.site = site;
}
public Long getTimestamp() {
return timestamp;
}
public void setTimestamp(Long timestamp) {
this.timestamp = timestamp;
}
public String toString(){
StringBuilder str = new StringBuilder();
str.append("SampleMessage[");
str.append(" msgType=");
str.append(msgType);
str.append(" site=");
str.append(site);
str.append(" timestamp=");
str.append(timestamp);
str.append(" ]");
return str.toString();
}
}
And here is the error I am getting:
.loadSampleMessageFile() :
No.of Columns in Table =3
Field Name [0] = msgType
Field Name [1] = site
Field Name [2] = timestamp
org.apache.flink.table.api.TableException: AppendStreamTableSink doesn't support consuming update changes which is produced by node GroupAggregate(groupBy=[msgType, site, timestamp], select=[msgType, site, timestamp])
You should confirm which version of Flink you are using.
The result of distinct changes continuously, so the downstream sink needs to be a RetractStreamTableSink.
The error shows that collect in this version of Flink does not support upserts.
The latest version of Flink's collect already supports upserts.
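If it helps, here is a minimal sketch of consuming such a continuously-updating result as a retract stream instead of collecting it through an append-only sink. It assumes the same streamTableEnv and messagesTable as in the test above, plus org.apache.flink.types.Row and a Flink version that still provides toRetractStream:
Table distinctSites = messagesTable.distinct().select($("site"));

// each element is (true, row) for an insert and (false, row) for a retraction
DataStream<Tuple2<Boolean, Row>> retractStream =
        streamTableEnv.toRetractStream(distinctSites, Row.class);
retractStream.print();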

Why is my Flink standalone-cluster not receiving my job?

I created a program in Flink (Java) to calculate the average of 9 fake sensors in 3 different rooms. The program runs fine if I start the jar file directly. So I decided to start the Flink standalone cluster to check the TaskManagers running my job and its tasks, as described here (https://ci.apache.org/projects/flink/flink-docs-stable/tutorials/local_setup.html). I am running everything on my machine.
Why can I not see the job running on the dashboard (http://localhost:8081/#/overview), even though watching the log files (tail -f log/flink--client--*-T430.log) shows that something is being processed?
Moreover, the print() method is writing the output to the console.
I start my application with this command: ./bin/flink run examples/explore-flink.jar -c
But maybe there is some parameter in a config file that I have to set. Here is my code:
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.sense.flink.mqtt.MqttTemperature;
import org.sense.flink.mqtt.TemperatureMqttConsumer;
public class SensorsMultipleReadingMqttEdgentQEP {
private boolean checkpointEnable = true;
private long checkpointInterval = 1000;
private CheckpointingMode checkpointMode = CheckpointingMode.EXACTLY_ONCE;
public SensorsMultipleReadingMqttEdgentQEP() throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);
if (checkpointEnable)
env.enableCheckpointing(checkpointInterval, checkpointMode);
DataStream<MqttTemperature> temperatureStream01 = env.addSource(new TemperatureMqttConsumer("topic-edgent-01"));
DataStream<MqttTemperature> temperatureStream02 = env.addSource(new TemperatureMqttConsumer("topic-edgent-02"));
DataStream<MqttTemperature> temperatureStream03 = env.addSource(new TemperatureMqttConsumer("topic-edgent-03"));
DataStream<MqttTemperature> temperatureStreams = temperatureStream01.union(temperatureStream02)
.union(temperatureStream03);
DataStream<Tuple2<String, Double>> average = temperatureStreams.keyBy(new TemperatureKeySelector())
.map(new AverageTempMapper());
average.print();
String executionPlan = env.getExecutionPlan();
System.out.println("ExecutionPlan ........................ ");
System.out.println(executionPlan);
System.out.println("........................ ");
// env.execute("SensorsMultipleReadingMqttEdgentQEP");
env.execute();
}
public static class TemperatureKeySelector implements KeySelector<MqttTemperature, Integer> {
private static final long serialVersionUID = 5905504239899133953L;
@Override
public Integer getKey(MqttTemperature value) throws Exception {
return value.getId();
}
}
public static class AverageTempMapper extends RichMapFunction<MqttTemperature, Tuple2<String, Double>> {
private static final long serialVersionUID = -5489672634096634902L;
private MapState<String, Double> averageTemp;
@Override
public void open(Configuration parameters) throws Exception {
averageTemp = getRuntimeContext()
.getMapState(new MapStateDescriptor<>("average-temperature", String.class, Double.class));
}
@Override
public Tuple2<String, Double> map(MqttTemperature value) throws Exception {
String key = "no-room";
Double temp = value.getTemp();
if (value.getId().equals(1) || value.getId().equals(2) || value.getId().equals(3)) {
key = "room-A";
} else if (value.getId().equals(4) || value.getId().equals(5) || value.getId().equals(6)) {
key = "room-B";
} else if (value.getId().equals(7) || value.getId().equals(8) || value.getId().equals(9)) {
key = "room-C";
} else {
System.err.println("Sensor not defined in any room.");
}
if (averageTemp.contains(key)) {
temp = (averageTemp.get(key) + value.getTemp()) / 2;
} else {
averageTemp.put(key, temp);
}
return new Tuple2<String, Double>(key, temp);
}
}
}
Thanks,
Felipe
After I selected the option "Extract required libraries into generated JAR" it worked. Strange, because I was generating the JAR with the option "Package required libraries into generated JAR" and that was not working.

When will Flink expire its TimeWindow result from QueryableState?

I have implemented a total word count example with a tumbling window and QueryableState.
I use a 10-second time window; when I print the result it displays the correct count, but when I query the state with the QueryableStateClient it returns the last result of the time window even after the window has changed.
E.g. the word count for 'Nirav' is 5 for the time window 11:00:01 to 11:00:10.
When I query for 'Nirav' at 11:00:50, it still returns the previous count of 5.
So I have two questions:
Is it the default behaviour of Flink's queryable state to keep the last output for a key until new state arrives for that key?
How can I clear the previous result when the time window finishes?
The queryable state implementation is below:
int sec = 10;
Time seconds = Time.seconds(sec);
text.flatMap(new FlatMapFunction<String, WordWithCount>() {
public void flatMap(String value, Collector<WordWithCount> out) {
for (String word : value.split("\\s")) {
out.collect(new WordWithCount(word, 1L));
}
}
})
.keyBy("word")
.timeWindow(seconds)
.reduce(new ReduceFunction<WordWithCount>() {
public WordWithCount reduce(WordWithCount a, WordWithCount b) {
System.out.println("After time window fun:- a.word:" + a.word + ", a.count:" + a.count + ", b.word:" + b.word + ", b.count:" + b.count);
return new WordWithCount(a.word, a.count + b.count);
}
})
.keyBy(wordWithCount -> wordWithCount.word)
.asQueryableState("wordCountQuery", valueStateDescriptor)
Whole implementation
SocketWindowWordCountWithQueryableStateWithTimeWindow.java
package com.nirav.modi;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.state.ReducingStateDescriptor;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
public class SocketWindowWordCountWithQueryableStateWithTimeWindow {
public static void main(String[] args) throws Exception {
// the port to connect to
final int port;
try {
final ParameterTool params = ParameterTool.fromArgs(args);
port = params.getInt("port");
} catch (Exception e) {
System.err.println("No port specified. Please run 'SocketWindowWordCount --port <port>'");
return;
}
// get the execution environment
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(10000, CheckpointingMode.EXACTLY_ONCE);
// get input data by connecting to the socket
DataStream<String> text = env.socketTextStream("localhost", port);
ReduceFunction<WordWithCount> reduceFunction = new ReduceFunction<WordWithCount>() {
public WordWithCount reduce(WordWithCount a, WordWithCount b) {
System.out.println("reduce fun:- a.word:" + a.word + ", a.count:" + a.count + ", b.word:" + b.word + ", b.count:" + b.count);
return new WordWithCount(a.word, a.count + b.count);
}
};
// ReducingStateDescriptor<WordWithCount> descriptor = new ReducingStateDescriptor<WordWithCount>("wordCountQuery", reduceFunction, WordWithCount.class);
ValueStateDescriptor<WordWithCount> valueStateDescriptor = new ValueStateDescriptor<WordWithCount>("wordCountQuery", WordWithCount.class);
int sec = 10;
Time seconds = Time.seconds(sec);
text.flatMap(new FlatMapFunction<String, WordWithCount>() {
public void flatMap(String value, Collector<WordWithCount> out) {
for (String word : value.split("\\s")) {
out.collect(new WordWithCount(word, 1L));
}
}
})
.keyBy("word")
.timeWindow(seconds)
.reduce(new ReduceFunction<WordWithCount>() {
public WordWithCount reduce(WordWithCount a, WordWithCount b) {
System.out.println("After time window fun:- a.word:" + a.word + ", a.count:" + a.count + ", b.word:" + b.word + ", b.count:" + b.count);
return new WordWithCount(a.word, a.count + b.count);
}
}).keyBy(wordWithCount -> wordWithCount.word)
.asQueryableState("wordCountQuery", valueStateDescriptor);
env.getConfig().enableSysoutLogging();
JobGraph jobGraph = env.getStreamGraph().getJobGraph();
System.out.println("[info] Window WordCount with Time Window Job ID: " + jobGraph.getJobID());
System.out.println();
env.execute("Socket Window WordCount with Time Window of " + sec + " seconds");
}
// Data type for words with count
public static class WordWithCount {
public String word;
public long count;
public WordWithCount() {
}
public WordWithCount(String word, long count) {
this.word = word;
this.count = count;
}
@Override
public String toString() {
return word + " : " + count;
}
}
}
QueryStateWithWindowTest.java
package com.nirav.modi;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.queryablestate.client.QueryableStateClient;
import scala.tools.jline_embedded.console.ConsoleReader;
import java.io.PrintWriter;
import java.net.UnknownHostException;
import java.util.concurrent.CompletableFuture;
public class QueryStateWithWindowTest {
public static void main(String[] args) throws Exception {
// the jobId to connect to
final String jobId;
final String queryableStateName;
try {
final ParameterTool params = ParameterTool.fromArgs(args);
jobId = params.get("jobId");
queryableStateName = params.get("queryableStateName");
} catch (Exception e) {
System.err.println("No jobId specified. Please run 'SocketWindowWordCount --jobId <jobId>'");
return;
}
try {
ValueStateDescriptor<SocketWindowWordCountWithQueryableStateWithTimeWindow.WordWithCount> valueStateDescriptor = new ValueStateDescriptor<SocketWindowWordCountWithQueryableStateWithTimeWindow.WordWithCount>("wordCountQuery", SocketWindowWordCountWithQueryableStateWithTimeWindow.WordWithCount.class);
QueryableStateClient client = new QueryableStateClient("truecomtelesoft", 9069);
ExecutionConfig config = new ExecutionConfig();
client.setExecutionConfig(config.enableClosureCleaner());
ConsoleReader reader = new ConsoleReader();
reader.setPrompt("$ ");
PrintWriter out = new PrintWriter(reader.getOutput());
String line;
while ((line = reader.readLine()) != null) {
String key = line.toLowerCase().trim();
out.printf("[info] Querying key '%s'\n", key);
try {
long start = System.currentTimeMillis();
CompletableFuture<ValueState<SocketWindowWordCountWithQueryableStateWithTimeWindow.WordWithCount>> kvState = client.getKvState(JobID.fromHexString(jobId), queryableStateName, key, BasicTypeInfo.STRING_TYPE_INFO, valueStateDescriptor);
try {
SocketWindowWordCountWithQueryableStateWithTimeWindow.WordWithCount wordWithCount = kvState.get().value();
long end = System.currentTimeMillis();
long duration = Math.max(0, end - start);
out.printf("%d (query took %d ms)\n", wordWithCount.count, duration);
} catch (Exception e) {
e.printStackTrace();
}
} catch (Exception e) {
out.println("Query failed because of the following Exception:");
e.printStackTrace(out);
}
}
} catch (UnknownHostException e) {
e.printStackTrace();
}
}
}
The succinct answer to "when will state created by asQueryableState expire?" is never.
asQueryableState gets translated to an operator which uses the incoming records to update a queryable state instance via ValueState.update(value). These values never expire, but are overwritten as new records arrive for a given key. In your test application, this means that the queries are going to return the most recent non-zero count for the given word.
Clearly this isn't what you were trying to accomplish. You could use a ProcessFunction to expire stale entries. To do that, you could explicitly create your own keyed managed state, and store with each count the timestamp of the window that most recently updated the entry. Then you would use a Timer to clear older entries.
See this example of ProcessFunction. To expire the state (which this example doesn't do), call state.clear().
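For illustration only, here is a rough sketch of that approach (not the linked example). It reuses the question's WordWithCount class; the state names, the 10-second TTL and the use of processing-time timers are assumptions:
public static class ExpiringWordCount
        extends KeyedProcessFunction<String, WordWithCount, WordWithCount> {

    private static final long TTL_MS = 10_000; // roughly one window length

    private transient ValueState<WordWithCount> latestCount;
    private transient ValueState<Long> lastUpdate;

    @Override
    public void open(Configuration parameters) {
        ValueStateDescriptor<WordWithCount> countDescriptor =
                new ValueStateDescriptor<>("wordCountQuery", WordWithCount.class);
        countDescriptor.setQueryable("wordCountQuery"); // replaces asQueryableState
        latestCount = getRuntimeContext().getState(countDescriptor);
        lastUpdate = getRuntimeContext().getState(
                new ValueStateDescriptor<>("lastUpdate", Long.class));
    }

    @Override
    public void processElement(WordWithCount value, Context ctx, Collector<WordWithCount> out)
            throws Exception {
        long now = ctx.timerService().currentProcessingTime();
        latestCount.update(value);
        lastUpdate.update(now);
        // schedule a cleanup; timers made stale by a newer update are ignored in onTimer
        ctx.timerService().registerProcessingTimeTimer(now + TTL_MS);
        out.collect(value);
    }

    @Override
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<WordWithCount> out)
            throws Exception {
        Long updated = lastUpdate.value();
        if (updated != null && timestamp >= updated + TTL_MS) {
            latestCount.clear(); // queries for this word now find no value
            lastUpdate.clear();
        }
    }
}
The windowed reduce would then end with .keyBy(wordWithCount -> wordWithCount.word).process(new ExpiringWordCount()) instead of .asQueryableState("wordCountQuery", valueStateDescriptor).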

Shuffle list to maximise distances between similar elements

In a list of URLs
http://a.com/foo
http://b.com/bar
http://a.com/monkey
http://c.com/prune
http://a.com/bear
http://b.com/walrus
http://b.com/baz
http://b.com/plugh
I want to maximise the distance between any pair of a.com's, any pair of b.com's etc. This needs to be cheap but does not have to be optimum. (I am using a list of URLs to download files from websites a.com, b.com, c.com, and do not wish to visit any particular site with a higher frequency than necessary. In the example here, we would hit the b.com site 3 times in succession, which should be avoided.)
I would ideally like a Java library but would settle for pseudocode.
Maximise sum of pairwise distances in array seems to be a similar problem but didn't have a simple answer; I simply want something that's "good enough".
Since there were no answers, I wrote my own. It's very crude but works: it reads a list of URLs, extracts the hosts, counts them, and then fills a pigeon-hole array with indexes proportional to the inverse frequency of the hosts.
package org.xmlcml.cmine.util;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
public class URLShuffler {
public static final Logger LOG = Logger.getLogger(URLShuffler.class);
static {
LOG.setLevel(Level.DEBUG);
}
// in case we need extra pigeonholes, though it doesn't seem necessary for medium-sized problems
private static int TOL = 1;
private List<String> urls;
private Multiset<String> domains;
private Map<String, Integer> currentIndexByDomain;
private Map<String, Integer> countByDomain;
private List<String> outputUrls;
public URLShuffler() {
}
public void readURLs(List<String> urls) {
this.urls= urls;
domains = HashMultiset.create();
for (String url : urls) {
String domain = getDomain(url);
domains.add(domain);
}
LOG.debug(domains);
}
// this would be better using java.net.URL
private String getDomain(String url) {
int idx = url.indexOf("//");
if (idx != -1) {
url = url.substring(idx+2);
}
idx = url.indexOf("/");
String domain = url.substring(0, idx);
return domain;
}
public List<String> getShuffledUrls() {
currentIndexByDomain = new HashMap<String, Integer>();
countByDomain = new HashMap<String, Integer>();
outputUrls = new ArrayList<String>();
for (int i = 0; i < urls.size() * TOL; i++) {
outputUrls.add("");
}
// this is a convenience method wrapping Guava sort.
for (Multiset.Entry<String> entry : CMineUtil.getEntriesSortedByCount(domains)) {
LOG.debug(entry);
countByDomain.put(entry.getElement(), entry.getCount());
currentIndexByDomain.put(entry.getElement(), entry.getCount() - 1);
}
for (String url : urls) {
String domain = getDomain(url);
Integer currentIndex = currentIndexByDomain.get(domain);
Integer count = countByDomain.get(domain);
int slot = (urls.size() * currentIndex * TOL) / count;
currentIndexByDomain.put(domain, currentIndex - 1);
addUrl(url, slot);
}
return outputUrls;
}
private void addUrl(String url, int slot) {
boolean filled = fillLower(url, slot);
if (!filled) {
fillUpper(url, slot);
}
}
// if slot is not free run upwards till next free slot
private boolean fillUpper(String url, int slot) {
for (int i = slot; i < outputUrls.size(); i++) {
if (fill(url, i)) {
return true;
}
}
return false;
}
// if slot is not free run downwards till next free slot
private boolean fillLower(String url, int slot) {
for (int i = slot; i >= 0; i--) {
if (fill(url, i)) {
return true;
}
}
return false;
}
private boolean fill(String url, int slot) {
if (outputUrls.get(slot).equals("")) {
outputUrls.set(slot, url);
return true;
}
return false;
}
}
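For completeness, a small usage sketch, assuming the class above and its CMineUtil helper are on the classpath; the URLs are the ones from the question:
import java.util.Arrays;
import java.util.List;

public class URLShufflerDemo {
    public static void main(String[] args) {
        List<String> urls = Arrays.asList(
                "http://a.com/foo", "http://b.com/bar",
                "http://a.com/monkey", "http://c.com/prune",
                "http://a.com/bear", "http://b.com/walrus",
                "http://b.com/baz", "http://b.com/plugh");
        URLShuffler shuffler = new URLShuffler();
        shuffler.readURLs(urls);
        // hosts should now be spread roughly evenly through the list
        for (String url : shuffler.getShuffledUrls()) {
            System.out.println(url);
        }
    }
}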
