I am using Apache Flink on Kinesis Data Analytics.
Flink version: 1.13.2
Java: 11
I am consuming JSON messages from Kafka. Sample input records look like this:
null {"plateNumber":"506b9910-74a7-4c3e-a885-b5e9717efe3a","vignetteStickerId":"9e69df3f-d728-4fc8-9b09-42104588f772","currentTimestamp":"2022/04/07 16:19:55","timestamp":1649362795.444459000,"vehicleType":"TRUCK","vehicleModelType":"TOYOTA"}
null {"plateNumber":"5ffe0326-571e-4b97-8f7b-4f49aebb6993","vignetteStickerId":"6c2e1342-b096-4cc9-a92c-df61571c2c7d","currentTimestamp":"2022/04/07 16:20:00","timestamp":1649362800.638060000,"vehicleType":"CAR","vehicleModelType":"HONDA"}
null {"plateNumber":"d15f49f9-5550-4780-b260-83f3116ba64a","vignetteStickerId":"1366fbfe-7d0a-475f-9249-261ef1dd6de2","currentTimestamp":"2022/04/07 16:20:05","timestamp":1649362805.643749000,"vehicleType":"TRUCK","vehicleModelType":"TOYOTA"}
null {"plateNumber":"803508fb-9701-438e-9028-01bb8d96a804","vignetteStickerId":"b534369f-533e-4c15-ac3f-fc28cf0f3aba","currentTimestamp":"2022/04/07 16:20:10","timestamp":1649362810.648813000,"vehicleType":"CAR","vehicleModelType":"FORD"}
I want to compute a sum over these records in 20-second windows, grouped by vehicleType (CAR or TRUCK) and vehicleModelType (TOYOTA, HONDA, or FORD). The SQL analogy would be sum() with GROUP BY vehicleType, vehicleModelType.
I am using an AggregateFunction to achieve this.
import static java.util.Objects.isNull;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.springframework.stereotype.Component;
import com.helecloud.streams.demo.model.Vehicle;
import com.helecloud.streams.demo.model.VehicleStatistics;
@Component
public class VehicleStatisticsAggregator implements AggregateFunction<Vehicle, VehicleStatistics, VehicleStatistics> {
/**
*
*/
private static final long serialVersionUID = 1L;
@Override
public VehicleStatistics createAccumulator() {
System.out.println("Creating Accumulator!!");
return new VehicleStatistics();
}
@Override
public VehicleStatistics add(Vehicle vehicle, VehicleStatistics vehicleStatistics) {
System.out.println("vehicle in add method : " + vehicle);
if (isNull(vehicleStatistics.getVehicleType())) {
vehicleStatistics.setVehicleType(vehicle.getVehicleType());
}
if (isNull(vehicleStatistics.getVehicleModelType())) {
vehicleStatistics.setVehicleModelType(vehicle.getVehicleModelType());
}
// if(isNull(vehicleStatistics.getStart())) {
//
// vehicleStatistics.setStart(vehicle.getTimestamp());
// }
// if(isNull(vehicleStatistics.getCurrentTimestamp())) {
//
// vehicleStatistics.setCurrentTimestamp(vehicle.getCurrentTimestamp());
// }
if (isNull(vehicleStatistics.getCount())) {
vehicleStatistics.setCount(1);
} else {
System.out.println("incrementing count for : vehicleStatistics : " + vehicleStatistics);
vehicleStatistics.setCount(vehicleStatistics.getCount() + 1);
}
vehicleStatistics.setEnd(vehicle.getTimestamp());
System.out.println("vehicleStatistics in add : " + vehicleStatistics);
return vehicleStatistics;
}
@Override
public VehicleStatistics getResult(VehicleStatistics vehicleStatistics) {
System.out.println("vehicleStatistics in getResult : " + vehicleStatistics);
return vehicleStatistics;
}
@Override
public VehicleStatistics merge(VehicleStatistics vehicleStatistics, VehicleStatistics accumulator) {
System.out.println("Coming to merge!!");
VehicleStatistics vs = new VehicleStatistics(
// vehicleStatistics.getStart(),
accumulator.getEnd(),
// vehicleStatistics.getCurrentTimestamp(),
vehicleStatistics.getVehicleType(), vehicleStatistics.getVehicleModelType(),
vehicleStatistics.getCount() + accumulator.getCount());
System.out.println("VehicleStatistics in Merge :" + vs);
return vs;
}
}
In the above code I also do not see the merge method being called.
Below is the main processing code:
@Service
public class ProcessingService {
@Value("${kafka.bootstrap-servers}")
private String kafkaAddress;
@Value("${kafka.group-id}")
private String kafkaGroupId;
public static final String TOPIC = "flink_input";
public static final String VEHICLE_STATISTICS_TOPIC = "flink_output";
@Autowired
private VehicleDeserializationSchema vehicleDeserializationSchema;
@Autowired
private VehicleStatisticsSerializationSchema vehicleStatisticsSerializationSchema;
@PostConstruct
public void startFlinkStreamProcessing() {
try {
processVehicleStatistic();
} catch (Exception e) {
// log.error("Cannot process", e);
e.printStackTrace();
}
}
public void processVehicleStatistic() {
try {
StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
FlinkKafkaConsumer<Vehicle> consumer = createVehicleConsumerForTopic(TOPIC, kafkaAddress, kafkaGroupId);
consumer.setStartFromLatest();
System.out.println("Starting to consume!!");
consumer.assignTimestampsAndWatermarks(WatermarkStrategy.forMonotonousTimestamps());
FlinkKafkaProducer<VehicleStatistics> producer = createVehicleStatisticsProducer(VEHICLE_STATISTICS_TOPIC, kafkaAddress);
DataStream<Vehicle> inputMessagesStream = environment.addSource(consumer);
inputMessagesStream
.keyBy((vehicle -> vehicle.getVehicleType().ordinal()))
// .keyBy(vehicle -> vehicle.getVehicleModelType().ordinal())
// .keyBy(new KeySelector<Vehicle, Tuple2<VehicleType, VehicleModelType>>() {
//
// /**
// *
// */
// private static final long serialVersionUID = 1L;
//
// @Override
// public Tuple2<VehicleType, VehicleModelType> getKey(Vehicle vehicle) throws Exception {
// return Tuple2.of(vehicle.getVehicleType(), vehicle.getVehicleModelType());
// }
// })
// .filter(v -> CAR.equals(v.getVehicleType()))
.window(TumblingEventTimeWindows.of(Time.seconds(20)))
// .windowAll(TumblingEventTimeWindows.of(Time.seconds(10)))
.aggregate(new VehicleStatisticsAggregator())
.addSink(producer);
System.out.println("Adding to Sink!!");
environment.execute("Car Truck Counts By Model");
} catch(Exception e) {
e.printStackTrace();
}
}
private FlinkKafkaConsumer<Vehicle> createVehicleConsumerForTopic(String topic, String kafkaAddress, String kafkaGroup ) {
Properties properties = new Properties();
properties.setProperty("bootstrap.servers", kafkaAddress);
properties.setProperty("group.id", kafkaGroup);
return new FlinkKafkaConsumer<>(topic, vehicleDeserializationSchema, properties);
}
private FlinkKafkaProducer<VehicleStatistics> createVehicleStatisticsProducer(String topic, String kafkaAddress){
return new FlinkKafkaProducer<>(kafkaAddress, topic, vehicleStatisticsSerializationSchema);
}
}
I am getting the following results:
null {"end":1649362835.665466000,"vehicleType":"TRUCK","vehicleModelType":"HONDA","count":3}
null {"end":1649362825.656024000,"vehicleType":"CAR","vehicleModelType":"TOYOTA","count":1}
null {"end":1649362850.675786000,"vehicleType":"CAR","vehicleModelType":"TOYOTA","count":3}
null {"end":1649362855.677596000,"vehicleType":"TRUCK","vehicleModelType":"TOYOTA","count":1}
But is there a way to validate this?
Another question: I am trying to aggregate the result based on multiple keys; is AggregateFunction the correct way to do this?
I am asking because I saw this question: How can I sum multiple fields in Flink?
So if I have to compute a sum over multiple fields, can an AggregateFunction accomplish that (the way I wrote the code)?
Kindly let me know. Thanks in advance.
Merge will only be called if you are using windows that merge -- in other words, if you are using session windows, or a custom merging window.
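For illustration, here is a minimal sketch (reusing the stream, aggregator, and producer names from the question) of a session-window setup in which Flink would actually have to merge window state, so AggregateFunction#merge gets invoked:
import org.apache.flink.streaming.api.windowing.assigners.EventTimeSessionWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
// With session windows, two windows for the same key can be merged when a
// late-arriving element bridges the gap between them; that merge step is
// what calls AggregateFunction#merge on the two accumulators.
inputMessagesStream
    .keyBy(vehicle -> vehicle.getVehicleType().ordinal())
    .window(EventTimeSessionWindows.withGap(Time.seconds(20)))
    .aggregate(new VehicleStatisticsAggregator())
    .addSink(producer);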
The correct way to aggregate based on multiple keys is to use keyBy with a composite type, such as Tuple2<VehicleType, VehicleModelType>. Each time you call keyBy, the stream is repartitioned from scratch (not in addition to any previous partitioning).
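As a sketch of that composite-key approach, assuming VehicleType and VehicleModelType are the enums from the question's model classes:
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
// Key by both fields at once so each (vehicleType, vehicleModelType)
// combination gets its own window state and its own aggregate.
inputMessagesStream
    .keyBy(new KeySelector<Vehicle, Tuple2<VehicleType, VehicleModelType>>() {
        @Override
        public Tuple2<VehicleType, VehicleModelType> getKey(Vehicle vehicle) {
            return Tuple2.of(vehicle.getVehicleType(), vehicle.getVehicleModelType());
        }
    })
    .window(TumblingEventTimeWindows.of(Time.seconds(20)))
    .aggregate(new VehicleStatisticsAggregator())
    .addSink(producer);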
I need to save a dynamic List of PropertyBusinessObjects to Storage, but I couldn't find out how to do that. I should be able to add and remove items from the list.
The following test code throws java.io.IOException: Object type not supported: CollectiveDAO, even though I use registerExternalizable().
public class TestSaveListDAOs {
private Form current;
private Resources theme;
public static List<CollectiveDAO> collectivesDB = new ArrayList<>();
public void init(Object context) {
[...]
}
public void start() {
if(current != null){
current.show();
return;
}
Form hi = new Form("Hi World", BoxLayout.y());
hi.add(new Label("Hi World"));
hi.show();
restoreDB();
collectivesDB.add(new CollectiveDAO());
collectivesDB.add(new CollectiveDAO());
saveDB();
restoreDB();
}
public void stop() {
[...]
}
public void destroy() {
}
public static void saveDB() {
for (CollectiveDAO collectiveDAO : collectivesDB) {
collectiveDAO.getPropertyIndex().registerExternalizable();
}
Storage.getInstance().writeObject("CollectivesDB", collectivesDB);
}
private static void restoreDB() {
Object restoredCollectivesDB = Storage.getInstance().readObject("CollectivesDB");
if (restoredCollectivesDB != null) {
collectivesDB = ((List) restoredCollectivesDB);
}
}
}
public class CollectiveDAO implements PropertyBusinessObject {
public final Property<String, CollectiveDAO> collectiveID = new Property<>("collectiveID");
private final PropertyIndex idx = new PropertyIndex(this, "CollectiveDAO",
collectiveID);
@Override
public PropertyIndex getPropertyIndex() {
return idx;
}
}
I'd use JSON rather than Externalizable, e.g.:
public static void saveDB() {
PropertyIndex.storeJSONList("CollectivesDB", collectivesDB);
}
private static void restoreDB() {
collectivesDB = new CollectiveDAO().getPropertyIndex().loadJSONList("CollectivesDB");
}
It's shorter and has nicer output. However, if you want to use Externalizable, your usage is incorrect. You should remove the for loop and instead do:
public void init(Object context) {
new CollectiveDAO().getPropertyIndex().registerExternalizable();
[...]
}
This is needed only once, for both reading and writing (it was missing for reading), as it registers the new object type (CollectiveDAO).
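For completeness, a minimal usage sketch (assuming the JSON-based saveDB() and restoreDB() shown above) of adding and removing items and persisting the change:
restoreDB();                              // load whatever was stored previously
collectivesDB.add(new CollectiveDAO());   // add an item
if (!collectivesDB.isEmpty()) {
    collectivesDB.remove(0);              // remove an item
}
saveDB();                                 // persist the updated list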
I am trying to access GAE Memcache and Datastore APIs from Dataflow.
I have followed "How to use memcache in dataflow?" and set up the Remote API (https://cloud.google.com/appengine/docs/java/tools/remoteapi).
In my pipeline I have written:
public static void main(String[] args) throws IOException {
RemoteApiOptions remApiOpts = new RemoteApiOptions()
.server("xxx.appspot.com", 443)
.useApplicationDefaultCredential();
RemoteApiInstaller installer = new RemoteApiInstaller();
installer.install(remApiOpts);
try {
DatastoreConfigManager2.registerConfig("myconfig");
final String topic = DatastoreConfigManager2.getString("pubsub.topic");
final String stagingDir = DatastoreConfigManager2.getString("dataflow.staging");
...
bqRows.apply(BigQueryIO.Write
.named("Insert row")
.to(new SerializableFunction<BoundedWindow, String>() {
@Override
public String apply(BoundedWindow window) {
// The cast below is safe because CalendarWindows.days(1) produces IntervalWindows.
IntervalWindow day = (IntervalWindow) window;
String dataset = DatastoreConfigManager2.getString("dataflow.bigquery.dataset");
String tablePrefix = DatastoreConfigManager2.getString("dataflow.bigquery.tablenametemplate");
String dayString = DateTimeFormat.forPattern("yyyyMMdd")
.print(day.start());
String tableName = dataset + "." + tablePrefix + dayString;
LOG.info("Writing to BigQuery " + tableName);
return tableName;
}
})
where DatastoreConfigManager2 is
public class DatastoreConfigManager2 {
private static final DatastoreService DATASTORE = DatastoreServiceFactory.getDatastoreService();
private static final MemcacheService MEMCACHE = MemcacheServiceFactory.getMemcacheService();
static {
MEMCACHE.setErrorHandler(ErrorHandlers.getConsistentLogAndContinue(Level.INFO));
}
private static Set<String> configs = Sets.newConcurrentHashSet();
public static void registerConfig(String name) {
configs.add(name);
}
private static class DatastoreCallbacks {
// https://cloud.google.com/appengine/docs/java/datastore/callbacks
@PostPut
public void updateCacheOnPut(PutContext context) {
Entity entity = context.getCurrentElement();
if (configs.contains(entity.getKind())) {
String id = (String) entity.getProperty("id");
String value = (String) entity.getProperty("value");
MEMCACHE.put(id, value);
}
}
}
private static String lookup(String id) {
String value = (String) MEMCACHE.get(id);
if (value != null) return value;
else {
for (String config : configs) {
try {
PreparedQuery pq = DATASTORE.prepare(new Query(config)
.setFilter(new FilterPredicate("id", FilterOperator.EQUAL, id)));
for (Entity entity : pq.asIterable()) {
value = (String) entity.getProperty("value"); // use last
}
if (value != null) MEMCACHE.put(id, value);
} catch (Exception e) {
e.printStackTrace();
}
}
}
return value;
}
public static String getString(String id) {
return lookup(id);
}
}
When my pipeline runs on Dataflow, I get the following exception:
Caused by: java.lang.NullPointerException
at com.google.appengine.api.NamespaceManager.get(NamespaceManager.java:101)
at com.google.appengine.api.memcache.BaseMemcacheServiceImpl.getEffectiveNamespace(BaseMemcacheServiceImpl.java:65)
at com.google.appengine.api.memcache.AsyncMemcacheServiceImpl.doGet(AsyncMemcacheServiceImpl.java:401)
at com.google.appengine.api.memcache.AsyncMemcacheServiceImpl.get(AsyncMemcacheServiceImpl.java:412)
at com.google.appengine.api.memcache.MemcacheServiceImpl.get(MemcacheServiceImpl.java:49)
at my.training.google.common.config.DatastoreConfigManager2.lookup(DatastoreConfigManager2.java:80)
at my.training.google.common.config.DatastoreConfigManager2.getString(DatastoreConfigManager2.java:117)
at my.training.google.mss.pipeline.InsertIntoBqWithCalendarWindow$1.apply(InsertIntoBqWithCalendarWindow.java:101)
at my.training.google.mss.pipeline.InsertIntoBqWithCalendarWindow$1.apply(InsertIntoBqWithCalendarWindow.java:95)
at com.google.cloud.dataflow.sdk.io.BigQueryIO$Write$Bound$TranslateTableSpecFunction.apply(BigQueryIO.java:1496)
at com.google.cloud.dataflow.sdk.io.BigQueryIO$Write$Bound$TranslateTableSpecFunction.apply(BigQueryIO.java:1486)
at com.google.cloud.dataflow.sdk.io.BigQueryIO$TagWithUniqueIdsAndTable.tableSpecFromWindow(BigQueryIO.java:2641)
at com.google.cloud.dataflow.sdk.io.BigQueryIO$TagWithUniqueIdsAndTable.processElement(BigQueryIO.java:2618)
Any suggestions? Thanks in advance.
EDIT: My functional requirement is to build a pipeline with some configurable steps based on Datastore entries.
I tried to use DbSetup to populate the database, but it did not work.
class CommonOperations {
public static final Operation DELETE_ALL =
deleteAllFrom("ACH_GROUP");
public static final Operation INSERT_REFERENCE_DATA =
sequenceOf(
insertInto("ACH_GROUP")
.columns("ID", "CLOSED", "NAME", "OPENED", "COMPETENCE_ID", "LASTMODIFIEDDATE", "uuid")
.values(1, "", "JAVA", "", 1, "", "")
.build());
}
public class CompetenceDaoImplementationTest {
private static String username = "sa";
private static String password = "12345";
private static String URL = "jdbc:sqlserver://localhost:1433;databaseName=DB_Achi";
// the tracker is static because JUnit uses a separate Test instance for every test method.
private static DbSetupTracker dbSetupTracker = new DbSetupTracker();
@Before
public void prepare() throws Exception {
Operation operation =
sequenceOf(
CommonOperations.DELETE_ALL,
CommonOperations.INSERT_REFERENCE_DATA);
// without DataSource
DbSetup dbSetup = new DbSetup(new DriverManagerDestination(URL, username, password), operation);
// use the tracker to launch the DbSetup.
dbSetupTracker.launchIfNecessary(dbSetup);
}
@Test
public void testShowGroups() {
dbSetupTracker.skipNextLaunch();
}
}
I'm struggling with JDO in GAE (Google App Engine) (GWT 2.4, GAE SDK 1.6.3, and JDO 2.3).
I have a class "Users" which should save a collection of "User" objects in a List, but it doesn't work.
When I save my Users class, it creates the "Users" object in the database and it also creates the User objects in the list users. But when I load the Users object from the database, the list users is empty...
Do I have to load the list myself? I assumed that JDO should load the list directly when I load the Users object.
I need your help here! Thanks in advance!
Could it be a problem that I create the Key in the abstract classes PersistentUser and PersistentUsers?
Could the LinkedList be the problem?
My code:
@PersistenceCapable(identityType = IdentityType.APPLICATION, detachable = "true")
@Version(strategy=VersionStrategy.VERSION_NUMBER)
public class Users extends PersistentUsers implements Serializable{
/**
*
*/
private static final long serialVersionUID = -21666269538993247L;
/**
* Mapped from Operator X
*/
@Persistent
private String operatorId;
@Persistent(mappedBy="userlist")
@Element(dependent = "true")
private List<User> users;
/**
*
* List of Ids of Users
*
*/
@Persistent(serialized = "true")
@Element(dependent = "true")
private List<String> userIds;
/**
* @return the users
*/
public List<User> getUsers() {
return users;
}
/**
* @param users the users to set
*/
public void setUsers(List<User> users) {
this.users = users;
}
...
}
The User Class:
@PersistenceCapable(identityType = IdentityType.APPLICATION, detachable = "true")
@Version(strategy=VersionStrategy.VERSION_NUMBER)
public class User extends PersistentUser implements Serializable{
/**
*
*/
private static final long serialVersionUID = 6899284258473985914L;
@Persistent
private String emailAddress;
@Persistent
private UserRole role;
/**
*
* Mapped from Userlist X from Operator Y
*/
@Persistent
private Users userlist;
public User(String email, UserRole role){
this.emailAddress = email;
this.role = role;
}
public String getEmailAddress() {
return emailAddress;
}
public void setEmailAddress(String emailAddress) {
this.emailAddress = emailAddress;
}
public UserRole getRole() {
return role;
}
public void setRole(UserRole role) {
this.role = role;
}
/**
* @return the userlist
*/
public Users getUserlist() {
return userlist;
}
/**
* @param userlist the userlist to set
*/
public void setUserlist(Users userlist) {
this.userlist = userlist;
}
}
The PersistentUser and PersistentUsers classes have the same content (but, because of a JDO/App Engine inheritance problem, they are two separate classes):
@PersistenceCapable(identityType = IdentityType.APPLICATION, detachable = "true")
@Inheritance(strategy = InheritanceStrategy.SUBCLASS_TABLE)
@Version(strategy=VersionStrategy.VERSION_NUMBER)
public abstract class PersistentUsers implements IPersitentObject {
/**
* Id
*
* Autogenerated String id of the Database
*
*/
@PrimaryKey
@Persistent(valueStrategy = IdGeneratorStrategy.IDENTITY)
protected Key encodedKey;
@Persistent
protected String username;
@Override
public String getId() {
if(encodedKey == null) return null;
return KeyFactory.keyToString(encodedKey);
}
/*public void setId(String id) {
this.encodedKey = id;
}*/
/**
* Helper function - get Version from DB
*/
@Override
public Long getVersion(){
...
}
/**
* Helper function - will save this instance in DB
*/
public void persist(){
...
}
/**
* Helper function - will remove this instance from DB
*/
public void delete(){
...
}
@Override
public final boolean checkUsername() {
...
}
}
Create User Code:
...
if(RequestFactoryServlet.getThreadLocalRequest().getUserPrincipal() != null){
//Create New User
String email = RequestFactoryServlet.getThreadLocalRequest().getUserPrincipal().getName();
User u = UserFactory.getUser(email, UserRole.ADMINISTRATOR);
//u.persist();
//Create New Userlist
Users users = UserFactory.getUsers();
//Get Uids (normally empty)
LinkedList<String> uids = (LinkedList<String>) users.getUserIds();
if(uids==null){
uids = new LinkedList<String>();
}
uids.add(u.getId());
//Get DB-Userlist of current User-List
LinkedList<User> userlist = (LinkedList<User>) users.getUsers();
if(userlist==null){
userlist = new LinkedList<User>();
}
userlist.add(u);
users.setUserIds(uids);
users.setUsers(userlist);
u.setUserlist(users);
//Persist Userlist and Persist User
users.persist();
this.userlistId = users.getId();
}
...
Persistence Code:
public static void persist(IPersitentObject o){
PersistenceManager pm = Pmf.get().getPersistenceManager();
try{
pm.makePersistent(o);
} catch (Exception e) {
e.printStackTrace();
}finally {
pm.close();
}
}
I found the problem/solution.
It was my mistake to think I could see the data just by inspecting the object while debugging.
My code is correct, but the information is not in the object while debugging!
Testing it in a test case showed that it works.
public class UsersTest {
private PersistenceManager pm;
private final LocalServiceTestHelper helper =
new LocalServiceTestHelper(new LocalDatastoreServiceTestConfig());
private String userlistId;
private String userId;
@Before
public void setUp() throws Exception {
helper.setUp();
pm = ch.zhaw.ams.server.core.persistance.Pmf.get().getPersistenceManager();
}
@After
public void tearDown() throws Exception {
}
@Test
public void testNewUsers() {
//New UserList
//Create New Userlist
Users users = UserFactory.getUsers();
//Create New User
String email = "ss";
User u = UserFactory.getUser(email, UserRole.ADMINISTRATOR);
users.getUsers().add(u);
users.persist();
this.userlistId = users.getId();
this.userId = users.getUsers().get(0).getId();
//Test Users
pm = ch.zhaw.ams.server.core.persistance.Pmf.get().getPersistenceManager();
Users ul= pm.getObjectById(Users.class, this.userlistId);
assertNotNull(ul);
assertNotNull(ul.getUsers().get(0));
assertTrue(ul.getUsers().get(0).getId().equals(this.userId));
pm.close();
}
}