Suppose we have a stream of data with this format:
Example of the input data stream:
case class InputElement(key:String,objectType:String,value:Boolean)
val env = StreamExecutionEnvironment.getExecutionEnvironment
val inputStream: DataStream[InputElement] = env.fromElements(
InputElement("k1","t1",true)
,InputElement("k2","t1",true)
,InputElement("k2","t2",true)
,InputElement("k1","t2",false)
,InputElement("k1","t2",true)
,InputElement("k1","t1",false)
,InputElement("k2","t2",false)
)
Semantically, this is equivalent to having these streams:
val inputStream_k1_t1 = env.fromElements(
InputElement("k1","t1",true),
InputElement("k1","t1",false)
)
val inputStream_k1_t2 = env.fromElements(
InputElement("k1","t2",false),
,InputElement("k1","t2",true)
)
val inputStream_k2_t1 = env.fromElements(
InputElement("k2","t1",true)
)
val inputStream_k2_t2 = env.fromElements(
InputElement("k2","t2",true),
InputElement("k2","t2",false)
)
I want to have an output type like this:
case class OutputElement(key:String,values:Map[String,Boolean])
Expected output data stream for the example input data:
val expectedOutputStream = env.fromElements(
OutputElement("k1",Map( "t1"->true ,"t2"->false)),
OutputElement("k2",Map("t1"->true,"t2"->true)),
OutputElement("k1",Map("t1"->false,"t2"->true)),
OutputElement("k2",Map("t2"->false))
)
==========================================
edit 1:
After some consideration of the problem, the subject of the question has changed:
I want to have another input stream that shows which keys are subscribed to which object types:
case class SubscribeRule(strategy:String,patterns:Set[String])
val subscribeStream: DataStream[SubscribeRule] = env.fromElements(
SubscribeRule("s1",Set("p1","p2")),
SubscribeRule("s2",Set("p1","p2"))
)
Now I want to have this output (the result stream does not emit anything until all of the subscribed object types have been received):
val expectedOutputStream = env.fromElements(
OutputElement("k1",Map( "t1"->true ,"t2"->false)),
OutputElement("k2",Map("t1"->true,"t2"->true)),
OutputElement("k1",Map("t1"->false,"t2"->true)),
// OutputElement("k2",Map("t2"->false)) # this element will emit when a k2-t1 input message recieved
)
App.scala:
import org.apache.flink.api.common.state.MapStateDescriptor
import org.apache.flink.api.scala.createTypeInformation
import org.apache.flink.streaming.api.datastream.BroadcastStream
import org.apache.flink.streaming.api.scala.{DataStream, KeyedStream, StreamExecutionEnvironment}
object App {
case class updateStateResult(updatedState:Map[String,List[Boolean]],output:Map[String,Boolean])
case class InputElement(key:String,objectType:String,passed:Boolean)
case class SubscribeRule(strategy:String,patterns:Set[String])
case class OutputElement(key:String,result:Map[String,Boolean])
def main(args: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment
// checkpoint every 10 seconds
val subscribeStream: DataStream[SubscribeRule] = env.fromElements(
SubscribeRule("s1",Set("p1","p2")),
SubscribeRule("s2",Set("p1","p2"))
)
val broadcastStateDescriptor =
new MapStateDescriptor[String, Set[String]]("subscribes", classOf[String], classOf[Set[String]])
val subscribeStreamBroadcast: BroadcastStream[SubscribeRule] =
subscribeStream.broadcast(broadcastStateDescriptor)
val inputStream = env.fromElements(
InputElement("s1","p1",true),
InputElement("s1","p2",true),
InputElement("s2","p1",false),
InputElement("s2","p2",true),
InputElement("s2","p2",false),
InputElement("s1","p1",false),
InputElement("s2","p1",true),
InputElement("s1","p2",true),
)
val expected = List(
OutputElement("s1",Map("p2"->true,"p1"->true)),
OutputElement("s2",Map("p2"->true,"p1"->false)),
OutputElement("s2",Map("p2"->false,"p1"->true)),
OutputElement("s1",Map("p2"->true,"p1"->false))
)
val keyedInputStream: KeyedStream[InputElement, String] = inputStream.keyBy(_.key)
val result = keyedInputStream
.connect(subscribeStreamBroadcast)
.process(new ZippingFunc())
result.print
env.execute("test stream")
}
}
ZippingFunc.scala:
import App.{InputElement, OutputElement, SubscribeRule, updateStateResult}
import org.apache.flink.api.common.state.{ MapState, MapStateDescriptor, ReadOnlyBroadcastState}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.co.KeyedBroadcastProcessFunction
import org.apache.flink.util.Collector
import java.util.{Map => JavaMap}
import scala.collection.JavaConverters.{iterableAsScalaIterableConverter, mapAsJavaMapConverter}
class ZippingFunc extends KeyedBroadcastProcessFunction[String, InputElement, SubscribeRule, OutputElement] {
private var localState: MapState[String,List[Boolean]] = _
private lazy val broadcastStateDesc =
new MapStateDescriptor[String, Set[String]]("subscribes", classOf[String], classOf[Set[String]])
override def open(parameters: Configuration): Unit = {
val localStateDesc: MapStateDescriptor[String,List[Boolean]] =
new MapStateDescriptor[String, List[Boolean]]("sourceMap1", classOf[String], classOf[List[Boolean]])
localState = getRuntimeContext.getMapState(localStateDesc)
}
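// updateVar: prepend the new value to the list stored for this objectType, then try to pop one
// value per subscribed type via pickOutputs; returns Some(output map) only when every type has a pending value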
def updateVar(objectType:String,value:Boolean): Option[Map[String, Boolean]] ={
val values = localState.get(objectType)
localState.put(objectType, value::values)
pickOutputs(localState.entries().asScala).map((ur: updateStateResult) => {
localState.putAll(ur.updatedState.asJava)
ur.output
})
}
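// pickOutputs: if every objectType list has at least one element, take each head as the output
// and keep the tails as the updated state; otherwise return None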
def pickOutputs(entries: Iterable[JavaMap.Entry[String, List[Boolean]]]): Option[updateStateResult] = {
val mapped: Iterable[Option[(String, Boolean, List[Boolean])]] = entries.map(
(x: JavaMap.Entry[String, List[Boolean]]) => {
val key: String = x.getKey
val value: List[Boolean] = x.getValue
val head: Option[Boolean] = value.headOption
head.map(
h => {
(key, h, value.tail)
}
)
}
)
sequenceOption(mapped).map((x: List[(String, Boolean, List[Boolean])]) => {
updateStateResult(
x.map(y => (y._1, y._3)).toMap,
x.map(y => (y._1, y._2)).toMap
)
}
)
}
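// sequenceOption: turn an Iterable of Options into an Option of a List, None if any element is None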
def sequenceOption[A](l:Iterable[Option[A]]): Option[List[A]] =
{
l.foldLeft[Option[List[A]]](Some(List.empty[A]))(
(acc: Option[List[A]], e: Option[A]) =>{
for {
xs <- acc
x <- e
} yield x :: xs
}
)
}
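// processElement: make sure every pattern this key is subscribed to has a (possibly empty) list in keyed
// state, then record the incoming value and emit an OutputElement as soon as a complete snapshot is available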
override def processElement(value: InputElement, ctx: KeyedBroadcastProcessFunction[String, InputElement, SubscribeRule, OutputElement]#ReadOnlyContext, out: Collector[OutputElement]): Unit = {
val bs: ReadOnlyBroadcastState[String, Set[String]] = ctx.getBroadcastState(broadcastStateDesc)
if(bs.contains(value.key)) {
val allPatterns: Set[String] = bs.get(value.key)
allPatterns.map((patternName: String) =>
if (!localState.contains(patternName))
localState.put(patternName, List.empty)
)
updateVar(value.objectType, value.passed)
.map((r: Map[String, Boolean]) =>
out.collect(OutputElement(value.key, r))
)
}
}
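// processBroadcastElement: store each SubscribeRule in broadcast state, keyed by its strategy name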
override def processBroadcastElement(value: SubscribeRule, ctx: KeyedBroadcastProcessFunction[String, InputElement, SubscribeRule, OutputElement]#Context, out: Collector[OutputElement]): Unit = {
val bs = ctx.getBroadcastState(broadcastStateDesc)
bs.put(value.strategy,value.patterns)
}
}
A Flink streaming application was developed with a filter that deduplicates events based on their id, using key-value state backed by the RocksDB state backend.
Application Code
env.setStateBackend(new RocksDBStateBackend(checkpoint, true).asInstanceOf[StateBackend])
val stream = env
.addSource(kafkaConsumer)
.keyBy(_.id)
.filter(new Deduplication[Stream]("stream-dedup", Time.days(30))).uid("stream-filter")
Deduplication Code
class Deduplication[T](stateDescriptor: String, time: Time) extends RichFilterFunction[T] {
val ttlConfig: StateTtlConfig = StateTtlConfig
.newBuilder(time)
.setUpdateType(StateTtlConfig.UpdateType.OnReadAndWrite)
.setStateVisibility(StateTtlConfig.StateVisibility.NeverReturnExpired)
.cleanupFullSnapshot
.build
val deduplicationStateDescriptor = new ValueStateDescriptor[Boolean](stateDescriptor, classOf[Boolean])
deduplicationStateDescriptor.enableTimeToLive(ttlConfig)
lazy val deduplicationState: ValueState[Boolean] = getRuntimeContext.getState(deduplicationStateDescriptor)
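// Keep an element (return true) only the first time its key is seen within the TTL window; duplicates are dropped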
override def filter(value: T): Boolean = {
if (deduplicationState.value) {
false
} else {
deduplicationState.update(true)
true
}
}
}
All of this works just fine. My goal with this question is to understand how I can read all of the state using the State Processor API. So I started to write some code based on the available documentation.
Savepoint Reading Code
val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
val savepoint = Savepoint
.load(env, savepointPath,new RocksDBStateBackend("file:/tmp/rocksdb", true))
savepoint
.readKeyedState("stream-filter", new DeduplicationStateReader("stream-dedup")).print()
Reader Function Code
class DeduplicationStateReader(stateDescriptor: String) extends KeyedStateReaderFunction[String, String] {
var state: ValueState[Boolean] = _
override def open(parameters: Configuration): Unit = {
val deduplicationStateDescriptor = new ValueStateDescriptor[Boolean](stateDescriptor, classOf[Boolean])
state = getRuntimeContext.getState(deduplicationStateDescriptor)
}
override def readKey(key: String, ctx: KeyedStateReaderFunction.Context, out: Collector[String]): Unit = {
out.collect("IT IS WORKING")
}
}
Whenever I try to read the state, I get a serialization error.
Is there anything wrong? Did I misunderstand all of this?
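One thing worth checking, as a hedged sketch (this assumes the serializer mismatch comes from the TTL wrapper that Flink adds around TTL-enabled state; whether this alone is sufficient also depends on the Flink version): mirror the writer's TTL configuration on the descriptor used in the reader function.
class DeduplicationStateReader(stateDescriptor: String) extends KeyedStateReaderFunction[String, String] {
  private var state: ValueState[Boolean] = _
  override def open(parameters: Configuration): Unit = {
    // Assumption: reuse the same TTL settings as the writing job so the value serializer matches
    val ttlConfig = StateTtlConfig
      .newBuilder(Time.days(30))
      .setUpdateType(StateTtlConfig.UpdateType.OnReadAndWrite)
      .setStateVisibility(StateTtlConfig.StateVisibility.NeverReturnExpired)
      .build
    val desc = new ValueStateDescriptor[Boolean](stateDescriptor, classOf[Boolean])
    desc.enableTimeToLive(ttlConfig)
    state = getRuntimeContext.getState(desc)
  }
  override def readKey(key: String, ctx: KeyedStateReaderFunction.Context, out: Collector[String]): Unit = {
    out.collect(s"$key -> ${state.value()}")
  }
}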
I have the following simple Flink application running within the IDE. It takes a checkpoint every 5 seconds, and I would like the checkpoint data to be written to the directory file:///d:/applog/out/mycheckpoint/, but after running it for a while and stopping the application, I didn't find anything under that directory.
The code is:
import java.util.Date
import io.github.streamingwithflink.util.DateUtil
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.api.scala._
import org.apache.flink.runtime.state.filesystem.FsStateBackend
import org.apache.flink.runtime.state.{FunctionInitializationContext, FunctionSnapshotContext}
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction
import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup
import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
object SourceFunctionExample {
def main(args: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setParallelism(4)
env.getCheckpointConfig.setCheckpointInterval(5 * 1000)
env.getCheckpointConfig.enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
env.setStateBackend(new FsStateBackend("file:///d:/applog/out/mycheckpoint/"))
val numbers: DataStream[Long] = env.addSource(new ReplayableCountSource)
numbers.print()
env.execute()
}
}
class ReplayableCountSource extends SourceFunction[Long] with CheckpointedFunction {
var isRunning: Boolean = true
var cnt: Long = _
var offsetState: ListState[Long] = _
override def run(ctx: SourceFunction.SourceContext[Long]): Unit = {
while (isRunning && cnt < Long.MaxValue) {
ctx.getCheckpointLock.synchronized {
// increment cnt
cnt += 1
ctx.collect(cnt)
}
Thread.sleep(200)
}
}
override def cancel(): Unit = isRunning = false
override def snapshotState(snapshotCtx: FunctionSnapshotContext): Unit = {
println("snapshotState is called at " + DateUtil.format(new Date) + s", cnt is ${cnt}")
// remove previous cnt
offsetState.clear()
// add current cnt
offsetState.add(cnt)
}
override def initializeState(initCtx: FunctionInitializationContext): Unit = {
// obtain operator list state to store the current cnt
val desc = new ListStateDescriptor[Long]("offset", classOf[Long])
offsetState = initCtx.getOperatorStateStore.getListState(desc)
// initialize cnt variable from the checkpoint
val it = offsetState.get()
cnt = if (null == it || !it.iterator().hasNext) {
-1L
} else {
it.iterator().next()
}
println("initializeState is called at " + DateUtil.format(new Date) + s", cnt is ${cnt}")
}
}
I tested the application on Windows and Linux and in both cases the checkpoint files were created as expected.
Note that the program keeps running if a checkpoint fails, for example due to some permission errors or invalid path.
Flink logs a WARN message with the exception that caused the checkpoint to fail.
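If you would rather have the job fail fast when a checkpoint cannot be written, instead of only logging, a small sketch (assuming a reasonably recent Flink version; older releases expose setFailOnCheckpointingErrors instead):
// Sketch: tolerate zero checkpoint failures, so a failed checkpoint fails the job
// instead of only producing a WARN log entry.
env.getCheckpointConfig.setTolerableCheckpointFailureNumber(0)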
I'm trying to write a simple Akka Streams REST endpoint and a client for consuming this stream. But when I run the server and the client, the client is able to consume only part of the stream. I can't see any exception during execution.
Here are my server and client:
import akka.NotUsed
import akka.actor.ActorSystem
import akka.http.scaladsl.Http
import akka.http.scaladsl.common.{EntityStreamingSupport, JsonEntityStreamingSupport}
import akka.http.scaladsl.server.Directives._
import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport
import akka.stream.{ActorAttributes, ActorMaterializer, Attributes, Supervision}
import akka.stream.scaladsl.{Flow, Source}
import akka.util.ByteString
import spray.json.DefaultJsonProtocol
import scala.io.StdIn
import scala.util.Random
object WebServer {
object Model {
case class Person(id: Int = Random.nextInt(), fName: String = Random.nextString(10), sName: String = Random.nextString(10))
}
object JsonProtocol extends SprayJsonSupport with DefaultJsonProtocol {
implicit val personFormat = jsonFormat(Model.Person.apply, "id", "firstName", "secondaryName")
}
def main(args: Array[String]): Unit = {
implicit val system = ActorSystem("my-system")
implicit val materializer = ActorMaterializer()
implicit val executionContext = system.dispatcher
val start = ByteString.empty
val sep = ByteString("\n")
val end = ByteString.empty
import JsonProtocol._
implicit val jsonStreamingSupport: JsonEntityStreamingSupport = EntityStreamingSupport.json()
.withFramingRenderer(Flow[ByteString].intersperse(start, sep, end))
.withParallelMarshalling(parallelism = 8, unordered = false)
val decider: Supervision.Decider = {
case ex: Throwable => {
println("Exception occurs")
ex.printStackTrace()
Supervision.Resume
}
}
val persons: Source[Model.Person, NotUsed] = Source.fromIterator(
() => (0 to 1000000).map(id => Model.Person(id = id)).iterator
)
.withAttributes(ActorAttributes.supervisionStrategy(decider))
.map(p => { println(p); p })
val route =
path("persons") {
get {
complete(persons)
}
}
val bindingFuture = Http().bindAndHandle(route, "localhost", 8080)
println(s"Server online at http://localhost:8080/\nPress RETURN to stop...")
StdIn.readLine()
bindingFuture
.flatMap(_.unbind())
.onComplete(_ => {
println("Stopping http server ...")
system.terminate()
})
}
}
and client:
import akka.actor.ActorSystem
import akka.http.scaladsl.Http
import akka.http.scaladsl.model.{HttpRequest, Uri}
import akka.stream.{ActorAttributes, ActorMaterializer, Supervision}
import scala.util.{Failure, Success}
object WebClient {
def main(args: Array[String]): Unit = {
implicit val system = ActorSystem()
implicit val materializer = ActorMaterializer()
implicit val executionContext = system.dispatcher
val request = HttpRequest(uri = Uri("http://localhost:8080/persons"))
val response = Http().singleRequest(request)
val attributes = ActorAttributes.withSupervisionStrategy {
case ex: Throwable => {
println("Exception occurs")
ex.printStackTrace
Supervision.Resume
}
}
response.map(r => {
r.entity.dataBytes.withAttributes(attributes)
}).onComplete {
case Success(db) => db.map(bs => bs.utf8String).runForeach(println)
case Failure(ex) => ex.printStackTrace()
}
}
}
It works for 100, 1,000, and 10,000 persons but does not work for more than 100,000.
It looks like there is some limit on the stream, but I can't find it.
The last record printed by the server on my local machine is (number 79101):
Person(79101,ⰷ瑐劲죗醂竜泲늎制䠸,䮳硝沢并⎗ᝨᫌꊭᐽ酡)
The last record on the client is (number 79048):
{"id":79048,"firstName":"췁頔䚐龫暀࡙頨捜昗㢵","secondaryName":"⏉ݾ袈庩컆◁ꄹ葪䑥Ϻ"}
Does anybody know why this happens?
I found a solution: I have to explicitly add r.entity.withoutSizeLimit() on the client, and after that everything works as expected.
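For reference, a minimal sketch of the adjusted client consumption; the only change compared to the client above is the withoutSizeLimit() call:
response.map(r => {
  // Lift the default response entity size limit before streaming the bytes
  r.entity.withoutSizeLimit().dataBytes.withAttributes(attributes)
}).onComplete {
  case Success(db) => db.map(_.utf8String).runForeach(println)
  case Failure(ex) => ex.printStackTrace()
}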
I use Source.queue to queue up HttpRequests and throttle them on the client side to download files from a remote server. I understand that Source.queue is not thread-safe and we need to use MergeHub to make it thread-safe. The following is the piece of code that uses Source.queue together with cachedHostConnectionPool.
import java.io.File
import akka.actor.Actor
import akka.event.Logging
import akka.http.scaladsl.Http
import akka.http.scaladsl.client.RequestBuilding
import akka.http.scaladsl.model.{HttpResponse, HttpRequest, Uri}
import akka.stream._
import akka.stream.scaladsl._
import akka.util.ByteString
import com.typesafe.config.ConfigFactory
import scala.concurrent.{Promise, Future}
import scala.concurrent.duration._
import scala.util.{Failure, Success}
class HttpClient extends Actor with RequestBuilding {
implicit val system = context.system
val logger = Logging(system, this)
implicit lazy val materializer = ActorMaterializer()
val config = ConfigFactory.load()
val remoteHost = config.getString("pool.connection.host")
val remoteHostPort = config.getInt("pool.connection.port")
val queueSize = config.getInt("pool.queueSize")
val throttleSize = config.getInt("pool.throttle.numberOfRequests")
val throttleDuration = config.getInt("pool.throttle.duration")
import scala.concurrent.ExecutionContext.Implicits.global
val connectionPool = Http().cachedHostConnectionPool[Promise[HttpResponse]](host = remoteHost, port = remoteHostPort)
// Construct a Queue
val requestQueue =
Source.queue[(HttpRequest, Promise[HttpResponse])](queueSize, OverflowStrategy.backpressure)
.throttle(throttleSize, throttleDuration.seconds, 1, ThrottleMode.shaping)
.via(connectionPool)
.toMat(Sink.foreach({
case ((Success(resp), p)) => p.success(resp)
case ((Failure(error), p)) => p.failure(error)
}))(Keep.left)
.run()
// Convert Promise[HttpResponse] to Future[HttpResponse]
def queueRequest(request: HttpRequest): Future[HttpResponse] = {
val responsePromise = Promise[HttpResponse]()
requestQueue.offer(request -> responsePromise).flatMap {
case QueueOfferResult.Enqueued => responsePromise.future
case QueueOfferResult.Dropped => Future.failed(new RuntimeException("Queue overflowed. Try again later."))
case QueueOfferResult.Failure(ex) => Future.failed(ex)
case QueueOfferResult.QueueClosed => Future.failed(new RuntimeException("Queue was closed (pool shut down) while running the request. Try again later."))
}
}
def receive = {
case "download" =>
val uri = Uri(s"http://localhost:8080/file_csv.csv")
downloadFile(uri, new File("/tmp/compass_audience.csv"))
}
def downloadFile(uri: Uri, destinationFilePath: File) = {
def fileSink: Sink[ByteString, Future[IOResult]] =
Flow[ByteString].buffer(512, OverflowStrategy.backpressure)
.toMat(FileIO.toPath(destinationFilePath.toPath)) (Keep.right)
// Submit to queue and execute HttpRequest and write HttpResponse to file
Source.fromFuture(queueRequest(Get(uri)))
.flatMapConcat(_.entity.dataBytes)
.via(Framing.delimiter(ByteString("\n"), maximumFrameLength = 10000, allowTruncation = true))
.map(_.utf8String)
.map(d => s"$d\n")
.map(ByteString(_))
.runWith(fileSink)
}
}
However, when I use MergeHub, it returns Sink[(HttpRequest, Promise[HttpResponse]), NotUsed]. I need to extract response.entity.dataBytes and write the response to a file using a file sink. I am not able to figure out how to use MergeHub to achieve this. Any help will be appreciated.
val hub: Sink[(HttpRequest, Promise[HttpResponse]), NotUsed] =
MergeHub.source[(HttpRequest, Promise[HttpResponse])](perProducerBufferSize = queueSize)
.throttle(throttleSize, throttleDuration.seconds, 1, ThrottleMode.shaping)
.via(connectionPool)
.toMat(Sink.foreach({
case ((Success(resp), p)) => p.success(resp)
case ((Failure(error), p)) => p.failure(error)
}))(Keep.left)
.run()
Source.queue is actually thread-safe now. If you want to use MergeHub:
private lazy val poolFlow: Flow[(HttpRequest, Promise[HttpResponse]), (Try[HttpResponse], Promise[HttpResponse]), Http.HostConnectionPool] =
Http().cachedHostConnectionPool[Promise[HttpResponse]](host, port, connectionPoolSettings)
val ServerSink =
poolFlow.toMat(Sink.foreach({
case ((Success(resp), p)) => p.success(resp)
case ((Failure(e), p)) => p.failure(e)
}))(Keep.left)
// Attach a MergeHub Source to the consumer. This will materialize to a
// corresponding Sink.
val runnableGraph: RunnableGraph[Sink[(HttpRequest, Promise[HttpResponse]), NotUsed]] =
MergeHub.source[(HttpRequest, Promise[HttpResponse])](perProducerBufferSize = 16).to(ServerSink)
val toConsumer: Sink[(HttpRequest, Promise[HttpResponse]), NotUsed] = runnableGraph.run()
protected[akkahttp] def executeRequest[T](httpRequest: HttpRequest, unmarshal: HttpResponse => Future[T]): Future[T] = {
val responsePromise = Promise[HttpResponse]()
Source.single((httpRequest -> responsePromise)).runWith(toConsumer)
responsePromise.future.flatMap(handleHttpResponse(_, unmarshal))
}
}
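As a follow-up, a sketch of how the earlier downloadFile could be driven through the hub. queueRequestViaHub is a hypothetical helper introduced here, and it assumes the toConsumer sink from the snippet above is in scope:
def queueRequestViaHub(request: HttpRequest): Future[HttpResponse] = {
  val responsePromise = Promise[HttpResponse]()
  // Each caller emits a single (request, promise) pair into the shared MergeHub sink
  Source.single(request -> responsePromise).runWith(toConsumer)
  responsePromise.future
}
def downloadFile(uri: Uri, destinationFilePath: File): Future[IOResult] =
  Source.fromFuture(queueRequestViaHub(Get(uri)))
    .flatMapConcat(_.entity.dataBytes)
    .runWith(FileIO.toPath(destinationFilePath.toPath))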