My simple Dataflow pipeline successfully copies multiple Kinds from one project's Datastore to another in most cases, but for certain Kinds (about 5% of them) we always get the errors below.
Dataflow retries 4-8 times with a delay of about 75 seconds, and then the pipeline fails.
How can I diagnose and resolve this?
EDIT: The answer turned out to be twofold: (1) there was a bug in the Datastore library used by Dataflow, and once that bug was fixed the underlying cause became visible; and (2) the library's default batch size for putting entities is 500 (which is also the maximum), and a batch that large can exceed the Datastore API's 10 MB commit limit.
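If you are stuck on an affected SDK version, a possible workaround is to cap commit batches by serialized size yourself before writing. This is only a minimal sketch under that assumption: entities are protobuf com.google.datastore.v1.Entity messages, and CommitBatcher/toCommitBatches are hypothetical names, not part of the Dataflow SDK:

import java.util.ArrayList;
import java.util.List;
import com.google.datastore.v1.Entity;

public class CommitBatcher {
    // Datastore commit limits: at most 500 mutations and roughly 10 MB per request.
    private static final int MAX_ENTITIES_PER_COMMIT = 500;
    private static final long MAX_BYTES_PER_COMMIT = 9L * 1024 * 1024; // safety margin under 10 MB

    // Split entities into batches that respect both the count and the size limit.
    public static List<List<Entity>> toCommitBatches(Iterable<Entity> entities) {
        List<List<Entity>> batches = new ArrayList<>();
        List<Entity> current = new ArrayList<>();
        long currentBytes = 0;
        for (Entity e : entities) {
            long size = e.getSerializedSize(); // protobuf-encoded size of this entity
            if (!current.isEmpty()
                    && (current.size() >= MAX_ENTITIES_PER_COMMIT
                        || currentBytes + size > MAX_BYTES_PER_COMMIT)) {
                batches.add(current);
                current = new ArrayList<>();
                currentBytes = 0;
            }
            current.add(e);
            currentBytes += size;
        }
        if (!current.isEmpty()) {
            batches.add(current);
        }
        return batches;
    }
}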
The (very simple) Pipeline looks like this:
Query.Builder qb = Query.newBuilder();
qb.addKindBuilder().setName(kindName);
Query query = qb.build();
Read dsRead = DatastoreIO.v1().read().withProjectId(inputProject).withQuery(query);
Write dsWrite = DatastoreIO.v1().write().withProjectId(outputProject);
PCollection<Entity> sourceEntities = pipeline.apply("read", dsRead);
Bound<Entity, Entity> entityFromSrcToTarget = ParDo.of(new EntityDoFn()); // simple DoFn that copies each Entity for insertion into the target
PCollection<Entity> clonedEntities = sourceEntities.apply("clone-entity", entityFromSrcToTarget);
clonedEntities.apply("write-to-ds", dsWrite);
First stack trace:
com.google.datastore.v1.client.DatastoreException: I/O error, code=UNAVAILABLE
at com.google.datastore.v1.client.RemoteRpc.makeException(RemoteRpc.java:126)
at com.google.datastore.v1.client.RemoteRpc.call(RemoteRpc.java:95)
at com.google.datastore.v1.client.Datastore.commit(Datastore.java:84)
at com.google.cloud.dataflow.sdk.io.datastore.DatastoreV1$DatastoreWriterFn.flushBatch(DatastoreV1.java:925)
at com.google.cloud.dataflow.sdk.io.datastore.DatastoreV1$DatastoreWriterFn.finishBundle(DatastoreV1.java:899)
Caused by: java.io.IOException: insufficient data written
at sun.net.www.protocol.http.HttpURLConnection$StreamingOutputStream.close(HttpURLConnection.java:3500)
at com.google.api.client.http.javanet.NetHttpRequest.execute(NetHttpRequest.java:81)
at com.google.api.client.http.HttpRequest.execute(HttpRequest.java:981)
at com.google.datastore.v1.client.RemoteRpc.call(RemoteRpc.java:87)
at com.google.datastore.v1.client.Datastore.commit(Datastore.java:84)
at com.google.cloud.dataflow.sdk.io.datastore.DatastoreV1$DatastoreWriterFn.flushBatch(DatastoreV1.java:925)
at com.google.cloud.dataflow.sdk.io.datastore.DatastoreV1$DatastoreWriterFn.finishBundle(DatastoreV1.java:899)
at com.google.cloud.dataflow.sdk.util.DoFnRunnerBase.finishBundle(DoFnRunnerBase.java:158)
at com.google.cloud.dataflow.sdk.runners.worker.SimpleParDoFn.finishBundle(SimpleParDoFn.java:196)
at com.google.cloud.dataflow.sdk.runners.worker.ForwardingParDoFn.finishBundle(ForwardingParDoFn.java:47)
at com.google.cloud.dataflow.sdk.util.common.worker.ParDoOperation.finish(ParDoOperation.java:65)
at com.google.cloud.dataflow.sdk.util.common.worker.MapTaskExecutor.execute(MapTaskExecutor.java:80)
at com.google.cloud.dataflow.sdk.runners.worker.DataflowWorker.executeWork(DataflowWorker.java:287)
at com.google.cloud.dataflow.sdk.runners.worker.DataflowWorker.doWork(DataflowWorker.java:223)
at com.google.cloud.dataflow.sdk.runners.worker.DataflowWorker.getAndPerformWork(DataflowWorker.java:173)
at com.google.cloud.dataflow.sdk.runners.worker.DataflowWorkerHarness$WorkerThread.doWork(DataflowWorkerHarness.java:193)
at com.google.cloud.dataflow.sdk.runners.worker.DataflowWorkerHarness$WorkerThread.call(DataflowWorkerHarness.java:173)
at com.google.cloud.dataflow.sdk.runners.worker.DataflowWorkerHarness$WorkerThread.call(DataflowWorkerHarness.java:160)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Second stack trace:
(9908b474b1492772): java.lang.RuntimeException: com.google.cloud.dataflow.sdk.util.UserCodeException: java.lang.RuntimeException: com.google.cloud.dataflow.sdk.util.UserCodeException: java.lang.RuntimeException: com.google.cloud.dataflow.sdk.util.UserCodeException: java.lang.RuntimeException: com.google.cloud.dataflow.sdk.util.UserCodeException: java.lang.RuntimeException: com.google.cloud.dataflow.sdk.util.UserCodeException: java.lang.RuntimeException: com.google.cloud.dataflow.sdk.util.UserCodeException: com.google.datastore.v1.client.DatastoreException: I/O error, code=UNAVAILABLE
at com.google.cloud.dataflow.sdk.runners.worker.SimpleParDoFn$1.output(SimpleParDoFn.java:162)
at com.google.cloud.dataflow.sdk.util.DoFnRunnerBase$DoFnContext.outputWindowedValue(DoFnRunnerBase.java:287)
at com.google.cloud.dataflow.sdk.util.DoFnRunnerBase$DoFnContext.outputWindowedValue(DoFnRunnerBase.java:283)
at com.google.cloud.dataflow.sdk.util.DoFnRunnerBase$DoFnProcessContext$1.outputWindowedValue(DoFnRunnerBase.java:507)
at com.google.cloud.dataflow.sdk.util.GroupAlsoByWindowsViaIteratorsDoFn.processElement(GroupAlsoByWindowsViaIteratorsDoFn.java:125)
at com.google.cloud.dataflow.sdk.util.SimpleDoFnRunner.invokeProcessElement(SimpleDoFnRunner.java:49)
at com.google.cloud.dataflow.sdk.util.DoFnRunnerBase.processElement(DoFnRunnerBase.java:138)
at com.google.cloud.dataflow.sdk.runners.worker.SimpleParDoFn.processElement(SimpleParDoFn.java:190)
at com.google.cloud.dataflow.sdk.runners.worker.ForwardingParDoFn.processElement(ForwardingParDoFn.java:42)
at com.google.cloud.dataflow.sdk.runners.worker.DataflowWorkerLoggingParDoFn.processElement(DataflowWorkerLoggingParDoFn.java:47)
at com.google.cloud.dataflow.sdk.util.common.worker.ParDoOperation.process(ParDoOperation.java:55)
at com.google.cloud.dataflow.sdk.util.common.worker.OutputReceiver.process(OutputReceiver.java:52)
at com.google.cloud.dataflow.sdk.util.common.worker.ReadOperation.runReadLoop(ReadOperation.java:202)
at com.google.cloud.dataflow.sdk.util.common.worker.ReadOperation.start(ReadOperation.java:143)
at com.google.cloud.dataflow.sdk.util.common.worker.MapTaskExecutor.execute(MapTaskExecutor.java:72)
at com.google.cloud.dataflow.sdk.runners.worker.DataflowWorker.executeWork(DataflowWorker.java:287)
at com.google.cloud.dataflow.sdk.runners.worker.DataflowWorker.doWork(DataflowWorker.java:223)
at com.google.cloud.dataflow.sdk.runners.worker.DataflowWorker.getAndPerformWork(DataflowWorker.java:173)
at com.google.cloud.dataflow.sdk.runners.worker.DataflowWorkerHarness$WorkerThread.doWork(DataflowWorkerHarness.java:193)
at com.google.cloud.dataflow.sdk.runners.worker.DataflowWorkerHarness$WorkerThread.call(DataflowWorkerHarness.java:173)
at com.google.cloud.dataflow.sdk.runners.worker.DataflowWorkerHarness$WorkerThread.call(DataflowWorkerHarness.java:160)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Related
I am observing a failure whenever I trigger a savepoint on my Flink application, which otherwise runs without issues.
Job Details:
Deployment: AWS Kinesis Data Analytics (Kubernetes)
5 Task Managers
Backend: RocksDB
Kinesis Data Units: 256 KPU
Flink graph (parallelism indicated in brackets): [screenshot]
Task manager details: [screenshot]
Exception Root Cause on Flink UI:
org.apache.flink.runtime.io.network.netty.exception.RemoteTransportException: Error at remote task manager '142.151.130.161/142.151.130.161:6121'.
at org.apache.flink.runtime.io.network.netty.CreditBasedPartitionRequestClientHandler.decodeMsg(CreditBasedPartitionRequestClientHandler.java:294)
at org.apache.flink.runtime.io.network.netty.CreditBasedPartitionRequestClientHandler.channelRead(CreditBasedPartitionRequestClientHandler.java:183)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:352)
at org.apache.flink.runtime.io.network.netty.NettyMessageClientDecoderDelegate.channelRead(NettyMessageClientDecoderDelegate.java:115)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:352)
at org.apache.flink.shaded.netty4.io.netty.handler.ssl.SslHandler.unwrap(SslHandler.java:1475)
at org.apache.flink.shaded.netty4.io.netty.handler.ssl.SslHandler.decodeJdkCompatible(SslHandler.java:1224)
at org.apache.flink.shaded.netty4.io.netty.handler.ssl.SslHandler.decode(SslHandler.java:1271)
at org.apache.flink.shaded.netty4.io.netty.handler.codec.ByteToMessageDecoder.decodeRemovalReentryProtection(ByteToMessageDecoder.java:505)
at org.apache.flink.shaded.netty4.io.netty.handler.codec.ByteToMessageDecoder.callDecode(ByteToMessageDecoder.java:444)
at org.apache.flink.shaded.netty4.io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:283)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:352)
at org.apache.flink.shaded.netty4.io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1421)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360)
at org.apache.flink.shaded.netty4.io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:930)
at org.apache.flink.shaded.netty4.io.netty.channel.epoll.AbstractEpollStreamChannel$EpollStreamUnsafe.epollInReady(AbstractEpollStreamChannel.java:794)
at org.apache.flink.shaded.netty4.io.netty.channel.epoll.EpollEventLoop.processReady(EpollEventLoop.java:424)
at org.apache.flink.shaded.netty4.io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:326)
at org.apache.flink.shaded.netty4.io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:918)
at org.apache.flink.shaded.netty4.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.flink.runtime.io.network.partition.ProducerFailedException: java.lang.OutOfMemoryError: unable to create native thread: possibly out of memory or process/resource limits reached
at org.apache.flink.runtime.io.network.netty.PartitionRequestQueue.writeAndFlushNextMessageIfPossible(PartitionRequestQueue.java:224)
at org.apache.flink.runtime.io.network.netty.PartitionRequestQueue.enqueueAvailableReader(PartitionRequestQueue.java:108)
at org.apache.flink.runtime.io.network.netty.PartitionRequestQueue.userEventTriggered(PartitionRequestQueue.java:173)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeUserEventTriggered(AbstractChannelHandlerContext.java:341)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeUserEventTriggered(AbstractChannelHandlerContext.java:327)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.fireUserEventTriggered(AbstractChannelHandlerContext.java:319)
at org.apache.flink.shaded.netty4.io.netty.channel.ChannelInboundHandlerAdapter.userEventTriggered(ChannelInboundHandlerAdapter.java:117)
at org.apache.flink.shaded.netty4.io.netty.handler.codec.ByteToMessageDecoder.userEventTriggered(ByteToMessageDecoder.java:369)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeUserEventTriggered(AbstractChannelHandlerContext.java:341)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeUserEventTriggered(AbstractChannelHandlerContext.java:327)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.fireUserEventTriggered(AbstractChannelHandlerContext.java:319)
at org.apache.flink.shaded.netty4.io.netty.channel.ChannelInboundHandlerAdapter.userEventTriggered(ChannelInboundHandlerAdapter.java:117)
at org.apache.flink.shaded.netty4.io.netty.handler.codec.ByteToMessageDecoder.userEventTriggered(ByteToMessageDecoder.java:369)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeUserEventTriggered(AbstractChannelHandlerContext.java:341)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeUserEventTriggered(AbstractChannelHandlerContext.java:327)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.fireUserEventTriggered(AbstractChannelHandlerContext.java:319)
at org.apache.flink.shaded.netty4.io.netty.channel.DefaultChannelPipeline$HeadContext.userEventTriggered(DefaultChannelPipeline.java:1439)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeUserEventTriggered(AbstractChannelHandlerContext.java:341)
at org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeUserEventTriggered(AbstractChannelHandlerContext.java:327)
at org.apache.flink.shaded.netty4.io.netty.channel.DefaultChannelPipeline.fireUserEventTriggered(DefaultChannelPipeline.java:924)
at org.apache.flink.runtime.io.network.netty.PartitionRequestQueue.lambda$notifyReaderNonEmpty$0(PartitionRequestQueue.java:87)
at org.apache.flink.shaded.netty4.io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:163)
at org.apache.flink.shaded.netty4.io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:416)
at org.apache.flink.shaded.netty4.io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:331)
... 3 more
Caused by: java.lang.OutOfMemoryError: unable to create native thread: possibly out of memory or process/resource limits reached
at java.base/java.lang.Thread.start0(Native Method)
at java.base/java.lang.Thread.start(Thread.java:798)
at java.base/java.util.concurrent.ThreadPoolExecutor.addWorker(ThreadPoolExecutor.java:937)
at java.base/java.util.concurrent.ThreadPoolExecutor.execute(ThreadPoolExecutor.java:1354)
at org.apache.flink.streaming.runtime.tasks.SubtaskCheckpointCoordinatorImpl.finishAndReportAsync(SubtaskCheckpointCoordinatorImpl.java:451)
at org.apache.flink.streaming.runtime.tasks.SubtaskCheckpointCoordinatorImpl.checkpointState(SubtaskCheckpointCoordinatorImpl.java:267)
at org.apache.flink.streaming.runtime.tasks.StreamTask.lambda$performCheckpoint$5(StreamTask.java:917)
at org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor$1.runThrowing(StreamTaskActionExecutor.java:47)
at org.apache.flink.streaming.runtime.tasks.StreamTask.performCheckpoint(StreamTask.java:907)
at org.apache.flink.streaming.runtime.tasks.StreamTask.triggerCheckpointOnBarrier(StreamTask.java:873)
at org.apache.flink.streaming.runtime.io.CheckpointBarrierHandler.notifyCheckpoint(CheckpointBarrierHandler.java:113)
at org.apache.flink.streaming.runtime.io.CheckpointBarrierAligner.processBarrier(CheckpointBarrierAligner.java:198)
at org.apache.flink.streaming.runtime.io.CheckpointedInputGate.pollNext(CheckpointedInputGate.java:93)
at org.apache.flink.streaming.runtime.io.StreamTaskNetworkInput.emitNext(StreamTaskNetworkInput.java:158)
at org.apache.flink.streaming.runtime.io.StreamOneInputProcessor.processInput(StreamOneInputProcessor.java:67)
at org.apache.flink.streaming.runtime.tasks.StreamTask.processInput(StreamTask.java:346)
at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.runMailboxStep(MailboxProcessor.java:191)
at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.runMailboxLoop(MailboxProcessor.java:181)
at org.apache.flink.streaming.runtime.tasks.StreamTask.runMailboxLoop(StreamTask.java:566)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:537)
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:724)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:549)
... 1 more
Any help debugging this issue would be appreciated.
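For what it's worth, the root cause in the trace is java.lang.OutOfMemoryError: unable to create native thread, thrown while the checkpoint coordinator submits its async reporting task; that usually points at an OS-level thread or process limit rather than JVM heap exhaustion. A generic diagnostic sketch (plain JMX, not a Flink API) that reports thread counts for the JVM it runs in; to inspect a task manager you would log these counts from inside the job or point remote JMX at the task manager process:

import java.lang.management.ManagementFactory;
import java.lang.management.ThreadMXBean;

public class ThreadCountProbe {
    public static void main(String[] args) throws InterruptedException {
        ThreadMXBean threads = ManagementFactory.getThreadMXBean();
        while (true) {
            // A steadily climbing live/peak count around savepoint time would
            // support the "thread/resource limit reached" hypothesis.
            System.out.printf("live=%d peak=%d daemon=%d%n",
                    threads.getThreadCount(),
                    threads.getPeakThreadCount(),
                    threads.getDaemonThreadCount());
            Thread.sleep(5_000);
        }
    }
}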
I'm running my cluster on kubernetes with a single jobmanager and 2 taskmanagers.
I tested the mechanism of checkpoint by killing one of the taskmanager pods while the job is running.
I got the following exceptions on the jobmanager and the restarted taskmanager:
Jobmanager exception:
java.lang.Exception: Exception while creating StreamOperatorStateContext.
at org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.streamOperatorStateContext(StreamTaskStateInitializerImpl.java:195)
at org.apache.flink.streaming.api.operators.AbstractStreamOperator.initializeState(AbstractStreamOperator.java:253)
at org.apache.flink.streaming.runtime.tasks.StreamTask.initializeState(StreamTask.java:881)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:395)
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:705)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:530)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.flink.util.FlinkException: Could not restore keyed state backend for WindowOperator_54288f79b169ee3e8cb1feb33bbad4c3_(1/8) from any of the 1 provided restore options.
at org.apache.flink.streaming.api.operators.BackendRestorerProcedure.createAndRestore(BackendRestorerProcedure.java:135)
at org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.keyedStatedBackend(StreamTaskStateInitializerImpl.java:307)
at org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.streamOperatorStateContext(StreamTaskStateInitializerImpl.java:135)
... 6 more
Caused by: org.apache.flink.runtime.state.BackendBuildingException: Caught unexpected exception.
at org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackendBuilder.build(RocksDBKeyedStateBackendBuilder.java:326)
at org.apache.flink.contrib.streaming.state.RocksDBStateBackend.createKeyedStateBackend(RocksDBStateBackend.java:520)
at org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.lambda$keyedStatedBackend$1(StreamTaskStateInitializerImpl.java:291)
at org.apache.flink.streaming.api.operators.BackendRestorerProcedure.attemptCreateAndRestore(BackendRestorerProcedure.java:142)
at org.apache.flink.streaming.api.operators.BackendRestorerProcedure.createAndRestore(BackendRestorerProcedure.java:121)
... 8 more
Caused by: java.nio.file.NoSuchFileException: /rocksdb/job_0a1a61f5cbecc09fbaef1257b3392b3a_op_WindowOperator_54288f79b169ee3e8cb1feb33bbad4c3__1_8__uuid_8b95eb2f-f6cf-4c35-8274-a9055376163d/db/000021.sst -> /rocksdb/job_0a1a61f5cbecc09fbaef1257b3392b3a_op_WindowOperator_54288f79b169ee3e8cb1feb33bbad4c3__1_8__uuid_8b95eb2f-f6cf-4c35-8274-a9055376163d/f1a97117-3810-400e-85ca-6e8c998a5ed4/000021.sst
at sun.nio.fs.UnixException.translateToIOException(UnixException.java:86)
at sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:102)
at sun.nio.fs.UnixFileSystemProvider.createLink(UnixFileSystemProvider.java:476)
at java.nio.file.Files.createLink(Files.java:1086)
at org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restoreInstanceDirectoryFromPath(RocksDBIncrementalRestoreOperation.java:473)
at org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restoreFromLocalState(RocksDBIncrementalRestoreOperation.java:212)
at org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restoreFromRemoteState(RocksDBIncrementalRestoreOperation.java:188)
at org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restoreWithoutRescaling(RocksDBIncrementalRestoreOperation.java:162)
at org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restore(RocksDBIncrementalRestoreOperation.java:148)
at org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackendBuilder.build(RocksDBKeyedStateBackendBuilder.java:270)
... 12 more
Taskmanager exception:
2020-01-13 09:26:01,943 ERROR org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackendBuilder - Caught unexpected exception.
org.apache.flink.fs.s3base.shaded.com.amazonaws.SdkClientException: Failed to sanitize XML document destined for handler class org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.model.transform.XmlResponsesSaxParser$ListBucketHandler
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.model.transform.XmlResponsesSaxParser.sanitizeXmlDocument(XmlResponsesSaxParser.java:219)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.model.transform.XmlResponsesSaxParser.parseListBucketObjectsResponse(XmlResponsesSaxParser.java:317)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.model.transform.Unmarshallers$ListObjectsUnmarshaller.unmarshall(Unmarshallers.java:70)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.model.transform.Unmarshallers$ListObjectsUnmarshaller.unmarshall(Unmarshallers.java:59)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.internal.S3XmlResponseHandler.handle(S3XmlResponseHandler.java:62)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.internal.S3XmlResponseHandler.handle(S3XmlResponseHandler.java:31)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.response.AwsResponseHandlerAdapter.handle(AwsResponseHandlerAdapter.java:70)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleResponse(AmazonHttpClient.java:1554)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1272)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1056)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:743)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:717)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:699)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:667)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:649)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:513)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4325)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4272)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4266)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.AmazonS3Client.listObjects(AmazonS3Client.java:834)
at org.apache.flink.fs.s3presto.shaded.com.facebook.presto.hive.s3.PrestoS3FileSystem.listPrefix(PrestoS3FileSystem.java:484)
at org.apache.flink.fs.s3presto.shaded.com.facebook.presto.hive.s3.PrestoS3FileSystem.access$000(PrestoS3FileSystem.java:112)
at org.apache.flink.fs.s3presto.shaded.com.facebook.presto.hive.s3.PrestoS3FileSystem$1.<init>(PrestoS3FileSystem.java:271)
at org.apache.flink.fs.s3presto.shaded.com.facebook.presto.hive.s3.PrestoS3FileSystem.listLocatedStatus(PrestoS3FileSystem.java:269)
at org.apache.flink.fs.s3presto.shaded.com.facebook.presto.hive.s3.PrestoS3FileSystem.listStatus(PrestoS3FileSystem.java:258)
at org.apache.flink.fs.s3.common.hadoop.HadoopFileSystem.listStatus(HadoopFileSystem.java:157)
at org.apache.flink.core.fs.SafetyNetWrapperFileSystem.listStatus(SafetyNetWrapperFileSystem.java:97)
at org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restoreInstanceDirectoryFromPath(RocksDBIncrementalRestoreOperation.java:460)
at org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restoreFromLocalState(RocksDBIncrementalRestoreOperation.java:212)
at org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restoreFromRemoteState(RocksDBIncrementalRestoreOperation.java:188)
at org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restoreWithoutRescaling(RocksDBIncrementalRestoreOperation.java:162)
at org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restore(RocksDBIncrementalRestoreOperation.java:148)
at org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackendBuilder.build(RocksDBKeyedStateBackendBuilder.java:270)
at org.apache.flink.contrib.streaming.state.RocksDBStateBackend.createKeyedStateBackend(RocksDBStateBackend.java:520)
at org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.lambda$keyedStatedBackend$1(StreamTaskStateInitializerImpl.java:291)
at org.apache.flink.streaming.api.operators.BackendRestorerProcedure.attemptCreateAndRestore(BackendRestorerProcedure.java:142)
at org.apache.flink.streaming.api.operators.BackendRestorerProcedure.createAndRestore(BackendRestorerProcedure.java:121)
at org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.keyedStatedBackend(StreamTaskStateInitializerImpl.java:307)
at org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.streamOperatorStateContext(StreamTaskStateInitializerImpl.java:135)
at org.apache.flink.streaming.api.operators.AbstractStreamOperator.initializeState(AbstractStreamOperator.java:253)
at org.apache.flink.streaming.runtime.tasks.StreamTask.initializeState(StreamTask.java:881)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:395)
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:705)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:530)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.flink.fs.s3base.shaded.com.amazonaws.AbortedException:
at org.apache.flink.fs.s3base.shaded.com.amazonaws.internal.SdkFilterInputStream.abortIfNeeded(SdkFilterInputStream.java:53)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.internal.SdkFilterInputStream.read(SdkFilterInputStream.java:81)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.event.ProgressInputStream.read(ProgressInputStream.java:180)
at sun.nio.cs.StreamDecoder.readBytes(StreamDecoder.java:284)
at sun.nio.cs.StreamDecoder.implRead(StreamDecoder.java:326)
at sun.nio.cs.StreamDecoder.read(StreamDecoder.java:178)
at java.io.InputStreamReader.read(InputStreamReader.java:184)
at java.io.BufferedReader.read1(BufferedReader.java:210)
at java.io.BufferedReader.read(BufferedReader.java:286)
at java.io.Reader.read(Reader.java:140)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.model.transform.XmlResponsesSaxParser.sanitizeXmlDocument(XmlResponsesSaxParser.java:191)
... 44 more
2020-01-13 09:26:01,944 WARN org.apache.flink.streaming.api.operators.BackendRestorerProcedure - Exception while restoring keyed state backend for WindowOperator_54288f79b169ee3e8cb1feb33bbad4c3_(7/8) from alternative (1/1), will retry while more alternatives are available.
org.apache.flink.runtime.state.BackendBuildingException: Caught unexpected exception.
at org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackendBuilder.build(RocksDBKeyedStateBackendBuilder.java:326)
at org.apache.flink.contrib.streaming.state.RocksDBStateBackend.createKeyedStateBackend(RocksDBStateBackend.java:520)
at org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.lambda$keyedStatedBackend$1(StreamTaskStateInitializerImpl.java:291)
at org.apache.flink.streaming.api.operators.BackendRestorerProcedure.attemptCreateAndRestore(BackendRestorerProcedure.java:142)
at org.apache.flink.streaming.api.operators.BackendRestorerProcedure.createAndRestore(BackendRestorerProcedure.java:121)
at org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.keyedStatedBackend(StreamTaskStateInitializerImpl.java:307)
at org.apache.flink.streaming.api.operators.StreamTaskStateInitializerImpl.streamOperatorStateContext(StreamTaskStateInitializerImpl.java:135)
at org.apache.flink.streaming.api.operators.AbstractStreamOperator.initializeState(AbstractStreamOperator.java:253)
at org.apache.flink.streaming.runtime.tasks.StreamTask.initializeState(StreamTask.java:881)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:395)
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:705)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:530)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.flink.fs.s3base.shaded.com.amazonaws.SdkClientException: Failed to sanitize XML document destined for handler class org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.model.transform.XmlResponsesSaxParser$ListBucketHandler
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.model.transform.XmlResponsesSaxParser.sanitizeXmlDocument(XmlResponsesSaxParser.java:219)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.model.transform.XmlResponsesSaxParser.parseListBucketObjectsResponse(XmlResponsesSaxParser.java:317)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.model.transform.Unmarshallers$ListObjectsUnmarshaller.unmarshall(Unmarshallers.java:70)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.model.transform.Unmarshallers$ListObjectsUnmarshaller.unmarshall(Unmarshallers.java:59)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.internal.S3XmlResponseHandler.handle(S3XmlResponseHandler.java:62)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.internal.S3XmlResponseHandler.handle(S3XmlResponseHandler.java:31)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.response.AwsResponseHandlerAdapter.handle(AwsResponseHandlerAdapter.java:70)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleResponse(AmazonHttpClient.java:1554)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1272)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1056)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:743)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:717)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:699)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:667)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:649)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:513)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4325)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4272)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4266)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.AmazonS3Client.listObjects(AmazonS3Client.java:834)
at org.apache.flink.fs.s3presto.shaded.com.facebook.presto.hive.s3.PrestoS3FileSystem.listPrefix(PrestoS3FileSystem.java:484)
at org.apache.flink.fs.s3presto.shaded.com.facebook.presto.hive.s3.PrestoS3FileSystem.access$000(PrestoS3FileSystem.java:112)
at org.apache.flink.fs.s3presto.shaded.com.facebook.presto.hive.s3.PrestoS3FileSystem$1.<init>(PrestoS3FileSystem.java:271)
at org.apache.flink.fs.s3presto.shaded.com.facebook.presto.hive.s3.PrestoS3FileSystem.listLocatedStatus(PrestoS3FileSystem.java:269)
at org.apache.flink.fs.s3presto.shaded.com.facebook.presto.hive.s3.PrestoS3FileSystem.listStatus(PrestoS3FileSystem.java:258)
at org.apache.flink.fs.s3.common.hadoop.HadoopFileSystem.listStatus(HadoopFileSystem.java:157)
at org.apache.flink.core.fs.SafetyNetWrapperFileSystem.listStatus(SafetyNetWrapperFileSystem.java:97)
at org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restoreInstanceDirectoryFromPath(RocksDBIncrementalRestoreOperation.java:460)
at org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restoreFromLocalState(RocksDBIncrementalRestoreOperation.java:212)
at org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restoreFromRemoteState(RocksDBIncrementalRestoreOperation.java:188)
at org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restoreWithoutRescaling(RocksDBIncrementalRestoreOperation.java:162)
at org.apache.flink.contrib.streaming.state.restore.RocksDBIncrementalRestoreOperation.restore(RocksDBIncrementalRestoreOperation.java:148)
at org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackendBuilder.build(RocksDBKeyedStateBackendBuilder.java:270)
... 12 more
Caused by: org.apache.flink.fs.s3base.shaded.com.amazonaws.AbortedException:
at org.apache.flink.fs.s3base.shaded.com.amazonaws.internal.SdkFilterInputStream.abortIfNeeded(SdkFilterInputStream.java:53)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.internal.SdkFilterInputStream.read(SdkFilterInputStream.java:81)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.event.ProgressInputStream.read(ProgressInputStream.java:180)
at sun.nio.cs.StreamDecoder.readBytes(StreamDecoder.java:284)
at sun.nio.cs.StreamDecoder.implRead(StreamDecoder.java:326)
at sun.nio.cs.StreamDecoder.read(StreamDecoder.java:178)
at java.io.InputStreamReader.read(InputStreamReader.java:184)
at java.io.BufferedReader.read1(BufferedReader.java:210)
at java.io.BufferedReader.read(BufferedReader.java:286)
at java.io.Reader.read(Reader.java:140)
at org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.model.transform.XmlResponsesSaxParser.sanitizeXmlDocument(XmlResponsesSaxParser.java:191)
... 44 more
When I tried to restore from a savepoint, everything works as expected.
Any idea?
In our experience, one possible cause is that you first hit the exception below:
Caused by: org.apache.flink.fs.s3base.shaded.com.amazonaws.SdkClientException: Failed to sanitize XML document destined for handler class org.apache.flink.fs.s3base.shaded.com.amazonaws.services.s3.model.transform.XmlResponsesSaxParser$ListBucketHandler
The procedure that restores the RocksDB state backend is then interrupted, leaving the file /rocksdb/job_0a1a61f5cbecc09fbaef1257b3392b3a_op_WindowOperator_54288f79b169ee3e8cb1feb33bbad4c3__1_8__uuid_8b95eb2f-f6cf-4c35-8274-a9055376163d/f1a97117-3810-400e-85ca-6e8c998a5ed4/000021.sst deleted; see https://github.com/apache/flink/blob/390926e61aeb69837c70a024ad6e7ff02eccdf2d/flink-state-backends/flink-statebackend-rocksdb/src/main/java/org/apache/flink/contrib/streaming/state/restore/RocksDBIncrementalRestoreOperation.java#L197
That's why you found the NoSuchFileException.
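For reference, here is a minimal illustration of how that interruption surfaces as the NoSuchFileException: judging from the stack trace, restoreInstanceDirectoryFromPath hard-links each SST file from the downloaded restore directory into the RocksDB instance directory, and java.nio throws exactly this exception when the link source has already been removed (the paths below are copied from the exception above):

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class RestoreLinkIllustration {
    public static void main(String[] args) throws Exception {
        // Hard link to create inside the RocksDB instance directory:
        Path link = Paths.get("/rocksdb/job_0a1a61f5cbecc09fbaef1257b3392b3a_op_WindowOperator_54288f79b169ee3e8cb1feb33bbad4c3__1_8__uuid_8b95eb2f-f6cf-4c35-8274-a9055376163d/db/000021.sst");
        // The downloaded SST file the link should point at:
        Path existing = Paths.get("/rocksdb/job_0a1a61f5cbecc09fbaef1257b3392b3a_op_WindowOperator_54288f79b169ee3e8cb1feb33bbad4c3__1_8__uuid_8b95eb2f-f6cf-4c35-8274-a9055376163d/f1a97117-3810-400e-85ca-6e8c998a5ed4/000021.sst");
        // If the interrupted restore already deleted the downloaded file,
        // this throws java.nio.file.NoSuchFileException: link -> existing.
        Files.createLink(link, existing);
    }
}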
It seems I can never get the error handling right when using Akka Streams.
So this is my code:
var db = Database.forConfig("oracle")
var mysqlDb = Database.forConfig("mysql_read")
var mysqlDbWrite = Database.forConfig("mysql_write")
implicit val actorSystem = ActorSystem()
val decider : Supervision.Decider = {
  case _: Exception =>
    println("got an exception restarting connections")
    // let us restart our connections
    db.close()
    mysqlDb.close()
    mysqlDbWrite.close()
    db = Database.forConfig("oracle")
    mysqlDb = Database.forConfig("mysql_read")
    mysqlDbWrite = Database.forConfig("mysql_write")
    Supervision.Restart
}
implicit val materializer = ActorMaterializer(ActorMaterializerSettings(actorSystem).withSupervisionStrategy(decider))
and I have a flow like this:
val alreadyExistsFilter : Flow[Foo, Foo, NotUsed] = Flow[Foo].mapAsync(10) { foo =>
  try {
    val existsQuery = sql"""SELECT id FROM foo WHERE id = ${foo.id}""".as[Long]
    mysqlDbWrite.run(existsQuery).map(v => (foo, v))
  } catch {
    case e: Throwable =>
      println(s"Lookup failed for ${foo}")
      throw e // will restart the stream
  }
}.collect { case (f, v) if v.isEmpty => f }
So basically if the foo already exists in MySQL then the record should not be processed any further by the stream.
My hope with this code was that if anything fails with the MySQL lookup (the MySQL machine is pretty bad and timeouts are common), the record will be printed and discarded, and the stream will continue with the remaining records courtesy of the supervision.
When I run this code, I see errors like:
[error] (mysql_write network timeout executor) java.lang.RuntimeException: java.sql.SQLException: Invalid socket timeout value or state
java.lang.RuntimeException: java.sql.SQLException: Invalid socket timeout value or state
at com.mysql.jdbc.ConnectionImpl$12.run(ConnectionImpl.java:5576)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.sql.SQLException: Invalid socket timeout value or state
at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:998)
at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:937)
at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:926)
at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:872)
at com.mysql.jdbc.MysqlIO.setSocketTimeout(MysqlIO.java:4852)
at com.mysql.jdbc.ConnectionImpl$12.run(ConnectionImpl.java:5574)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.net.SocketException: Socket is closed
at java.net.Socket.setSoTimeout(Socket.java:1137)
at com.mysql.jdbc.MysqlIO.setSocketTimeout(MysqlIO.java:4850)
at com.mysql.jdbc.ConnectionImpl$12.run(ConnectionImpl.java:5574)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
and
[error] (mysql_write network timeout executor) java.lang.NullPointerException
java.lang.NullPointerException
at com.mysql.jdbc.MysqlIO.setSocketTimeout(MysqlIO.java:4850)
at com.mysql.jdbc.ConnectionImpl$12.run(ConnectionImpl.java:5574)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
One thing which surprises me here is that these exceptions don't come from my catch block, because I don't see its println statement. The stack trace doesn't show me where they originated from... but since it says mysql_write, I can assume it's the Flow above, because only this Flow uses mysql_write.
Finally, the entire stream crashes with the error:
[trace] Stack trace suppressed: run last compile:runMain for the full output.
flow has failed with error Shutting down because of violation of the Reactive Streams specification.
14:51:06,973 |-INFO in ch.qos.logback.classic.AsyncAppender[asyncKafkaAppender] - Worker thread will flush remaining events before exiting.
[success] Total time: 3480 s, completed Sep 26, 2017 2:51:07 PM
14:51:07,603 |-INFO in ch.qos.logback.core.hook.DelayingShutdownHook#2320545b - Sleeping for 1 seconds
I don't know what I did to violate the Reactive Streams specification!
A first stab at getting a more predictable solution would be to remove the blocking behaviour (Await.result) and let mapAsync work with the Future directly. A rewrite of the alreadyExistsFilter flow could be:
val alreadyExistsFilter : Flow[Foo, Foo, NotUsed] = Flow[Foo].mapAsync(3) { foo =>
  val existsQuery = sql"""SELECT id FROM foo WHERE id = ${foo.id}""".as[Long]
  // hand the Future straight to mapAsync; no Await.result needed
  mysqlDbWrite.run(existsQuery).map(res => foo -> res)
}.collect {
  // keep only the records that do not already exist in MySQL
  case (foo, res) if res.isEmpty => foo
}
More info on blocking in Akka can be found in the docs.
The answer given by Stefano is correct. The error was indeed caused by the blocking code in the flow.
However, my initial program was running against Scala 2.11, and even after switching to mapAsync the problem persisted. Since this is a command-line tool, it was easy for me to switch to Scala 2.12 and try again.
When I tried with Scala 2.12 it worked perfectly.
One thing which greatly helped me was having "ch.qos.logback" % "logback-classic" % "1.2.3" in the dependencies. This shows you each and every SQL statement being executed, so you can easily see if something is going wrong.
It seems that the data in the WindowOperator serializes successfully; however, deserialization fails when the taskmanager is restarted by the jobmanager.
Environment:
state backend: HDFS
jobmanager: high availability
Root exception:
java.lang.Exception: Could not restore checkpointed state to operators and functions
at org.apache.flink.streaming.runtime.tasks.StreamTask.restoreStateLazy(StreamTask.java:414)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:208)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:584)
at java.lang.Thread.run(Thread.java:744)
Caused by: java.io.StreamCorruptedException: invalid type code: 00
at java.io.ObjectInputStream$BlockDataInputStream.readBlockHeader(ObjectInputStream.java:2508)
at java.io.ObjectInputStream$BlockDataInputStream.refill(ObjectInputStream.java:2543)
at java.io.ObjectInputStream$BlockDataInputStream.read(ObjectInputStream.java:2615)
at java.io.DataInputStream.readInt(DataInputStream.java:387)
at java.io.ObjectInputStream$BlockDataInputStream.readInt(ObjectInputStream.java:2820)
at java.io.ObjectInputStream.readInt(ObjectInputStream.java:971)
at java.util.HashMap.readObject(HashMap.java:1158)
at sun.reflect.GeneratedMethodAccessor29.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1017)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1893)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1798)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:370)
at org.apache.flink.util.InstantiationUtil.deserializeObject(InstantiationUtil.java:294)
at org.apache.flink.streaming.runtime.operators.windowing.WindowOperator$Context.<init>(WindowOperator.java:446)
at org.apache.flink.streaming.runtime.operators.windowing.WindowOperator.restoreState(WindowOperator.java:621)
at org.apache.flink.streaming.runtime.tasks.StreamTask.restoreStateLazy(StreamTask.java:406)
I have experienced the same issue when running multiple Flink applications that reuse the same custom window/trigger class (I don't know if that's your case). After I added a generated serialVersionUID to each of the reused classes, it works fine now.
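For illustration, a minimal sketch of what that looks like on a custom trigger; the class name and firing logic here are hypothetical, the relevant part is the explicit serialVersionUID:

import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.triggers.TriggerResult;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;

public class FireOnEveryElementTrigger extends Trigger<Object, TimeWindow> {
    // Pin the UID so Java serialization keeps treating the class as
    // compatible across jobs and restarts, even after recompilation.
    private static final long serialVersionUID = 1L;

    @Override
    public TriggerResult onElement(Object element, long timestamp,
                                   TimeWindow window, TriggerContext ctx) {
        return TriggerResult.FIRE; // fire the window for every element
    }

    @Override
    public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) {
        return TriggerResult.CONTINUE;
    }

    @Override
    public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) {
        return TriggerResult.CONTINUE;
    }

    @Override
    public void clear(TimeWindow window, TriggerContext ctx) {
        // no trigger state to clean up in this sketch
    }
}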
I am using SymmetricDS and I have stumbled across a problem on one of my clients.
Log says:
2015-11-11 09:10:57,688 ERROR [blagajna_XXX] [RouterService] [blagajna_XXX-job-17] Failed to route and batch data on 'cFpromet' channel
java.lang.NullPointerException
There is no further explanation for the NullPointerException, so I can't debug it myself. Replication works for some time, then this error appears and replication stops. An identical system works without any problems.
select * from sym_outgoing_batch where error_flag=1; returns 0 rows, so how can I debug this problem?
GregaJ
EDIT:
java.lang.NullPointerException
at org.jumpmind.db.platform.AbstractJdbcDdlReader.getTableNamePattern(AbstractJdbcDdlReader.java:638)
at org.jumpmind.db.platform.AbstractJdbcDdlReader$3.execute(AbstractJdbcDdlReader.java:574)
at org.jumpmind.db.platform.AbstractJdbcDdlReader$3.execute(AbstractJdbcDdlReader.java:563)
at org.jumpmind.db.sql.JdbcSqlTemplate.execute(JdbcSqlTemplate.java:432)
at org.jumpmind.db.platform.AbstractJdbcDdlReader.readTable(AbstractJdbcDdlReader.java:563)
at org.jumpmind.db.platform.AbstractDatabasePlatform.readTableFromDatabase(AbstractDatabasePlatform.java:239)
at org.jumpmind.db.platform.AbstractDatabasePlatform.getTableFromCache(AbstractDatabasePlatform.java:314)
at org.jumpmind.symmetric.db.AbstractSymmetricDialect.getTable(AbstractSymmetricDialect.java:377)
at org.jumpmind.symmetric.service.impl.RouterService.routeData(RouterService.java:689)
at org.jumpmind.symmetric.service.impl.RouterService.selectDataAndRoute(RouterService.java:634)
at org.jumpmind.symmetric.service.impl.RouterService.routeDataForChannel(RouterService.java:436)
at org.jumpmind.symmetric.service.impl.RouterService.routeDataForEachChannel(RouterService.java:328)
at org.jumpmind.symmetric.service.impl.RouterService.routeData(RouterService.java:175)
at org.jumpmind.symmetric.job.RouterJob.doJob(RouterJob.java:40)
at org.jumpmind.symmetric.job.AbstractJob.invoke(AbstractJob.java:180)
at org.jumpmind.symmetric.job.AbstractJob.run(AbstractJob.java:224)
at org.springframework.scheduling.support.DelegatingErrorHandlingRunnable.run(DelegatingErrorHandlingRunnable.java:54)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)