Flink MySQL connector limit connection - apache-flink

I'm using flink mysql connector with a single executor of 32Gb RAM, 16vCPU with 32 slots. If I run a job with parallelism 32 (job parallelism 224) that is doing temporal lookup joins with 10 MySQL tables, it starts to fail after 2-3 successful runs with below error.
org.apache.flink.runtime.JobException: Recovery is suppressed by NoRestartBackoffTimeStrategy
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:138)
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.getFailureHandlingResult(ExecutionFailureHandler.java:82)
at org.apache.flink.runtime.scheduler.DefaultScheduler.handleTaskFailure(DefaultScheduler.java:228)
at org.apache.flink.runtime.scheduler.DefaultScheduler.maybeHandleTaskFailure(DefaultScheduler.java:218)
at org.apache.flink.runtime.scheduler.DefaultScheduler.updateTaskExecutionStateInternal(DefaultScheduler.java:209)
at org.apache.flink.runtime.scheduler.SchedulerBase.updateTaskExecutionState(SchedulerBase.java:679)
at org.apache.flink.runtime.scheduler.SchedulerNG.updateTaskExecutionState(SchedulerNG.java:79)
at org.apache.flink.runtime.jobmaster.JobMaster.updateTaskExecutionState(JobMaster.java:444)
at sun.reflect.GeneratedMethodAccessor62.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.lambda$handleRpcInvocation$1(AkkaRpcActor.java:316)
at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:83)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcInvocation(AkkaRpcActor.java:314)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:217)
at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:78)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:163)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:24)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:20)
at scala.PartialFunction.applyOrElse(PartialFunction.scala:123)
at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122)
at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:20)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
at akka.actor.Actor.aroundReceive(Actor.scala:537)
at akka.actor.Actor.aroundReceive$(Actor.scala:535)
at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:220)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:580)
at akka.actor.ActorCell.invoke(ActorCell.scala:548)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:270)
at akka.dispatch.Mailbox.run(Mailbox.scala:231)
at akka.dispatch.Mailbox.exec(Mailbox.scala:243)
at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056)
at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692)
at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:157)
Caused by: java.lang.IllegalArgumentException: open() failed.
at org.apache.flink.connector.jdbc.table.JdbcRowDataLookupFunction.open(JdbcRowDataLookupFunction.java:138)
at LookupFunction$55178.open(Unknown Source)
at org.apache.flink.api.common.functions.util.FunctionUtils.openFunction(FunctionUtils.java:34)
at org.apache.flink.table.runtime.operators.join.lookup.LookupJoinRunner.open(LookupJoinRunner.java:67)
at org.apache.flink.table.runtime.operators.join.lookup.LookupJoinWithCalcRunner.open(LookupJoinWithCalcRunner.java:51)
at org.apache.flink.api.common.functions.util.FunctionUtils.openFunction(FunctionUtils.java:34)
at org.apache.flink.streaming.api.operators.AbstractUdfStreamOperator.open(AbstractUdfStreamOperator.java:100)
at org.apache.flink.streaming.api.operators.ProcessOperator.open(ProcessOperator.java:56)
at org.apache.flink.streaming.runtime.tasks.RegularOperatorChain.initializeStateAndOpenOperators(RegularOperatorChain.java:110)
at org.apache.flink.streaming.runtime.tasks.StreamTask.restoreGates(StreamTask.java:711)
at org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor$1.call(StreamTaskActionExecutor.java:55)
at org.apache.flink.streaming.runtime.tasks.StreamTask.restoreInternal(StreamTask.java:687)
at org.apache.flink.streaming.runtime.tasks.StreamTask.restore(StreamTask.java:654)
at org.apache.flink.runtime.taskmanager.Task.runWithSystemExitMonitoring(Task.java:958)
at org.apache.flink.runtime.taskmanager.Task.restoreAndInvoke(Task.java:927)
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:766)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:575)
at java.lang.Thread.run(Thread.java:748)
Caused by: com.mysql.jdbc.exceptions.jdbc4.CommunicationsException: Communications link failure
The last packet sent successfully to the server was 0 milliseconds ago. The driver has not received any packets from the server.
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at com.mysql.jdbc.Util.handleNewInstance(Util.java:403)
at com.mysql.jdbc.SQLError.createCommunicationsException(SQLError.java:990)
at com.mysql.jdbc.MysqlIO.<init>(MysqlIO.java:335)
at com.mysql.jdbc.ConnectionImpl.coreConnect(ConnectionImpl.java:2187)
at com.mysql.jdbc.ConnectionImpl.connectOneTryOnly(ConnectionImpl.java:2220)
at com.mysql.jdbc.ConnectionImpl.createNewIO(ConnectionImpl.java:2015)
at com.mysql.jdbc.ConnectionImpl.<init>(ConnectionImpl.java:768)
at com.mysql.jdbc.JDBC4Connection.<init>(JDBC4Connection.java:47)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at com.mysql.jdbc.Util.handleNewInstance(Util.java:403)
at com.mysql.jdbc.ConnectionImpl.getInstance(ConnectionImpl.java:385)
at com.mysql.jdbc.NonRegisteringDriver.connect(NonRegisteringDriver.java:323)
at org.apache.flink.connector.jdbc.internal.connection.SimpleJdbcConnectionProvider.getOrEstablishConnection(SimpleJdbcConnectionProvider.java:121)
at org.apache.flink.connector.jdbc.table.JdbcRowDataLookupFunction.establishConnectionAndStatement(JdbcRowDataLookupFunction.java:211)
at org.apache.flink.connector.jdbc.table.JdbcRowDataLookupFunction.open(JdbcRowDataLookupFunction.java:129)
... 17 more
Caused by: java.net.SocketException: Too many open files
at java.net.Socket.createImpl(Socket.java:478)
at java.net.Socket.getImpl(Socket.java:538)
at java.net.Socket.setTcpNoDelay(Socket.java:998)
at com.mysql.jdbc.StandardSocketFactory.configureSocket(StandardSocketFactory.java:132)
at com.mysql.jdbc.StandardSocketFactory.connect(StandardSocketFactory.java:203)
at com.mysql.jdbc.MysqlIO.<init>(MysqlIO.java:299)
... 32 more
Did Some debugging, the process list on MySQL shows ~ 2* (total job parallelism) connections, i.e. 448 connections from Task Manager IP. The output of lsof | grep mysql-cj- | wc -l on task manager also reached to 12k from 3k. But after cancelling job, sometime this number doesn't go down. Am I missing something ?

The error is mainly because there are too many connections requesting mysql at the same time. Provide several optimization ideas for reference
Consider reducing the total concurrency of tasks
By default, lookup cache is not enabled. You can enable it by setting both lookup.cache.max-rows and lookup.cache.ttl, refer to https://nightlies.apache.org/flink/flink-docs-release-1.14/docs/connectors/table/jdbc/

Related

FAILED: org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException: Could not acquire the minimum required resources

I am running two apache-beam pipelines on the local Flink 1.14 cluster, which raises Could not acquire the minimum required resources. error always a while after it processes all messages from kafka topic, regardless I send there 1000 or 100K messages it's always after there's nothing to read. Also, I noticed that CPU usage of my mac increases heavily after there's nothing to read from kafka topics.
I tripled jobmanager and taskmanager memory, reduced parallelism, and changed heap size, but none of the helped.
This is my flink-conf.yaml:
jobmanager.rpc.address: localhost
jobmanager.rpc.port: 6123
jobmanager.memory.process.size: 4800m
taskmanager.memory.process.size: 5184m
taskmanager.memory.managed.size: 0
taskmanager.numberOfTaskSlots: 2
parallelism.default: 2
jobmanager.execution.failover-strategy: region
restart-strategy: failure-rate
Flink-standalonesession.local.log
2023-01-24 11:28:33,111 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - [3]Read topic - 1/{KafkaIO.Read, Remove Kafka Metadata} -> [10]{Deserialize without magic byte, Contact Look-up , Add prefix, User look-up, Lead look-up, Product Usage User look-up, Deletion look-up, Apply final where condition, Apply final transformation, Dic2String - Leading topic} -> [2]Write to CONTACTS_OUT topic/{Kafka ProducerRecord, KafkaIO.WriteRecords} (1/1) (7b82955914db7b35635f0c7a0e3793d6) switched from RUNNING to FAILED on 127.0.0.1:53351-0e899a # localhost (dataPort=53353).
java.util.concurrent.TimeoutException: Heartbeat of TaskManager with id 127.0.0.1:53351-0e899a timed out.
at org.apache.flink.runtime.jobmaster.JobMaster$TaskManagerHeartbeatListener.notifyHeartbeatTimeout(JobMaster.java:1343) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.heartbeat.HeartbeatMonitorImpl.run(HeartbeatMonitorImpl.java:155) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515) ~[?:?]
at java.util.concurrent.FutureTask.run(FutureTask.java:264) ~[?:?]
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.lambda$handleRunAsync$4(AkkaRpcActor.java:455) ~[flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68) ~[flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:455) ~[flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:213) ~[flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:78) ~[flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:163) ~[flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:24) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:20) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at scala.PartialFunction.applyOrElse(PartialFunction.scala:123) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:20) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.actor.Actor.aroundReceive(Actor.scala:537) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.actor.Actor.aroundReceive$(Actor.scala:535) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:220) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:580) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.actor.ActorCell.invoke(ActorCell.scala:548) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:270) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.dispatch.Mailbox.run(Mailbox.scala:231) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.dispatch.Mailbox.exec(Mailbox.scala:243) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290) [?:?]
at java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020) [?:?]
at java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656) [?:?]
at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594) [?:?]
at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183) [?:?]
.
.
.
2023-01-24 11:28:36,807 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Source: Impulse -> [3]Read topic - Lead/KafkaIO.Read/KafkaIO.Read.ReadFromKafkaViaUnbounded/Read(KafkaUnboundedSource)/{ParDo(OutputSingleSource), ParDo(UnboundedSourceAsSDFWrapper)} (1/1) (fa81660c9e8094977d150182626af918) switched from SCHEDULED to FAILED on [unassigned resource].
org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException: Could not acquire the minimum required resources.
2023-01-24 11:28:36,808 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Source: Impulse -> [3]Read topic - 1/KafkaIO.Read/KafkaIO.Read.ReadFromKafkaViaUnbounded/Read(KafkaUnboundedSource)/{ParDo(OutputSingleSource), ParDo(UnboundedSourceAsSDFWrapper)} (1/1) (22d6e55a565dd19086e372c058a65b72) switched from SCHEDULED to FAILED on [unassigned resource].
org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException: Could not acquire the minimum required resources.
2023-01-24 11:28:36,810 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Discarding the results produced by task execution 22d6e55a565dd19086e372c058a65b72.
2023-01-24 11:28:36,811 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Discarding the results produced by task execution fa81660c9e8094977d150182626af918.
2023-01-24 11:28:36,812 INFO org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy [] - Calculating tasks to restart to recover the failed task cbc357ccb763df2852fee8c4fc7d55f2_0.
2023-01-24 11:28:36,813 INFO org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy [] - 2 tasks should be restarted to recover the failed task cbc357ccb763df2852fee8c4fc7d55f2_0.
2023-01-24 11:28:36,814 INFO org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy [] - Calculating tasks to restart to recover the failed task cbc357ccb763df2852fee8c4fc7d55f2_0.
2023-01-24 11:28:36,814 INFO org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy [] - 2 tasks should be restarted to recover the failed task cbc357ccb763df2852fee8c4fc7d55f2_0.
2023-01-24 11:28:36,816 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Job update-contact (71870fb09db1c5d57046edd8d020971b) switched from state RUNNING to FAILING.
org.apache.flink.runtime.JobException: Recovery is suppressed by FailureRateRestartBackoffTimeStrategy(FailureRateRestartBackoffTimeStrategy(failuresIntervalMS=60000,backoffTimeMS=1000,maxFailuresPerInterval=1)
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:138) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.getFailureHandlingResult(ExecutionFailureHandler.java:82) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.scheduler.DefaultScheduler.handleTaskFailure(DefaultScheduler.java:228) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.scheduler.DefaultScheduler.maybeHandleTaskFailure(DefaultScheduler.java:218) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.scheduler.DefaultScheduler.updateTaskExecutionStateInternal(DefaultScheduler.java:209) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.scheduler.SchedulerBase.updateTaskExecutionState(SchedulerBase.java:679) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.scheduler.UpdateSchedulerNgOnInternalFailuresListener.notifyTaskFailure(UpdateSchedulerNgOnInternalFailuresListener.java:51) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.executiongraph.DefaultExecutionGraph.notifySchedulerNgAboutInternalTaskFailure(DefaultExecutionGraph.java:1473) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.executiongraph.Execution.processFail(Execution.java:1133) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.executiongraph.Execution.processFail(Execution.java:1073) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.executiongraph.Execution.markFailed(Execution.java:912) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.executiongraph.ExecutionVertex.markFailed(ExecutionVertex.java:474) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.scheduler.DefaultExecutionVertexOperations.markFailed(DefaultExecutionVertexOperations.java:41) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.scheduler.DefaultScheduler.handleTaskDeploymentFailure(DefaultScheduler.java:562) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.scheduler.DefaultScheduler.lambda$assignAllResourcesAndRegisterProducedPartitions$6(DefaultScheduler.java:457) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:930) ~[?:?]
at java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:907) ~[?:?]
at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506) ~[?:?]
at java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:2088) ~[?:?]
at org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridge$PendingRequest.failRequest(DeclarativeSlotPoolBridge.java:545) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridge.cancelPendingRequests(DeclarativeSlotPoolBridge.java:127) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridge.failPendingRequests(DeclarativeSlotPoolBridge.java:355) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridge.notifyNotEnoughResourcesAvailable(DeclarativeSlotPoolBridge.java:344) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at org.apache.flink.runtime.jobmaster.JobMaster.notifyNotEnoughResourcesAvailable(JobMaster.java:809) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ~[?:?]
at jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) ~[?:?]
at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) ~[?:?]
at java.lang.reflect.Method.invoke(Method.java:566) ~[?:?]
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.lambda$handleRpcInvocation$0(AkkaRpcActor.java:308) ~[flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:83) ~[flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcInvocation(AkkaRpcActor.java:307) ~[flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:217) ~[flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:78) ~[flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:163) ~[flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:24) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:20) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at scala.PartialFunction.applyOrElse(PartialFunction.scala:123) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:20) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.actor.Actor.aroundReceive(Actor.scala:537) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.actor.Actor.aroundReceive$(Actor.scala:535) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:220) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:580) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.actor.ActorCell.invoke(ActorCell.scala:548) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:270) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.dispatch.Mailbox.run(Mailbox.scala:231) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at akka.dispatch.Mailbox.exec(Mailbox.scala:243) [flink-rpc-akka_1130a6a2-bae5-4fe6-a274-c23dbe38ebac.jar:1.14.0]
at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290) [?:?]
at java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020) [?:?]
at java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656) [?:?]
at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594) [?:?]
at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183) [?:?]
Caused by: java.util.concurrent.CompletionException: java.util.concurrent.CompletionException: org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException: Could not acquire the minimum required resources.
at org.apache.flink.runtime.scheduler.DefaultScheduler.lambda$assignResource$8(DefaultScheduler.java:515) ~[flink-dist_2.11-1.14.0.jar:1.14.0]
... 40 more
Caused by: java.util.concurrent.CompletionException: org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException: Could not acquire the minimum required resources.
at java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:331) ~[?:?]
at java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:346) ~[?:?]
at java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:632) ~[?:?]
... 38 more
Caused by: org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException: Could not acquire the minimum required resources.
flink-taskexecurtor.local.log
2023-01-24 11:27:51,604 WARN org.apache.beam.fn.harness.control.ExecutionStateSampler [] - Operation ongoing in bundle 2672 for at least 09m21s without outputting or completing:
at java.base#11.0.16/jdk.internal.misc.Unsafe.park(Native Method)
at java.base#11.0.16/java.util.concurrent.locks.LockSupport.park(LockSupport.java:194)
at java.base#11.0.16/java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2081)
at app//org.apache.beam.sdk.fn.CancellableQueue.take(CancellableQueue.java:94)
at app//org.apache.beam.sdk.fn.data.BeamFnDataInboundObserver2.awaitCompletion(BeamFnDataInboundObserver2.java:122)
at app//org.apache.beam.fn.harness.control.ProcessBundleHandler.processBundle(ProcessBundleHandler.java:546)
at app//org.apache.beam.fn.harness.FnHarness$$Lambda$170/0x00000008402b8040.apply(Unknown Source)
at app//org.apache.beam.fn.harness.control.BeamFnControlClient.delegateOnInstructionRequestType(BeamFnControlClient.java:151)
at app//org.apache.beam.fn.harness.control.BeamFnControlClient$InboundObserver.lambda$onNext$0(BeamFnControlClient.java:116)
at app//org.apache.beam.fn.harness.control.BeamFnControlClient$InboundObserver$$Lambda$177/0x00000008402bbc40.run(Unknown Source)
at java.base#11.0.16/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
at java.base#11.0.16/java.util.concurrent.FutureTask.run(FutureTask.java:264)
at app//org.apache.beam.sdk.util.UnboundedScheduledExecutorService$ScheduledFutureTask.run(UnboundedScheduledExecutorService.java:162)
at java.base#11.0.16/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base#11.0.16/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base#11.0.16/java.lang.Thread.run(Thread.java:829)
2023-01-24 11:29:53,564 INFO org.apache.kafka.clients.consumer.internals.SubscriptionState [] - [Consumer clientId=consumer-Reader-0_offset_consumer_1403650759_none-3, groupId=Reader-0_offset_consumer_1403650759_none] Resetting offset for partition D_CONTACT-0 to offset 10007.
2023-01-24 11:29:57,167 ERROR org.apache.flink.runtime.util.ClusterUncaughtExceptionHandler [] - WARNING: Thread 'grpc-nio-worker-ELG-3-4' produced an uncaught exception. If you want to fail on uncaught exceptions, then configure cluster.uncaught-exception-handling accordingly
2023-01-24 11:29:57,176 ERROR org.apache.flink.runtime.util.ClusterUncaughtExceptionHandler [] - WARNING: Thread 'grpc-nio-worker-ELG-3-3' produced an uncaught exception. If you want to fail on uncaught exceptions, then configure cluster.uncaught-exception-handling accordingly
2023-01-24 11:29:34,305 WARN org.jboss.netty.channel.socket.nio.AbstractNioSelector [] - Unexpected exception in the selector loop.
java.lang.OutOfMemoryError: Java heap space
2023-01-24 11:29:34,306 ERROR org.apache.flink.util.FatalExitExceptionHandler [] - FATAL: Thread 'flink-7' produced an uncaught exception. Stopping the process...
java.lang.OutOfMemoryError: Java heap space
2023-01-24 11:29:49,956 ERROR org.apache.flink.util.FatalExitExceptionHandler [] - FATAL: Thread 'flink-scheduler-1' produced an uncaught exception. Stopping the process...
java.lang.OutOfMemoryError: Java heap space
2023-01-24 11:29:34,306 ERROR org.apache.flink.util.FatalExitExceptionHandler [] - FATAL: Thread 'flink-metrics-12' produced an uncaught exception. Stopping the process...
java.lang.OutOfMemoryError: Java heap space
2023-01-24 11:29:49,958 ERROR org.apache.flink.util.FatalExitExceptionHandler [] - FATAL: Thread 'Time Trigger for [3]Read topic - Lead/{KafkaIO.Read, Remove Kafka Metadata} -> [3]{Deserialize - Contact, Update Contact state stores, Dic2String - c topic} -> [2]Write to c topic/{Kafka ProducerRecord, KafkaIO.WriteRecords} (1/1)#0' produced an uncaught exception. Stopping the process...
java.lang.OutOfMemoryError: Java heap space
2023-01-24 11:30:03,789 INFO org.apache.flink.runtime.blob.PermanentBlobCache [] - Shutting down BLOB cache
2023-01-24 11:30:06,226 INFO org.apache.flink.runtime.blob.TransientBlobCache [] - Shutting down BLOB cache
2023-01-24 11:30:10,098 INFO org.apache.flink.runtime.state.TaskExecutorStateChangelogStoragesManager [] - Shutting down TaskExecutorStateChangelogStoragesManager.
2023-01-24 11:30:10,098 INFO org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager [] - Shutting down TaskExecutorLocalStateStoresManager.

Getting Too many Open Files Error in logs

Recently we got "(Too many open files)" error in production and this leads to very high CPU spike(98%) on Linux machine and finally brought machine down. We have to reboot the machine to bring it back.
Flow of code is like this :-
Consume message from one of IBM-MQ queue -> start processing it -> make some update entries into SQL-Server DB and commit the transaction.
We are using Hikari for connection pool management and value is set as 30
hikariConfig.setMaximumPoolSize(30);
While checking the logs, i found following stack trace :-
2021-03-24T20:00:22.404 WARNING org.apache.catalina.loader.WebappClassLoaderBase Failed to open JAR [null]
java.io.FileNotFoundException: /opt/apache-tomcat-7.0.88/webapps/ROOT/WEB-INF/lib/HikariCP-2.5.1.jar (Too many open files)
at java.util.zip.ZipFile.open(Native Method)
at java.util.zip.ZipFile.<init>(ZipFile.java:219)
at java.util.zip.ZipFile.<init>(ZipFile.java:149)
at java.util.jar.JarFile.<init>(JarFile.java:166)
at java.util.jar.JarFile.<init>(JarFile.java:130)
at org.apache.tomcat.util.compat.JreCompat.jarFileNewInstance(JreCompat.java:221)
at org.apache.catalina.loader.WebappClassLoaderBase.openJARs(WebappClassLoaderBase.java:3102)
at org.apache.catalina.loader.WebappClassLoaderBase.findResourceInternal(WebappClassLoaderBase.java:3414)
at org.apache.catalina.loader.WebappClassLoaderBase.findResource(WebappClassLoaderBase.java:1494)
at org.apache.catalina.loader.WebappClassLoaderBase.getResourceAsStream(WebappClassLoaderBase.java:1722)
at java.util.ResourceBundle$Control$1.run(ResourceBundle.java:2677)
at java.util.ResourceBundle$Control$1.run(ResourceBundle.java:2662)
at java.security.AccessController.doPrivileged(Native Method)
at java.util.ResourceBundle$Control.newBundle(ResourceBundle.java:2661)
at java.util.ResourceBundle.loadBundle(ResourceBundle.java:1501)
at java.util.ResourceBundle.findBundle(ResourceBundle.java:1465)
at java.util.ResourceBundle.getBundleImpl(ResourceBundle.java:1361)
at java.util.ResourceBundle.getBundle(ResourceBundle.java:773)
at com.ibm.mq.jmqi.JmqiException.buildMessage(JmqiException.java:428)
at com.ibm.mq.jmqi.JmqiException.getMessage(JmqiException.java:543)
at com.ibm.mq.jmqi.JmqiException.getMessage(JmqiException.java:515)
at java.lang.Throwable.getLocalizedMessage(Throwable.java:391)
at com.ibm.mq.jmqi.internal.JmqiTools.getExSumm(JmqiTools.java:947)
at com.ibm.mq.jmqi.remote.api.RemoteFAP.jmqiConnect(RemoteFAP.java:2282)
at com.ibm.mq.jmqi.remote.api.RemoteFAP.jmqiConnect(RemoteFAP.java:1294)
at com.ibm.msg.client.wmq.internal.WMQSession.connect(WMQSession.java:387)
at com.ibm.msg.client.wmq.internal.WMQSession.<init>(WMQSession.java:331)
at com.ibm.msg.client.wmq.internal.WMQConnection.createSession(WMQConnection.java:909)
at com.ibm.msg.client.jms.internal.JmsConnectionImpl.createSession(JmsConnectionImpl.java:896)
at com.ibm.mq.jms.MQConnection.createSession(MQConnection.java:337)
at org.springframework.jms.connection.SingleConnectionFactory.createSession(SingleConnectionFactory.java:437)
at org.springframework.jms.connection.CachingConnectionFactory.getSession(CachingConnectionFactory.java:236)
at org.springframework.jms.connection.SingleConnectionFactory$SharedConnectionInvocationHandler.invoke(SingleConnectionFactory.java:604)
at com.sun.proxy.$Proxy195.createSession(Unknown Source)
at org.springframework.jms.support.JmsAccessor.createSession(JmsAccessor.java:192)
at org.springframework.jms.core.JmsTemplate.execute(JmsTemplate.java:475)
at org.springframework.jms.core.JmsTemplate.execute(JmsTemplate.java:526)
at com.test.config.messaging.MainframeMessageProducer.lambda$sendMessageToQueueWithHeaders$0(MainframeMessageProducer.java:120)
at org.springframework.retry.support.RetryTemplate.doExecute(RetryTemplate.java:287)
at org.springframework.retry.support.RetryTemplate.execute(RetryTemplate.java:164)
at com.test.config.messaging.MainframeMessageProducer.sendMessageToQueueWithHeaders(MainframeMessageProducer.java:118)
at com.test.config.messaging.MainframeMessageProducer.sendMessageToMainframeQueue(MainframeMessageProducer.java:101)
at com.test.config.messaging.MainframeMessageProducer$$FastClassBySpringCGLIB$$f2d092d2.invoke(<generated>)
at org.springframework.cglib.proxy.MethodProxy.invoke(MethodProxy.java:204)
at org.springframework.aop.framework.CglibAopProxy$CglibMethodInvocation.invokeJoinpoint(CglibAopProxy.java:721)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:157)
at org.springframework.aop.aspectj.MethodInvocationProceedingJoinPoint.proceed(MethodInvocationProceedingJoinPoint.java:85)
at com.test.common.aop.RecordTimingAspect.recordTimingAspect(RecordTimingAspect.java:28)
at sun.reflect.GeneratedMethodAccessor505.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.springframework.aop.aspectj.AbstractAspectJAdvice.invokeAdviceMethodWithGivenArgs(AbstractAspectJAdvice.java:629)
at org.springframework.aop.aspectj.AbstractAspectJAdvice.invokeAdviceMethod(AbstractAspectJAdvice.java:618)
at org.springframework.aop.aspectj.AspectJAroundAdvice.invoke(AspectJAroundAdvice.java:70)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:168)
at org.springframework.aop.interceptor.ExposeInvocationInterceptor.invoke(ExposeInvocationInterceptor.java:92)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:179)
at org.springframework.aop.framework.CglibAopProxy$DynamicAdvisedInterceptor.intercept(CglibAopProxy.java:656)
at com.test.config.messaging.MainframeMessageProducer$$EnhancerBySpringCGLIB$$dc5ae691.sendMessageToMainframeQueue(<generated>)
at com.test.posting.GDISPurchasePostServiceImpl.postMessage(GDISPurchasePostServiceImpl.java:315)
at com.test.posting.GDISPostingService.processMessage(GDISPostingService.java:56)
at com.test.posting.GDISPostingService$$FastClassBySpringCGLIB$$3c0c40af.invoke(<generated>)
at org.springframework.cglib.proxy.MethodProxy.invoke(MethodProxy.java:204)
at org.springframework.aop.framework.CglibAopProxy$CglibMethodInvocation.invokeJoinpoint(CglibAopProxy.java:721)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:157)
at org.springframework.aop.aspectj.MethodInvocationProceedingJoinPoint.proceed(MethodInvocationProceedingJoinPoint.java:85)
at com.test.common.aop.CorrelationIdAspect.correlationIdAdvice(CorrelationIdAspect.java:40)
at sun.reflect.GeneratedMethodAccessor554.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.springframework.aop.aspectj.AbstractAspectJAdvice.invokeAdviceMethodWithGivenArgs(AbstractAspectJAdvice.java:629)
at org.springframework.aop.aspectj.AbstractAspectJAdvice.invokeAdviceMethod(AbstractAspectJAdvice.java:618)
at org.springframework.aop.aspectj.AspectJAroundAdvice.invoke(AspectJAroundAdvice.java:70)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:168)
at org.springframework.aop.interceptor.ExposeInvocationInterceptor.invoke(ExposeInvocationInterceptor.java:92)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:179)
at org.springframework.aop.framework.CglibAopProxy$DynamicAdvisedInterceptor.intercept(CglibAopProxy.java:656)
at com.test.posting.GDISPostingService$$EnhancerBySpringCGLIB$$dd870248.processMessage(<generated>)
at com.test.posting.GDISPostingMessageListener.onMessage(GDISPostingMessageListener.java:167)
at org.springframework.jms.listener.AbstractMessageListenerContainer.doInvokeListener(AbstractMessageListenerContainer.java:746)
at org.springframework.jms.listener.AbstractMessageListenerContainer.invokeListener(AbstractMessageListenerContainer.java:684)
at org.springframework.jms.listener.AbstractMessageListenerContainer.doExecuteListener(AbstractMessageListenerContainer.java:651)
at org.springframework.jms.listener.AbstractPollingMessageListenerContainer.doReceiveAndExecute(AbstractPollingMessageListenerContainer.java:317)
at org.springframework.jms.listener.AbstractPollingMessageListenerContainer.receiveAndExecute(AbstractPollingMessageListenerContainer.java:255)
at org.springframework.jms.listener.DefaultMessageListenerContainer$AsyncMessageListenerInvoker.invokeListener(DefaultMessageListenerContainer.java:1166)
at org.springframework.jms.listener.DefaultMessageListenerContainer$AsyncMessageListenerInvoker.executeOngoingLoop(DefaultMessageListenerContainer.java:1158)
at org.springframework.jms.listener.DefaultMessageListenerContainer$AsyncMessageListenerInvoker.run(DefaultMessageListenerContainer.java:1055)
at java.lang.Thread.run(Thread.java:748)
I am confused here what leads to max open files error here :-
is it due to Hikari CP as this is first line we got as an error (java.io.FileNotFoundException: /opt/apache-tomcat-7.0.88/webapps/ROOT/WEB-INF/lib/HikariCP-2.5.1.jar (Too many open files) ) ?
Is it due to IBM-MQ connection not available ?
You could use the lsof command to LiSt Open Files.
It produces lots of output, so you'll want to capture and post process it ( eg grep or sort)
You can list open files by pid, userid, disk etc.
You might try lsof > before wait... lsof > after then use diff to see the differences

Flink program cannot submit when i follow flink-1.4's quickstart and use "./bin/flink run examples/streaming/SocketWindowWordCount.jar --port 9000"

Flink-1.4 quickstart address: https://ci.apache.org/projects/flink/flink-docs-release-1.4/quickstart/setup_quickstart.html.
When I use "./bin/start-local.sh" to start flink following flink-1.4's quickstart, then i check http://localhost:8081/ and make sure everything is running, then i use "./bin/flink run examples/streaming/SocketWindowWordCount.jar --port 9000" to submit .jar and i got following info, and i can't submit successfully.
------------------------------------------------------------
The program finished with the following exception:
org.apache.flink.client.program.ProgramInvocationException: The program execution failed: Couldn't retrieve the JobExecutionResult from the JobManager.
at org.apache.flink.client.program.ClusterClient.run(ClusterClient.java:492)
at org.apache.flink.client.program.StandaloneClusterClient.submitJob(StandaloneClusterClient.java:105)
at org.apache.flink.client.program.ClusterClient.run(ClusterClient.java:456)
at org.apache.flink.streaming.api.environment.StreamContextEnvironment.execute(StreamContextEnvironment.java:66)
at org.apache.flink.streaming.examples.socket.SocketWindowWordCount.main(SocketWindowWordCount.java:92)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:564)
at org.apache.flink.client.program.PackagedProgram.callMainMethod(PackagedProgram.java:525)
at org.apache.flink.client.program.PackagedProgram.invokeInteractiveModeForExecution(PackagedProgram.java:417)
at org.apache.flink.client.program.ClusterClient.run(ClusterClient.java:396)
at org.apache.flink.client.CliFrontend.executeProgram(CliFrontend.java:802)
at org.apache.flink.client.CliFrontend.run(CliFrontend.java:282)
at org.apache.flink.client.CliFrontend.parseParameters(CliFrontend.java:1054)
at org.apache.flink.client.CliFrontend$1.call(CliFrontend.java:1101)
at org.apache.flink.client.CliFrontend$1.call(CliFrontend.java:1098)
at java.base/java.security.AccessController.doPrivileged(Native Method)
at java.base/javax.security.auth.Subject.doAs(Subject.java:423)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1556)
at org.apache.flink.runtime.security.HadoopSecurityContext.runSecured(HadoopSecurityContext.java:41)
at org.apache.flink.client.CliFrontend.main(CliFrontend.java:1098)
Caused by: org.apache.flink.runtime.client.JobExecutionException: Couldn't retrieve the JobExecutionResult from the JobManager.
at org.apache.flink.runtime.client.JobClient.awaitJobResult(JobClient.java:300)
at org.apache.flink.runtime.client.JobClient.submitJobAndWait(JobClient.java:387)
at org.apache.flink.client.program.ClusterClient.run(ClusterClient.java:481)
... 21 more
Caused by: org.apache.flink.runtime.client.JobClientActorConnectionTimeoutException: Lost connection to the JobManager.
at org.apache.flink.runtime.client.JobClientActor.handleMessage(JobClientActor.java:219)
at org.apache.flink.runtime.akka.FlinkUntypedActor.handleLeaderSessionID(FlinkUntypedActor.java:104)
at org.apache.flink.runtime.akka.FlinkUntypedActor.onReceive(FlinkUntypedActor.java:71)
at akka.actor.UntypedActor$$anonfun$receive$1.applyOrElse(UntypedActor.scala:165)
at akka.actor.Actor$class.aroundReceive(Actor.scala:502)
at akka.actor.UntypedActor.aroundReceive(UntypedActor.scala:95)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:526)
at akka.actor.ActorCell.invoke(ActorCell.scala:495)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:257)
at akka.dispatch.Mailbox.run(Mailbox.scala:224)
at akka.dispatch.Mailbox.exec(Mailbox.scala:234)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
I have tried flink-1.3 with the same install steps,i can submit .jar and run correctly, but flink-1.4 can not work, anyone knows what's wrong?
In addition,when I use "./bin/start-local.sh" to start flink-1.4.0, i check "http://localhost:8081/" and everything is running, but jobmanager logs show errors, the log file is as follow:
2018-01-26 14:55:01,012 INFO org.apache.flink.runtime.clusterframework.standalone.StandaloneResourceManager - TaskManager 3b3e2dd7f3f29662b6312e875dd496dc has started.
2018-01-26 14:55:01,053 WARN akka.remote.ReliableDeliverySupervisor - Association with remote system [akka.tcp://flink#pyn-virtual-machine:44785] has failed, address is now gated for [5000] ms. Reason: [[B cannot be cast to [C]
2018-01-26 14:55:10,589 ERROR akka.remote.Remoting - [B cannot be cast to [C
java.lang.ClassCastException: [B cannot be cast to [C
at akka.remote.artery.FastHash$.ofString(LruBoundedCache.scala:18)
at akka.remote.serialization.ActorRefResolveCache.hash(ActorRefResolveCache.scala:61)
at akka.remote.serialization.ActorRefResolveCache.hash(ActorRefResolveCache.scala:55)
at akka.remote.artery.LruBoundedCache.getOrCompute(LruBoundedCache.scala:110)
at akka.remote.RemoteActorRefProvider.resolveActorRef(RemoteActorRefProvider.scala:403)
at akka.actor.SerializedActorRef.readResolve(ActorRef.scala:433)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:564)
at java.base/java.io.ObjectStreamClass.invokeReadResolve(ObjectStreamClass.java:1250)
at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2087)
at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1585)
at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2346)
at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2240)
at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2078)
at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1585)
at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:422)
at akka.serialization.JavaSerializer$$anonfun$1.apply(Serializer.scala:328)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at akka.serialization.JavaSerializer.fromBinary(Serializer.scala:328)
at akka.serialization.Serialization.akka$serialization$Serialization$$deserializeByteArray(Serialization.scala:156)
at akka.serialization.Serialization$$anonfun$deserialize$2.apply(Serialization.scala:142)
at scala.util.Try$.apply(Try.scala:192)
at akka.serialization.Serialization.deserialize(Serialization.scala:136)
at akka.remote.MessageSerializer$.deserialize(MessageSerializer.scala:30)
at akka.remote.DefaultMessageDispatcher.payload$lzycompute$1(Endpoint.scala:64)
at akka.remote.DefaultMessageDispatcher.payload$1(Endpoint.scala:64)
at akka.remote.DefaultMessageDispatcher.dispatch(Endpoint.scala:82)
at akka.remote.EndpointReader$$anonfun$akka$remote$EndpointReader$$deliverAndAck$1.apply(Endpoint.scala:1047)
at akka.remote.EndpointReader$$anonfun$akka$remote$EndpointReader$$deliverAndAck$1.apply(Endpoint.scala:1046)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
at akka.remote.EndpointReader.akka$remote$EndpointReader$$deliverAndAck(Endpoint.scala:1046)
at akka.remote.EndpointReader$$anonfun$receive$2.applyOrElse(Endpoint.scala:980)
at akka.actor.Actor$class.aroundReceive(Actor.scala:502)
at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:446)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:526)
at akka.actor.ActorCell.invoke(ActorCell.scala:495)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:257)
at akka.dispatch.Mailbox.run(Mailbox.scala:224)
at akka.dispatch.Mailbox.exec(Mailbox.scala:234)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
2018-01-26 14:55:19,206 WARN akka.remote.ReliableDeliverySupervisor - Association with remote system [akka.tcp://flink#pyn-virtual-machine:44785] has failed, address is now gated for [5000] ms. Reason: [Disassociated]
2018-01-26 14:56:04,754 WARN akka.remote.ReliableDeliverySupervisor - Association with remote system [akka.tcp://flink#pyn-virtual-machine:37261] has failed, address is now gated for [5000] ms. Reason: [Disassociated]

how to run terasort with flink?

i just configure my flink cluster (1.0.3 version) and i whanted to run tersort with the flink framework, but its me returned the next erreor, i used this comamnde to run my programm:
/bin/flink run -c --class es.udc.gac.flinkbench.ScalaTeraSort flinkbench-assembly-1.0.jar hdfs://192.168.3.89:8020/teragen/10mo hdfs://192.168.3.89:8020/teragen/rstflink
and its me return this :
Cluster configuration: Standalone cluster with JobManager at localhost/127.0.0.1:6123
Using address localhost:6123 to connect to JobManager.
JobManager web interface address http://localhost:8081
Starting execution of program
2017-06-23 10:28:43,692 INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 2
Spent 2278ms computing base-splits.
Spent 2ms computing TeraScheduler splits.
Computing input splits took 2281ms
Sampling 2 splits of 2
Making -1 from 100000 sampled records
------------------------------------------------------------
The program finished with the following exception:
org.apache.flink.client.program.ProgramInvocationException: The main method caused an error.
at org.apache.flink.client.program.PackagedProgram.callMainMethod(PackagedProgram.java:545)
at org.apache.flink.client.program.PackagedProgram.invokeInteractiveModeForExecution(PackagedProgram.java:419)
at org.apache.flink.client.program.ClusterClient.run(ClusterClient.java:339)
at org.apache.flink.client.CliFrontend.executeProgram(CliFrontend.java:831)
at org.apache.flink.client.CliFrontend.run(CliFrontend.java:256)
at org.apache.flink.client.CliFrontend.parseParameters(CliFrontend.java:1073)
at org.apache.flink.client.CliFrontend$2.call(CliFrontend.java:1120)
at org.apache.flink.client.CliFrontend$2.call(CliFrontend.java:1117)
at org.apache.flink.runtime.security.HadoopSecurityContext$1.run(HadoopSecurityContext.java:43)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1656)
at org.apache.flink.runtime.security.HadoopSecurityContext.runSecured(HadoopSecurityContext.java:40)
at org.apache.flink.client.CliFrontend.main(CliFrontend.java:1116)
Caused by: java.lang.NegativeArraySizeException
at es.udc.gac.flinkbench.terasort.TeraInputFormat$TextSampler.createPartitions(TeraInputFormat.java:103)
at es.udc.gac.flinkbench.terasort.TeraInputFormat.writePartitionFile(TeraInputFormat.java:183)
at es.udc.gac.flinkbench.ScalaTeraSort$.main(ScalaTeraSort.scala:49)
at es.udc.gac.flinkbench.ScalaTeraSort.main(ScalaTeraSort.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.flink.client.program.PackagedProgram.callMainMethod(PackagedProgram.java:528)
... 13 more

SoLR performance issue

I am working on solr 4.2.1 jetty and we are facing some performance issues and heap memory overflow issue as well.
So I am searching the actual cause for this exceptions. Then i applied load test for different solr queries. After few mins got below errors:
WARN:oejs.Response:Committed before 500 {msg=Software caused
connection abort: socket write
Caused by: java.net.SocketException: Software caused connection abort:
socket write error
SEVERE: null:org.eclipse.jetty.io.EofException
I also tried to set the maxIdleTime to 300000 milliSeconds. But still getting same error.
Any ideas?
Please help, how to tackle this.
Thanks,
Mayur
Stack Trace:
SEVERE: null:org.eclipse.jetty.io.EofException
at org.eclipse.jetty.http.HttpGenerator.flushBuffer(HttpGenerator.java:914)
at org.eclipse.jetty.http.AbstractGenerator.blockForOutput(AbstractGenerator.java:507)
at org.eclipse.jetty.server.HttpOutput.write(HttpOutput.java:147)
at org.eclipse.jetty.server.HttpOutput.write(HttpOutput.java:107)
at sun.nio.cs.StreamEncoder.writeBytes(Unknown Source)
at sun.nio.cs.StreamEncoder.implWrite(Unknown Source)
at sun.nio.cs.StreamEncoder.write(Unknown Source)
at java.io.OutputStreamWriter.write(Unknown Source)
at org.apache.solr.util.FastWriter.flush(FastWriter.java:141)
at org.apache.solr.util.FastWriter.write(FastWriter.java:55)
at org.apache.solr.response.XMLWriter.writePrim(XMLWriter.java:356)
at org.apache.solr.response.XMLWriter.writeStr(XMLWriter.java:295)
at org.apache.solr.schema.StrField.write(StrField.java:67)
at org.apache.solr.response.TextResponseWriter.writeVal(TextResponseWriter.java:130)
at org.apache.solr.response.XMLWriter.writeSolrDocument(XMLWriter.java:199)
at org.apache.solr.response.TextResponseWriter.writeDocuments(TextResponseWriter.java:275)
at org.apache.solr.response.TextResponseWriter.writeVal(TextResponseWriter.java:172)
at org.apache.solr.response.XMLWriter.writeResponse(XMLWriter.java:111)
at org.apache.solr.response.XMLResponseWriter.write(XMLResponseWriter.java:39)
at org.apache.solr.servlet.SolrDispatchFilter.writeResponse(SolrDispatchFilter.java:627)
at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:358)
at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:141)
at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
at org.eclipse.jetty.server.Server.handle(Server.java:365)
at org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
at org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
at org.eclipse.jetty.server.AbstractHttpConnection.headerComplete(AbstractHttpConnection.java:926)
at org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.headerComplete(AbstractHttpConnection.java:988)
at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:635)
at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:235)
at org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
at org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
at org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
at java.lang.Thread.run(Unknown Source) Caused by: java.net.SocketException: Software caused connection abort: socket
write error
at java.net.SocketOutputStream.socketWrite0(Native Method)
at java.net.SocketOutputStream.socketWrite(Unknown Source)
at java.net.SocketOutputStream.write(Unknown Source)
at org.eclipse.jetty.io.ByteArrayBuffer.writeTo(ByteArrayBuffer.java:359)
at org.eclipse.jetty.io.bio.StreamEndPoint.flush(StreamEndPoint.java:164)
at org.eclipse.jetty.io.bio.StreamEndPoint.flush(StreamEndPoint.java:194)
at org.eclipse.jetty.http.HttpGenerator.flushBuffer(HttpGenerator.java:838)
... 46 more

Resources