pyflink configuration error SQL parse fail - apache-flink

I want to verify that Flink works in this setup before complicating the usage, so I tried the simplest possible example.
The input contains:
column_a,column_b
1,2
The output exists.
To run PyFlink 1.10 with Docker for my app, I use the following docker-compose snippet:
jobmanager:
  image: pyflink/playgrounds:1.10.0
  volumes:
    - ./examples:/opt/examples
  hostname: "jobmanager"
  expose:
    - "6123"
  ports:
    - "8088:8088"
  command: jobmanager
  environment:
    - |
      FLINK_PROPERTIES=
      jobmanager.rpc.address: jobmanager
taskmanager:
  image: pyflink/playgrounds:1.10.0
  volumes:
    - ./examples:/opt/examples
  expose:
    - "6121"
    - "6122"
  depends_on:
    - jobmanager
  command: taskmanager
  links:
    - jobmanager:jobmanager
  environment:
    - |
      FLINK_PROPERTIES=
      jobmanager.rpc.address: jobmanager
      taskmanager.numberOfTaskSlots: 2
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import StreamingFileSink
from pyflink.table import EnvironmentSettings, StreamTableEnvironment, BatchTableEnvironment
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes
from pyflink.table.descriptors import Schema, OldCsv, FileSystem
from pyflink.table.expressions import lit
import pandas as pd
from inspect import getmembers, isfunction
import os
# create a blink batch TableEnvironment
env_settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
table_env = BatchTableEnvironment.create(environment_settings=env_settings)
source_ddl = """
CREATE TABLE MyUserTable (
column_a INT PRIMARY KEY,
column_b INT,
) WITH (
'connector' = 'filesystem',
'path' = 'file:///Users//code/examples/input.csv ',
'format' = 'csv'
)"""
#connector for data output/sink
sink_ddl = """
CREATE TABLE table2 (
score INT)
WITH (
'connector' = 'filesystem',
'path' = 'file:///Users//code/examples/output.csv',
'format' = 'csv'
)"""
#make the table corresponding to the schema mentioned
source_table = table_env.execute_sql(source_ddl)
sink_table = table_env.execute_sql(sink_ddl)
#convert the sql table to table API
table_path = table_env.from_path("MyUserTable")
# execute SELECT statement
table_result2 = table_env.execute_sql("SELECT * FROM MyUserTable")
table_result2.print()
The error generated is the following:
File ".\flink1.py", line 41, in <module>
source_table = table_env.execute_sql(source_ddl)
File "C:\Users\landr\AppData\Local\Programs\Python\Python38\lib\site-packages\pyflink\table\table_environment.py", line 804, in execute_sql
return TableResult(self._j_tenv.executeSql(stmt))
File "C:\Users\landr\AppData\Local\Programs\Python\Python38\lib\site-packages\py4j\java_gateway.py", line 1285, in __call__
return_value = get_return_value(
File "C:\Users\landr\AppData\Local\Programs\Python\Python38\lib\site-packages\pyflink\util\exceptions.py",
line 147, in deco
return f(*a, **kw)
File "C:\Users\landr\AppData\Local\Programs\Python\Python38\lib\site-packages\py4j\protocol.py", line 326,
in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o10.executeSql.
: org.apache.flink.table.api.SqlParserException: SQL parse failed. Encountered ")" at line 6, column 25.
Was expecting one of:
"CONSTRAINT" ...
"PRIMARY" ...
"UNIQUE" ...
"WATERMARK" ...
<BRACKET_QUOTED_IDENTIFIER> ...
<QUOTED_IDENTIFIER> ...
<BACK_QUOTED_IDENTIFIER> ...
<HYPHENATED_IDENTIFIER> ...
<IDENTIFIER> ...
<UNICODE_QUOTED_IDENTIFIER> ...
at org.apache.flink.table.planner.parse.CalciteParser.parse(CalciteParser.java:56)
at org.apache.flink.table.planner.delegation.ParserImpl.parse(ParserImpl.java:96)
at org.apache.flink.table.api.internal.TableEnvironmentImpl.executeSql(TableEnvironmentImpl.java:722) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at org.apache.flink.api.python.shaded.py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at org.apache.flink.api.python.shaded.py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at org.apache.flink.api.python.shaded.py4j.Gateway.invoke(Gateway.java:282)
at org.apache.flink.api.python.shaded.py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at org.apache.flink.api.python.shaded.py4j.commands.CallCommand.execute(CallCommand.java:79)
at org.apache.flink.api.python.shaded.py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.calcite.sql.parser.SqlParseException: Encountered ")" at line 6, column 25.
Was expecting one of:
"CONSTRAINT" ...
"PRIMARY" ...
"UNIQUE" ...
"WATERMARK" ...
<BRACKET_QUOTED_IDENTIFIER> ...
<QUOTED_IDENTIFIER> ...
<BACK_QUOTED_IDENTIFIER> ...
<HYPHENATED_IDENTIFIER> ...
<IDENTIFIER> ...
<UNICODE_QUOTED_IDENTIFIER> ...
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.convertException(FlinkSqlParserImpl.java:450)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.normalizeException(FlinkSqlParserImpl.java:213)
at org.apache.calcite.sql.parser.SqlParser.handleException(SqlParser.java:140)
at org.apache.calcite.sql.parser.SqlParser.parseQuery(SqlParser.java:155)
at org.apache.calcite.sql.parser.SqlParser.parseStmt(SqlParser.java:180)
at org.apache.flink.table.planner.parse.CalciteParser.parse(CalciteParser.java:54)
... 13 more
Caused by: org.apache.flink.sql.parser.impl.ParseException: Encountered ")" at line 6, column 25.
Was expecting one of:
"CONSTRAINT" ...
"PRIMARY" ...
"UNIQUE" ...
"WATERMARK" ...
<BRACKET_QUOTED_IDENTIFIER> ...
<QUOTED_IDENTIFIER> ...
<BACK_QUOTED_IDENTIFIER> ...
<HYPHENATED_IDENTIFIER> ...
<IDENTIFIER> ...
<UNICODE_QUOTED_IDENTIFIER> ...
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.generateParseException(FlinkSqlParserImpl.java:39782)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.jj_consume_token(FlinkSqlParserImpl.java:39593)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.TableColumn(FlinkSqlParserImpl.java:4835)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.SqlCreateTable(FlinkSqlParserImpl.java:5209)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.SqlCreateExtended(FlinkSqlParserImpl.java:6233)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.SqlCreate(FlinkSqlParserImpl.java:20934)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.SqlStmt(FlinkSqlParserImpl.java:3415)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.SqlStmtEof(FlinkSqlParserImpl.java:3918)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.parseSqlStmtEof(FlinkSqlParserImpl.java:261)
at org.apache.calcite.sql.parser.SqlParser.parseQuery(SqlParser.java:153)
... 15 more

Remove the trailing comma after "column_b INT": the parser reaches the closing ")" while it still expects another column definition, which is exactly what "Encountered ")" at line 6, column 25" points at.
This should work.
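For reference, a sketch of the corrected source DDL (same connector options as in the question; I also dropped the trailing space at the end of the path, and omitted the PRIMARY KEY clause here, since recent Flink versions require PRIMARY KEY ... NOT ENFORCED and the filesystem connector does not use keys anyway):
source_ddl = """
CREATE TABLE MyUserTable (
    column_a INT,
    column_b INT
) WITH (
    'connector' = 'filesystem',
    'path' = 'file:///Users//code/examples/input.csv',
    'format' = 'csv'
)"""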

Related

Converting pandas table to Table API to DataStream using PyFlink

Hello, I am new to PyFlink. I am trying to convert a pandas DataFrame containing strings into a DataStream object in Flink.
Take the following as example code:
from pyflink.datastream import *
from pyflink.table import *
import pandas as pd
import numpy as np
env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(env)
env.set_parallelism(1)
# Create a Pandas DataFrame
#pdf = pd.DataFrame(np.random.rand(1000, 5))
pdf = pd.DataFrame(["abc", "def"])
# Create a PyFlink Table from a Pandas DataFrame
table = t_env.from_pandas(pdf)
table.execute().print()
# interpret the insert-only Table as a DataStream again
res_ds = t_env.to_data_stream(table)
# add a printing sink and execute in DataStream API
res_ds.print()
env.execute()
I get the following error:
Caused by: java.lang.ClassCastException: class org.apache.flink.table.data.binary.BinaryStringData cannot be cast to class java.lang.String (org.apache.flink.table.data.binary.BinaryStringData is in unnamed module of loader 'app'; java.lang.String is in module java.base of loader 'bootstrap')
The problem seems to be the conversion from the Table to the DataStream object; the first print call executes fine.
Output:
$ python PandasConv.py
WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.flink.api.python.shaded.py4j.reflection.MethodInvoker$1 (file:/home/user/Flink/Python_Projects/flinkenv/lib/python3.8/site-packages/pyflink/opt/flink-python-1.16.0.jar) to method java.time.ZoneRegion.getId()
WARNING: Please consider reporting this to the maintainers of org.apache.flink.api.python.shaded.py4j.reflection.MethodInvoker$1
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
+----+--------------------------------+
| op | 0 |
+----+--------------------------------+
| +I | abc |
| +I | def |
+----+--------------------------------+
2 rows in set
Traceback (most recent call last):
File "PandasConv.py", line 25, in <module>
env.execute()
File "/home/user/Flink/Python_Projects/flinkenv/lib/python3.8/site-packages/pyflink/datastream/stream_execution_environment.py", line 764, in execute
return JobExecutionResult(self._j_stream_execution_environment.execute(j_stream_graph))
File "/home/user/Flink/Python_Projects/flinkenv/lib/python3.8/site-packages/py4j/java_gateway.py", line 1321, in __call__
return_value = get_return_value(
File "/home/user/Flink/Python_Projects/flinkenv/lib/python3.8/site-packages/pyflink/util/exceptions.py", line 146, in deco
return f(*a, **kw)
File "/home/user/Flink/Python_Projects/flinkenv/lib/python3.8/site-packages/py4j/protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o46.execute.
: org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
at org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:144)
at org.apache.flink.runtime.minicluster.MiniClusterJobClient.lambda$getJobExecutionResult$3(MiniClusterJobClient.java:141)
at java.base/java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:642)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073)
at org.apache.flink.runtime.rpc.akka.AkkaInvocationHandler.lambda$invokeRpc$1(AkkaInvocationHandler.java:268)
at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:859)
at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:837)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073)
at org.apache.flink.util.concurrent.FutureUtils.doForward(FutureUtils.java:1277)
at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$null$1(ClassLoadingUtils.java:93)
at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$guardCompletionWithContextClassLoader$2(ClassLoadingUtils.java:92)
at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:859)
at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:837)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073)
at org.apache.flink.runtime.concurrent.akka.AkkaFutureUtils$1.onComplete(AkkaFutureUtils.java:47)
at akka.dispatch.OnComplete.internal(Future.scala:300)
at akka.dispatch.OnComplete.internal(Future.scala:297)
at akka.dispatch.japi$CallbackBridge.apply(Future.scala:224)
at akka.dispatch.japi$CallbackBridge.apply(Future.scala:221)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:60)
at org.apache.flink.runtime.concurrent.akka.AkkaFutureUtils$DirectExecutionContext.execute(AkkaFutureUtils.java:65)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:68)
at scala.concurrent.impl.Promise$DefaultPromise.$anonfun$tryComplete$1(Promise.scala:284)
at scala.concurrent.impl.Promise$DefaultPromise.$anonfun$tryComplete$1$adapted(Promise.scala:284)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:284)
at akka.pattern.PromiseActorRef.$bang(AskSupport.scala:621)
at akka.pattern.PipeToSupport$PipeableFuture$$anonfun$pipeTo$1.applyOrElse(PipeToSupport.scala:24)
at akka.pattern.PipeToSupport$PipeableFuture$$anonfun$pipeTo$1.applyOrElse(PipeToSupport.scala:23)
at scala.concurrent.Future.$anonfun$andThen$1(Future.scala:532)
at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:29)
at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:29)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:60)
at akka.dispatch.BatchingExecutor$AbstractBatch.processBatch(BatchingExecutor.scala:63)
at akka.dispatch.BatchingExecutor$BlockableBatch.$anonfun$run$1(BatchingExecutor.scala:100)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:12)
at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:81)
at akka.dispatch.BatchingExecutor$BlockableBatch.run(BatchingExecutor.scala:100)
at akka.dispatch.TaskInvocation.run(AbstractDispatcher.scala:49)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(ForkJoinExecutorConfigurator.scala:48)
at java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290)
at java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020)
at java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656)
at java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594)
at java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183)
Caused by: org.apache.flink.runtime.JobException: Recovery is suppressed by NoRestartBackoffTimeStrategy
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:139)
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.getFailureHandlingResult(ExecutionFailureHandler.java:83)
at org.apache.flink.runtime.scheduler.DefaultScheduler.recordTaskFailure(DefaultScheduler.java:256)
at org.apache.flink.runtime.scheduler.DefaultScheduler.handleTaskFailure(DefaultScheduler.java:247)
at org.apache.flink.runtime.scheduler.DefaultScheduler.onTaskFailed(DefaultScheduler.java:240)
at org.apache.flink.runtime.scheduler.SchedulerBase.onTaskExecutionStateUpdate(SchedulerBase.java:738)
at org.apache.flink.runtime.scheduler.SchedulerBase.updateTaskExecutionState(SchedulerBase.java:715)
at org.apache.flink.runtime.scheduler.SchedulerNG.updateTaskExecutionState(SchedulerNG.java:78)
at org.apache.flink.runtime.jobmaster.JobMaster.updateTaskExecutionState(JobMaster.java:477)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.lambda$handleRpcInvocation$1(AkkaRpcActor.java:309)
at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:83)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcInvocation(AkkaRpcActor.java:307)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:222)
at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:84)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:168)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:24)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:20)
at scala.PartialFunction.applyOrElse(PartialFunction.scala:123)
at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122)
at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:20)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
at akka.actor.Actor.aroundReceive(Actor.scala:537)
at akka.actor.Actor.aroundReceive$(Actor.scala:535)
at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:220)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:580)
at akka.actor.ActorCell.invoke(ActorCell.scala:548)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:270)
at akka.dispatch.Mailbox.run(Mailbox.scala:231)
at akka.dispatch.Mailbox.exec(Mailbox.scala:243)
... 5 more
Caused by: java.lang.ClassCastException: class org.apache.flink.table.data.binary.BinaryStringData cannot be cast to class java.lang.String (org.apache.flink.table.data.binary.BinaryStringData is in unnamed module of loader 'app'; java.lang.String is in module java.base of loader 'bootstrap')
at org.apache.flink.table.runtime.typeutils.serializers.python.StringSerializer.serialize(StringSerializer.java:41)
at org.apache.flink.api.java.typeutils.runtime.RowSerializer.serializePositionBased(RowSerializer.java:306)
at org.apache.flink.api.java.typeutils.runtime.RowSerializer.serialize(RowSerializer.java:280)
at org.apache.flink.api.java.typeutils.runtime.RowSerializer.serialize(RowSerializer.java:72)
at org.apache.flink.api.java.typeutils.runtime.RowSerializer.serializePositionBased(RowSerializer.java:306)
at org.apache.flink.api.java.typeutils.runtime.RowSerializer.serialize(RowSerializer.java:280)
at org.apache.flink.api.java.typeutils.runtime.RowSerializer.serialize(RowSerializer.java:72)
at org.apache.flink.streaming.api.operators.python.process.AbstractExternalOneInputPythonFunctionOperator.processElement(AbstractExternalOneInputPythonFunctionOperator.java:142)
at org.apache.flink.streaming.api.operators.python.process.ExternalPythonProcessOperator.processElement(ExternalPythonProcessOperator.java:111)
at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.pushToOperator(CopyingChainingOutput.java:82)
at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.collect(CopyingChainingOutput.java:57)
at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.collect(CopyingChainingOutput.java:29)
at org.apache.flink.streaming.api.operators.CountingOutput.collect(CountingOutput.java:56)
at org.apache.flink.streaming.api.operators.CountingOutput.collect(CountingOutput.java:29)
at org.apache.flink.table.runtime.operators.sink.OutputConversionOperator.processElement(OutputConversionOperator.java:105)
at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.pushToOperator(CopyingChainingOutput.java:82)
at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.collect(CopyingChainingOutput.java:57)
at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.collect(CopyingChainingOutput.java:29)
at org.apache.flink.streaming.api.operators.CountingOutput.collect(CountingOutput.java:56)
at org.apache.flink.streaming.api.operators.CountingOutput.collect(CountingOutput.java:29)
at org.apache.flink.streaming.api.operators.StreamSourceContexts$ManualWatermarkContext.processAndCollect(StreamSourceContexts.java:418)
at org.apache.flink.streaming.api.operators.StreamSourceContexts$WatermarkContext.collect(StreamSourceContexts.java:513)
at org.apache.flink.streaming.api.operators.StreamSourceContexts$SwitchingOnClose.collect(StreamSourceContexts.java:103)
at org.apache.flink.table.runtime.arrow.sources.ArrowSourceFunction.run(ArrowSourceFunction.java:200)
at org.apache.flink.streaming.api.operators.StreamSource.run(StreamSource.java:110)
at org.apache.flink.streaming.api.operators.StreamSource.run(StreamSource.java:67)
at org.apache.flink.streaming.runtime.tasks.SourceStreamTask$LegacySourceFunctionThread.run(SourceStreamTask.java:333)
I tried casting explicitly, e.g. with "dtype='String'" in pandas or "DataTypes.STRING()" in Flink, but that did not solve the problem.
I have read the following article: https://nightlies.apache.org/flink/flink-docs-release-1.13/docs/ops/debugging/debugging_classloading/#x-cannot-be-cast-to-x-exceptions
But I do not know where to find the config file.
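For reference, my explicit-cast attempt on the Flink side looked roughly like this sketch (I am assuming from_pandas accepts an explicit row type as its second argument, and the field name "0" mirrors the default pandas column label):
from pyflink.table import DataTypes

# Untested sketch: declare the column type explicitly instead of letting
# from_pandas infer it, so the column is carried as a Flink STRING.
table = t_env.from_pandas(
    pdf,
    DataTypes.ROW([DataTypes.FIELD("0", DataTypes.STRING())]))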

pyflink TableException: Failed to execute sql

I use PyFlink to run a Flink streaming job. In standalone mode it works, but in yarn-per-job mode it fails with "pyflink.util.exceptions.TableException: Failed to execute sql".
The yarn-per-job command is: flink run -t yarn-per-job -Djobmanager.memory.process.size=1024mb -Dtaskmanager.memory.process.size=2048mb -ynm flink-cluster -Dtaskmanager.numberOfTaskSlots=2 -pyfs cluster.py ...
The standalone command is: flink run -pyfs cluster.py ...
The Python environment archive is attached in cluster.py:
env = StreamExecutionEnvironment.get_execution_environment()
env_settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
t_env = StreamTableEnvironment.create(env, environment_settings=env_settings)
curr_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
jars = f"""
file://{curr_path}/jars/flink-sql-connector-kafka_2.11-1.13.1.jar;
file://{curr_path}/jars/force-shading-1.13.1.jar"""
t_env.get_config().get_configuration().set_string("pipeline.jars", jars)
t_env.add_python_archive("%s/requirements/flink.zip" % curr_path)
t_env.get_config().set_python_executable("flink.zip/flink/bin/python")
env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
env.set_parallelism(2)
env.get_config().set_auto_watermark_interval(10000)
t_env.get_config().get_configuration().set_boolean("python.fn-execution.memory.managed", True)
parse_log = udaf(LogParser(parsing_params),
input_types=[DataTypes.STRING(), DataTypes.STRING(), DataTypes.STRING(), DataTypes.STRING(),
DataTypes.STRING(), DataTypes.TIMESTAMP(3)],
result_type=DataTypes.STRING(), func_type="pandas")
process_ad = udf(ADProcessor(ad_params), result_type=DataTypes.STRING())
t_env.create_temporary_function('log_parsing_process', parse_log)
t_env.create_temporary_function('ad_process', process_ad)
tumble_window = Tumble.over("5.minutes").on("time_ltz").alias("w")
t_env.execute_sql(f"""
CREATE TABLE source_table(
ip VARCHAR, -- ip address
raws VARCHAR, -- message
host VARCHAR, -- host
log_type VARCHAR, -- type
system_name VARCHAR, -- system
ts BIGINT,
time_ltz AS TO_TIMESTAMP_LTZ(ts, 3),
WATERMARK FOR time_ltz AS time_ltz - INTERVAL '5' SECOND
) WITH (
'connector' = 'kafka',
'topic' = '{source_topic}',
'properties.bootstrap.servers' = '{source_servers}',
'properties.group.id' = '{group_id}',
'scan.startup.mode' = '{auto_offset_reset}',
'format' = 'json'
)
""")
sink_sql = f"""
CREATE TABLE sink (
alert VARCHAR, -- alert
start_time timestamp(3), -- window start timestamp
end_time timestamp(3) -- window end timestamp
) with (
'connector' = 'kafka',
'topic' = '{sink_topic}',
'properties.bootstrap.servers' = '{sink_servers}',
'json.fail-on-missing-field' = 'false',
'json.ignore-parse-errors' = 'true',
'format' = 'json'
)"""
t_env.execute_sql(sink_sql)
t_env.get_config().set_null_check(False)
source_table = t_env.from_path('source_table')
sink_table = source_table.window(tumble_window) \
.group_by("w, log_type") \
.select("log_parsing_process(ip, raws, host, log_type, system_name, time_ltz) AS pattern, "
"w.start AS start_time, "
"w.end AS end_time") \
.select("ad_process(pattern, start_time, end_time) AS alert, start_time, end_time")
sink_table.execute_insert("sink")
Error is:
File "/tmp/pyflink/xxxx/xxxx/workerbee/log_exception_detection_run_on_diff_mode.py ,line 148, in run_flink sink_table_execute_insert("test_sink")
File "/opt/flink/flink-1.13.1_scala_2.12/opt/python/pyflink.zip/pyflink/table/table.py, line 1056 in execute_insert
File "/opt/flink/flink-1.13.1_scala_2.12/opt/python/py4j-0.10.8.1-src.zip/py4j/java_gateway.py", line 1286, in __call__
File "/opt/flink/flink-1.13.1_scala_2.12/opt/python/pyflink.zip/pyflink/util/exceptions.py", line 163, in deco
pyflink.util.exceptions.TableException: Failed to execute sql
at org.apache.flink.table.api.internal.TableEnvironmentImpl.executeInternal(TableEnvironmentImpl.java:777)
at org.apache.flink.table.api.internal.TableEnvironmentImpl.executeInternal(TableEnvironmentImpl.java:742)
at org.apache.flink.table.api.internal.TableImpl.executeInsert(TableImpl.java:572)
at sun.reflect.NativeMetondAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMetondAccessorImpl.invoke(NativeMethodAccessorImpl.hava:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.hava:498)
at org.apache.flink.api.python.shaded.py4j.reflection.MethodInvoker(MethodInvoker.java:244)
at org.apache.flink.api.python.shaded.py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at org.apache.flink.api.python.shaded.py4j.Gateway.invoke(Gateway.java:282)
at org.apache.flink.api.python.shaded.py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at org.apache.flink.api.python.shaded.py4j.commands.CallCommand.execute(CallCommand.java:79)
at org.apache.flink.api.python.shaded.py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
org.apache.flink.client.program.ProgramAbortException: java.lang.RuntimeException: Python process exits with code: 1
NodeManager log:
INFO org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor: launchContainer: [bash, /opt/hadoop_data/tmp/nm-local-dir/usercache/root/appcache/application_1644370510310_0002/container_1644370510310_0002_03_000001/default_container_executor.sh]
WARN org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor: Exit code from container container_1644370510310_0002_03_000001 is : 1
WARN org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor: Exception from container-launch with container ID: container_1644370510310_0002_03_000001 and exit code: 1
ExitCodeException exitCode=1:
at org.apache.hadoop.util.Shell.runCommand(Shell.java:1008)
at org.apache.hadoop.util.Shell.run(Shell.java:901)
at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1213)
at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.launchContainer(DefaultContainerExecutor.java:309)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.launchContainer(ContainerLaunch.java:585)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:373)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:103)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
INFO org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor: Exception from container-launch.
INFO org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor: container id: container_1644370510310_0002_03_000001
INFO org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor: Exit code: 1
WARN org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch: Container launch failed : Container exited with a non-zero exit code 1
INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl: Container container_1644370510310_0002_03_000001 transitioned from RUNNING to EXITED_WITH_FAILURE
This looks like a classloader-related issue. For the classloader.check-leaked-classloader configuration, refer to https://nightlies.apache.org/flink/flink-docs-master/zh/docs/deployment/config/
In addition, you can try the add_jars API instead of setting the pipeline.jars config directly:
def add_jars(self, *jars_path: str):
    """
    Adds a list of jar files that will be uploaded to the cluster and referenced by the job.
    :param jars_path: Path of jars.
    """
    add_jars_to_context_class_loader(jars_path)
    jvm = get_gateway().jvm
    jars_key = jvm.org.apache.flink.configuration.PipelineOptions.JARS.key()
    env_config = jvm.org.apache.flink.python.util.PythonConfigUtil \
        .getEnvironmentConfig(self._j_stream_execution_environment)
    old_jar_paths = env_config.getString(jars_key, None)
    joined_jars_path = ';'.join(jars_path)
    if old_jar_paths and old_jar_paths.strip():
        joined_jars_path = ';'.join([old_jar_paths, joined_jars_path])
    env_config.setString(jars_key, joined_jars_path)
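A minimal usage sketch, reusing the jar paths from the question (note that add_jars takes each path as a separate argument rather than a single ';'-joined string):
env.add_jars(
    f"file://{curr_path}/jars/flink-sql-connector-kafka_2.11-1.13.1.jar",
    f"file://{curr_path}/jars/force-shading-1.13.1.jar")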
After debugging and checking, I finally found the issue: I was missing some Flink Hadoop jar packages:
commons-cli-1.4.jar
flink-shaded-hadoop-3-uber-3.1.1.7.2.1.0-327-9.0.jar
hadoop-yarn-api-3.3.1.jar

pyflink use kafka_source_ddl an error occurred

Question:
I want to use PyFlink to read Kafka messages. Running the command ./bin/flink run -m 10.0.24.13:8081 -py /usr/local/project/cdn_flink/cdn_demo.py shows the error:
File "/usr/local/flink-1.14.2/opt/python/pyflink.zip/pyflink/table/table.py", line 1108, in execute_insert
File "/usr/local/flink-1.14.2/opt/python/py4j-0.10.8.1-src.zip/py4j/java_gateway.py", line 1286, in __call__
File "/usr/local/flink-1.14.2/opt/python/pyflink.zip/pyflink/util/exceptions.py", line 158, in deco
pyflink.util.exceptions.TableException: org.apache.flink.table.api.TableException: findAndCreateTableSink failed.
at org.apache.flink.table.factories.TableFactoryUtil.findAndCreateTableSink(TableFactoryUtil.java:88)
at org.apache.flink.table.factories.TableFactoryUtil.lambda$findAndCreateTableSink$0(TableFactoryUtil.java:116)
at java.util.Optional.orElseGet(Optional.java:267)
at org.apache.flink.table.factories.TableFactoryUtil.findAndCreateTableSink(TableFactoryUtil.java:116)
at org.apache.flink.table.planner.delegation.PlannerBase.getTableSink(PlannerBase.scala:379)
at org.apache.flink.table.planner.delegation.PlannerBase.translateToRel(PlannerBase.scala:222)
at org.apache.flink.table.planner.delegation.PlannerBase$$anonfun$1.apply(PlannerBase.scala:182)
at org.apache.flink.table.planner.delegation.PlannerBase$$anonfun$1.apply(PlannerBase.scala:182)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at org.apache.flink.table.planner.delegation.PlannerBase.translate(PlannerBase.scala:182)
at org.apache.flink.table.api.internal.TableEnvironmentImpl.translate(TableEnvironmentImpl.java:1665)
at org.apache.flink.table.api.internal.TableEnvironmentImpl.executeInternal(TableEnvironmentImpl.java:752)
at org.apache.flink.table.api.internal.TableImpl.executeInsert(TableImpl.java:574)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.flink.api.python.shaded.py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at org.apache.flink.api.python.shaded.py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at org.apache.flink.api.python.shaded.py4j.Gateway.invoke(Gateway.java:282)
at org.apache.flink.api.python.shaded.py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at org.apache.flink.api.python.shaded.py4j.commands.CallCommand.execute(CallCommand.java:79)
at org.apache.flink.api.python.shaded.py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.flink.table.api.TableException: Could not load service provider for table factories.
at org.apache.flink.table.factories.TableFactoryService.discoverFactories(TableFactoryService.java:212)
My environment:
flink-1.14.2
kafka_2.12-3.0.0
zookeeper-3.7.0
apache-flink 1.14.2
python 3.6.8
All processes are running:
7600 StandaloneSessionClusterEntrypoint
13316 TaskManagerRunner # flink
23878 QuorumPeerMain # zk
15705 ConsoleProducer
29721 Jps
31454 Kafka
My code:
cdn_connector_ddl.py
# -*- coding: utf-8 -*-
kafka_source_ddl = """
CREATE TABLE cdn_access_log (
uuid VARCHAR,
client_ip VARCHAR,
request_time BIGINT,
response_size BIGINT
) WITH (
'connector' = 'kafka',
'topic' = 'cdn_access_log',
'properties.bootstrap.servers' = '10.0.24.13:9091',
'scan.startup.mode' = 'earliest-offset',
'format' = 'csv',
'csv.field-delimiter' = ','
)
"""
mysql_sink_ddl = """
CREATE TABLE cdn_access_statistic (
province VARCHAR,
access_count BIGINT,
total_download BIGINT,
download_speed DOUBLE
) WITH (
'connector.type' = 'jdbc',
'connector.url' = 'jdbc:mysql://localhost:3306/hive?autoReconnect=true&failOverReadOnly=false&useUnicode=true&characterEncoding=utf-8&useSSL=false&serverTimezone=GMT%2B8',
'connector.table' = 'cdn_access_statistic',
'connector.username' = 'hive',
'connector.password' = 'hive1234',
'connector.write.flush.interval' = '1s'
)
"""
cdn_demo.py
# -*- coding: utf-8 -*-
import os
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, EnvironmentSettings, TableConfig
from cdn_connector_ddl import kafka_source_ddl, mysql_sink_ddl
def start():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    # add jars
    _path = "file:////usr/local/lib64/python3.6/site-packages/pyflink/lib"
    env.add_jars(os.path.join(_path, 'mysql-connector-java-8.0.27.jar'))
    env.add_jars(os.path.join(_path, 'flink-sql-connector-kafka_2.12-1.14.2.jar'))
    env.add_jars(os.path.join(_path, 'kafka-clients-3.0.0.jar'))
    env.add_jars(os.path.join(_path, 'flink-csv-1.14.2-sql-jar.jar'))
    env.add_jars(os.path.join(_path, 'flink-connector-kafka_2.12-1.14.2.jar'))
    # t_env = StreamTableEnvironment.create(env, TableConfig())
    t_env = StreamTableEnvironment.create(
        env,
        environment_settings=EnvironmentSettings.new_instance().use_blink_planner().build())
    # set source table
    t_env.execute_sql(kafka_source_ddl)
    t_env.execute_sql(mysql_sink_ddl)
    t_env.from_path("cdn_access_log") \
        .select("uuid, "
                "client_ip as province, "
                "response_size, request_time") \
        .group_by("province") \
        .select(
            "province, count(uuid) as access_count, "
            "sum(response_size) as total_download, "
            "sum(response_size) * 1.0 / sum(request_time) as download_speed") \
        .execute_insert("cdn_access_statistic")
    t_env.execute("cdn_access_log")

if __name__ == '__main__':
    start()
I don't know how to solve this. Maybe I should use an older Flink version? Please help me, thanks.
The error shows that it can't find a suitable table sink:
pyflink.util.exceptions.TableException: org.apache.flink.table.api.TableException: findAndCreateTableSink failed.
Two ideas for reference:
Check whether flink-connector-jdbc.jar is loaded; I see you only loaded mysql-connector-java-8.0.27.jar.
Check the JDBC connector options: don't use the legacy connector.xxx keys. You can refer to https://nightlies.apache.org/flink/flink-docs-release-1.14/docs/connectors/table/jdbc/ (a corrected sink DDL sketch is below).
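For illustration, a sketch of the sink DDL rewritten with the 'jdbc' connector options from the Flink 1.14 docs linked above (URL, table name, and credentials copied from the question; untested):
mysql_sink_ddl = """
CREATE TABLE cdn_access_statistic (
    province VARCHAR,
    access_count BIGINT,
    total_download BIGINT,
    download_speed DOUBLE
) WITH (
    'connector' = 'jdbc',
    'url' = 'jdbc:mysql://localhost:3306/hive?autoReconnect=true&failOverReadOnly=false&useUnicode=true&characterEncoding=utf-8&useSSL=false&serverTimezone=GMT%2B8',
    'table-name' = 'cdn_access_statistic',
    'username' = 'hive',
    'password' = 'hive1234',
    'sink.buffer-flush.interval' = '1s'
)
"""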

PyFlink java.io.EOFException at java.io.DataInputStream.readFully

I have a PyFlink job that reads from a file, filters based on a condition, and prints the result. This is the PyFlink script main.py:
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes
from pyflink.table.udf import udf
# https://flink.apache.org/2020/04/09/pyflink-udf-support-flink.html
# https://ci.apache.org/projects/flink/flink-docs-release-1.12/dev/python/table-api-users-guide/udfs/python_udfs.html
@udf(input_types=[DataTypes.STRING()], result_type=DataTypes.MAP(DataTypes.STRING(), DataTypes.STRING()))
def parse(s):
    import json
    # a dummy parser
    res = {'item_id': 123, 'tag': 'a'}
    return res
env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(env)
t_env.register_function("parse", parse)
my_source_ddl = """
create table mySource (
id BIGINT,
contentstr STRING
) with (
'connector' = 'filesystem',
'format' = 'json',
'path' = '/tmp/input'
)
"""
my_sink_ddl = """
create table mySink (
id BIGINT
) with (
'connector' = 'print'
)
"""
my_transform_dml = """
insert into mySink
with t1 as (
select id, parse(contentstr) as content
from mySource
)
select id
from t1
where content['item_id'] is not null
and content['tag'] = 'a'
"""
t_env.execute_sql(my_source_ddl)
t_env.execute_sql(my_sink_ddl)
t_env.execute_sql(my_transform_dml).wait()
To run main.py:
Ensure pyflink==1.12.0 is installed in my conda env.
/tmp/input has a single row of content {"id":1,"tag":"a"}.
Then I run main.py and I get the exception:
Traceback (most recent call last):
File "udf_parse.py", line 53, in <module>
t_env.execute_sql(my_transform_dml).wait()
File "/usr/local/anaconda3/envs/pyflink-quickstart/lib/python3.7/site-packages/pyflink/table/table_result.py", line 76, in wait
get_method(self._j_table_result, "await")()
File "/usr/local/anaconda3/envs/pyflink-quickstart/lib/python3.7/site-packages/py4j/java_gateway.py", line 1286, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/usr/local/anaconda3/envs/pyflink-quickstart/lib/python3.7/site-packages/pyflink/util/exceptions.py", line 147, in deco
return f(*a, **kw)
File "/usr/local/anaconda3/envs/pyflink-quickstart/lib/python3.7/site-packages/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o53.await.
: java.util.concurrent.ExecutionException: org.apache.flink.table.api.TableException: Failed to wait job finish
at java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357)
at java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1908)
at org.apache.flink.table.api.internal.TableResultImpl.awaitInternal(TableResultImpl.java:123)
at org.apache.flink.table.api.internal.TableResultImpl.await(TableResultImpl.java:86)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.flink.api.python.shaded.py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at org.apache.flink.api.python.shaded.py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at org.apache.flink.api.python.shaded.py4j.Gateway.invoke(Gateway.java:282)
at org.apache.flink.api.python.shaded.py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at org.apache.flink.api.python.shaded.py4j.commands.CallCommand.execute(CallCommand.java:79)
at org.apache.flink.api.python.shaded.py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.flink.table.api.TableException: Failed to wait job finish
at org.apache.flink.table.api.internal.InsertResultIterator.hasNext(InsertResultIterator.java:56)
at org.apache.flink.table.api.internal.TableResultImpl$CloseableRowIteratorWrapper.hasNext(TableResultImpl.java:350)
at org.apache.flink.table.api.internal.TableResultImpl$CloseableRowIteratorWrapper.isFirstRowReady(TableResultImpl.java:363)
at org.apache.flink.table.api.internal.TableResultImpl.lambda$awaitInternal$1(TableResultImpl.java:110)
at java.util.concurrent.CompletableFuture$AsyncRun.run(CompletableFuture.java:1640)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
Caused by: java.util.concurrent.ExecutionException: org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
at java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357)
at java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1908)
at org.apache.flink.table.api.internal.InsertResultIterator.hasNext(InsertResultIterator.java:54)
... 7 more
Caused by: org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
at org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:144)
at org.apache.flink.runtime.minicluster.MiniClusterJobClient.lambda$getJobExecutionResult$2(MiniClusterJobClient.java:117)
at java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
at java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
at java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975)
at org.apache.flink.runtime.rpc.akka.AkkaInvocationHandler.lambda$invokeRpc$0(AkkaInvocationHandler.java:237)
at java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774)
at java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750)
at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
at java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975)
at org.apache.flink.runtime.concurrent.FutureUtils$1.onComplete(FutureUtils.java:1046)
at akka.dispatch.OnComplete.internal(Future.scala:264)
at akka.dispatch.OnComplete.internal(Future.scala:261)
at akka.dispatch.japi$CallbackBridge.apply(Future.scala:191)
at akka.dispatch.japi$CallbackBridge.apply(Future.scala:188)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:36)
at org.apache.flink.runtime.concurrent.Executors$DirectExecutionContext.execute(Executors.java:73)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:44)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:252)
at akka.pattern.PromiseActorRef.$bang(AskSupport.scala:572)
at akka.pattern.PipeToSupport$PipeableFuture$$anonfun$pipeTo$1.applyOrElse(PipeToSupport.scala:22)
at akka.pattern.PipeToSupport$PipeableFuture$$anonfun$pipeTo$1.applyOrElse(PipeToSupport.scala:21)
at scala.concurrent.Future$$anonfun$andThen$1.apply(Future.scala:436)
at scala.concurrent.Future$$anonfun$andThen$1.apply(Future.scala:435)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:36)
at akka.dispatch.BatchingExecutor$AbstractBatch.processBatch(BatchingExecutor.scala:55)
at akka.dispatch.BatchingExecutor$BlockableBatch$$anonfun$run$1.apply$mcV$sp(BatchingExecutor.scala:91)
at akka.dispatch.BatchingExecutor$BlockableBatch$$anonfun$run$1.apply(BatchingExecutor.scala:91)
at akka.dispatch.BatchingExecutor$BlockableBatch$$anonfun$run$1.apply(BatchingExecutor.scala:91)
at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:72)
at akka.dispatch.BatchingExecutor$BlockableBatch.run(BatchingExecutor.scala:90)
at akka.dispatch.TaskInvocation.run(AbstractDispatcher.scala:40)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(ForkJoinExecutorConfigurator.scala:44)
at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
Caused by: org.apache.flink.runtime.JobException: Recovery is suppressed by NoRestartBackoffTimeStrategy
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:118)
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.getFailureHandlingResult(ExecutionFailureHandler.java:80)
at org.apache.flink.runtime.scheduler.DefaultScheduler.handleTaskFailure(DefaultScheduler.java:233)
at org.apache.flink.runtime.scheduler.DefaultScheduler.maybeHandleTaskFailure(DefaultScheduler.java:224)
at org.apache.flink.runtime.scheduler.DefaultScheduler.updateTaskExecutionStateInternal(DefaultScheduler.java:215)
at org.apache.flink.runtime.scheduler.SchedulerBase.updateTaskExecutionState(SchedulerBase.java:669)
at org.apache.flink.runtime.scheduler.SchedulerNG.updateTaskExecutionState(SchedulerNG.java:89)
at org.apache.flink.runtime.jobmaster.JobMaster.updateTaskExecutionState(JobMaster.java:447)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcInvocation(AkkaRpcActor.java:305)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:212)
at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:77)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:158)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21)
at scala.PartialFunction$class.applyOrElse(PartialFunction.scala:123)
at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:170)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
at akka.actor.Actor$class.aroundReceive(Actor.scala:517)
at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592)
at akka.actor.ActorCell.invoke(ActorCell.scala:561)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258)
at akka.dispatch.Mailbox.run(Mailbox.scala:225)
at akka.dispatch.Mailbox.exec(Mailbox.scala:235)
... 4 more
Caused by: java.io.EOFException
at java.io.DataInputStream.readFully(DataInputStream.java:197)
at java.io.DataInputStream.readFully(DataInputStream.java:169)
at org.apache.flink.table.runtime.typeutils.StringDataSerializer.deserializeInternal(StringDataSerializer.java:88)
at org.apache.flink.table.runtime.typeutils.StringDataSerializer.deserialize(StringDataSerializer.java:82)
at org.apache.flink.table.runtime.typeutils.StringDataSerializer.deserialize(StringDataSerializer.java:34)
at org.apache.flink.table.runtime.typeutils.serializers.python.MapDataSerializer.deserializeInternal(MapDataSerializer.java:129)
at org.apache.flink.table.runtime.typeutils.serializers.python.MapDataSerializer.deserialize(MapDataSerializer.java:110)
at org.apache.flink.table.runtime.typeutils.serializers.python.MapDataSerializer.deserialize(MapDataSerializer.java:46)
at org.apache.flink.table.runtime.typeutils.serializers.python.RowDataSerializer.deserialize(RowDataSerializer.java:106)
at org.apache.flink.table.runtime.typeutils.serializers.python.RowDataSerializer.deserialize(RowDataSerializer.java:49)
at org.apache.flink.table.runtime.operators.python.scalar.RowDataPythonScalarFunctionOperator.emitResult(RowDataPythonScalarFunctionOperator.java:81)
at org.apache.flink.streaming.api.operators.python.AbstractPythonFunctionOperator.emitResults(AbstractPythonFunctionOperator.java:250)
at org.apache.flink.streaming.api.operators.python.AbstractPythonFunctionOperator.invokeFinishBundle(AbstractPythonFunctionOperator.java:273)
at org.apache.flink.streaming.api.operators.python.AbstractPythonFunctionOperator.processWatermark(AbstractPythonFunctionOperator.java:199)
at org.apache.flink.streaming.runtime.tasks.ChainingOutput.emitWatermark(ChainingOutput.java:123)
at org.apache.flink.streaming.runtime.tasks.SourceOperatorStreamTask$AsyncDataOutputToOutput.emitWatermark(SourceOperatorStreamTask.java:170)
at org.apache.flink.streaming.runtime.tasks.SourceOperatorStreamTask.advanceToEndOfEventTime(SourceOperatorStreamTask.java:110)
at org.apache.flink.streaming.runtime.tasks.SourceOperatorStreamTask.afterInvoke(SourceOperatorStreamTask.java:116)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:589)
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:755)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:570)
at java.lang.Thread.run(Thread.java:748)
The issue is probably related to the udf. Any help? Thanks!
The root cause is:
@udf(input_types=[DataTypes.STRING()], result_type=DataTypes.MAP(DataTypes.STRING(), DataTypes.STRING()))
def parse(s):
    import json
    # a dummy parser
    res = {'item_id': 123, 'tag': 'a'}
    return res
The result_type is DataTypes.MAP(DataTypes.STRING(), DataTypes.STRING()), but item_id has an integer value.
A fix is simple:
@udf(input_types=[DataTypes.STRING()], result_type=DataTypes.MAP(DataTypes.STRING(), DataTypes.STRING()))
def parse(s):
    import json
    # a dummy parser; stringify keys and values so they match the declared MAP<STRING, STRING> type
    res = {'item_id': 123, 'tag': 'a'}
    res = {str(k): str(v) for k, v in res.items()}
    return res

PyFlink - Issue using Scala UDF in JAR

I'm trying to register a Scala UDF in PyFlink using an external JAR as follows, but get the error below.
Scala UDF:
package com.dummy
import org.apache.flink.table.functions.ScalarFunction
class dummyTransform(factor: Int) extends ScalarFunction {
  def eval(s: String): Int = {
    s.hashCode()
  }
}
build.sbt:
name := "hello_scala_for_flink"
version := "0.1"
scalaVersion := "2.12.11"
libraryDependencies += "org.apache.flink" % "flink-table-common" % "1.11.2" % "provided"
assembly.sbt:
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5")
Scala project structure:
hello_scala_for_flink/
  project/
  src/
    main/
      resources/
      scala/
        com.dummy/
          dummyTransform
          hello_scala
    test/
  target/
  build.sbt
shell:
cd hello_scala_for_flink
sbt assembly
cp ./target/scala-2.12/hello_scala_for_flink-assembly-0.1.jar /Users/py-r/opt/anaconda3/envs/venv_pyflink_37/lib/python3.7/site-packages/pyflink/lib/
Python:
from pyflink.dataset import ExecutionEnvironment
exec_env = ExecutionEnvironment.get_execution_environment()
#exec_env = StreamExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
table_env = BatchTableEnvironment.create(exec_env, t_config)
table_env.register_java_function("hash_code","com.dummy.dummyTransform")
Error (incl. after restarting Anaconda):
Py4JJavaError: An error occurred while calling o12.newInstance.
: java.lang.InstantiationException: com.dummy.dummyTransform
at java.base/java.lang.Class.newInstance(Class.java:598)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:564)
at org.apache.flink.api.python.shaded.py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at org.apache.flink.api.python.shaded.py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at org.apache.flink.api.python.shaded.py4j.Gateway.invoke(Gateway.java:282)
at org.apache.flink.api.python.shaded.py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at org.apache.flink.api.python.shaded.py4j.commands.CallCommand.execute(CallCommand.java:79)
at org.apache.flink.api.python.shaded.py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.base/java.lang.Thread.run(Thread.java:832)
Caused by: java.lang.NoSuchMethodException: com.dummy.dummyTransform.<init>()
at java.base/java.lang.Class.getConstructor0(Class.java:3427)
at java.base/java.lang.Class.newInstance(Class.java:585)
... 11 more
Versions:
jdk = 1.8.0_151.jdk
scala = 2.12.11
python = 3.7
apache-beam = 2.19.0
apache-flink = 1.11.2
Any idea what the issue might be?
Thanks for your support.
Looks like I found the issue myself. Apparently a no-argument constructor was required in the above code (Flink instantiates the UDF via reflection, hence the NoSuchMethodException on <init>()):
class dummyTransform(factor: Int) extends ScalarFunction {
  def eval(s: String): Int = {
    s.hashCode() * factor
  }
  def this() = this(1)
}
Also, I changed to Scala 2.11.12 due to another error. Now everything seems to work. Awesome!
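For completeness, a quick PyFlink-side sanity check of the rebuilt JAR could look like this sketch (the element data and column name are hypothetical; the registration call is the same as in the question):
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, TableConfig

exec_env = ExecutionEnvironment.get_execution_environment()
table_env = BatchTableEnvironment.create(exec_env, TableConfig())
table_env.register_java_function("hash_code", "com.dummy.dummyTransform")

# Hypothetical usage: apply the Scala UDF to a one-column table.
t = table_env.from_elements([("hello",), ("world",)], ["s"])
t.select("hash_code(s)")  # resolves now instead of failing with InstantiationException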
