pyflink TableException: Failed to execute sql - apache-flink

I use pyflink to run flink streaming, if I run flink with StandAlone mode, it works, but run flink with yarn-per-job mode, it failed, report "pyflink.util.exceptions.TableException: Failed to execute sql"
yarn per job command is: flink run -t yarn-per-job -Djobmanager.memory.process.size=1024mb -Dtaskmanager.memory.process.size=2048mb -ynm flink-cluster -Dtaskmanager.numberOfTaskSlots=2 -pyfs cluster.py ...
standalone command is: flink run -pyfs cluster.py ...
The python environment archive attached in cluster.py.
env = StreamExecutionEnvironment.get_execution_environment()
env_settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
t_env = StreamTableEnvironment.create(env, environment_settings=env_settings)
curr_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
jars = f"""
file://{curr_path}/jars/flink-sql-connector-kafka_2.11-1.13.1.jar;
file://{curr_path}/jars/force-shading-1.13.1.jar"""
t_env.get_config().get_configuration().set_string("pipeline.jars", jars)
t_env.add_python_archive("%s/requirements/flink.zip" % curr_path)
t_env.get_config().set_python_executable("flink.zip/flink/bin/python")
env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
env.set_parallelism(2)
env.get_config().set_auto_watermark_interval(10000)
t_env.get_config().get_configuration().set_boolean("python.fn-execution.memory.managed", True)
parse_log = udaf(LogParser(parsing_params),
input_types=[DataTypes.STRING(), DataTypes.STRING(), DataTypes.STRING(), DataTypes.STRING(),
DataTypes.STRING(), DataTypes.TIMESTAMP(3)],
result_type=DataTypes.STRING(), func_type="pandas")
process_ad = udf(ADProcessor(ad_params), result_type=DataTypes.STRING())
t_env.create_temporary_function('log_parsing_process', parse_log)
t_env.create_temporary_function('ad_process', process_ad)
tumble_window = Tumble.over("5.minutes").on("time_ltz").alias("w")
t_env.execute_sql(f"""
CREATE TABLE source_table(
ip VARCHAR, -- ip address
raws VARCHAR, -- message
host VARCHAR, -- host
log_type VARCHAR, -- type
system_name VARCHAR, -- system
ts BIGINT,
time_ltz AS TO_TIMESTAMP_LTZ(ts, 3),
WATERMARK FOR time_ltz AS time_ltz - INTERVAL '5' SECOND
) WITH (
'connector' = 'kafka',
'topic' = '{source_topic}',
'properties.bootstrap.servers' = '{source_servers}',
'properties.group.id' = '{group_id}',
'scan.startup.mode' = '{auto_offset_reset}',
'format' = 'json'
)
""")
sink_sql = f"""
CREATE TABLE sink (
alert VARCHAR, -- alert
start_time timestamp(3), -- window start timestamp
end_time timestamp(3) -- window end timestamp
) with (
'connector' = 'kafka',
'topic' = '{sink_topic}',
'properties.bootstrap.servers' = '{sink_servers}',
'json.fail-on-missing-field' = 'false',
'json.ignore-parse-errors' = 'true',
'format' = 'json'
)"""
t_env.execute_sql(sink_sql)
t_env.get_config().set_null_check(False)
source_table = t_env.from_path('source_table')
sink_table = source_table.window(tumble_window) \
.group_by("w, log_type") \
.select("log_parsing_process(ip, raws, host, log_type, system_name, time_ltz) AS pattern, "
"w.start AS start_time, "
"w.end AS end_time") \
.select("ad_process(pattern, start_time, end_time) AS alert, start_time, end_time")
sink_table.execute_insert("sink")
Error is:
File "/tmp/pyflink/xxxx/xxxx/workerbee/log_exception_detection_run_on_diff_mode.py ,line 148, in run_flink sink_table_execute_insert("test_sink")
File "/opt/flink/flink-1.13.1_scala_2.12/opt/python/pyflink.zip/pyflink/table/table.py, line 1056 in execute_insert
File "/opt/flink/flink-1.13.1_scala_2.12/opt/python/py4j-0.10.8.1-src.zip/py4j/java_gateway.py", line 1286, in __call__
File "/opt/flink/flink-1.13.1_scala_2.12/opt/python/pyflink.zip/pyflink/util/exceptions.py", line 163, in deco
pyflink.util.exceptions.TableException: Failed to execute sql
at org.apache.flink.table.api.internal.TableEnvironmentImpl.executeInternal(TableEnvironmentImpl.java:777)
at org.apache.flink.table.api.internal.TableEnvironmentImpl.executeInternal(TableEnvironmentImpl.java:742)
at org.apache.flink.table.api.internal.TableImpl.executeInsert(TableImpl.java:572)
at sun.reflect.NativeMetondAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMetondAccessorImpl.invoke(NativeMethodAccessorImpl.hava:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.hava:498)
at org.apache.flink.api.python.shaded.py4j.reflection.MethodInvoker(MethodInvoker.java:244)
at org.apache.flink.api.python.shaded.py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at org.apache.flink.api.python.shaded.py4j.Gateway.invoke(Gateway.java:282)
at org.apache.flink.api.python.shaded.py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at org.apache.flink.api.python.shaded.py4j.commands.CallCommand.execute(CallCommand.java:79)
at org.apache.flink.api.python.shaded.py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
org.apache.flink.client.program.ProgramAbortException: java.lang.RuntimeException: Python process exits with code: 1
nodemanager log:
INFO org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor: launchContainer: [bash, /opt/hadoop_data/tmp/nm-local-dir/usercache/root/appcache/applicatino_I1644370510310_0002/container_I1644370510310_0002_03_000001/default_container_executor.sh]
WARN org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor: Exit code from container container_I1644370510310_0002_03_000001 is : 1
WARN org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor: Exception from container-launch with container ID: container_I1644370510310_0002_03_000001 and exit exit code: 1
ExitCodeException exitCode=1:
at org.apache.hadoop.util.Shell.runCommand(Shell.java: 1008)
at org.apache.hadoop.util.Shell.run(Shell.java: 901)
at org.apache.hadoop.util.Shell$ShellCommandExceutor.execute(Shell.java:1213
at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.launchContainer(DefaultContainerExecutor.java:309)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.launchContainer(ContainerLaunch.java:585)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.Call(ContainerLaunch.java:373)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.Call(ContainerLaunch.java:103)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPollExecutor.runWorker(ThreadPollExecutor.java:1149)
at java.util.concurrent.ThreadPollExecutor$Worker.run(ThreadPollExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
INFO org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor: Exception from container-launch.
INFO org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor: container id: container_I1644370510310_0002_03_000001
INFO org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor: Exit code: 1
WARN org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch: Container launch failed : Container exited with a non-zero exit code 1
INFO org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl: Container container_I1644370510310_0002_03_000001 transitioned from RUNNING to EXITED_WITH_FAILURE

Looks like a classloader related issue. classloader.check-leaked-classloader configuration can refer to https://nightlies.apache.org/flink/flink-docs-master/zh/docs/deployment/config/
In addition, you can try to use add_jar api instead of setting pipeline.jars config directly
def add_jars(self, *jars_path: str):
"""
Adds a list of jar files that will be uploaded to the cluster and referenced by the job.
:param jars_path: Path of jars.
"""
add_jars_to_context_class_loader(jars_path)
jvm = get_gateway().jvm
jars_key = jvm.org.apache.flink.configuration.PipelineOptions.JARS.key()
env_config = jvm.org.apache.flink.python.util.PythonConfigUtil \
.getEnvironmentConfig(self._j_stream_execution_environment)
old_jar_paths = env_config.getString(jars_key, None)
joined_jars_path = ';'.join(jars_path)
if old_jar_paths and old_jar_paths.strip():
joined_jars_path = ';'.join([old_jar_paths, joined_jars_path])
env_config.setString(jars_key, joined_jars_path)

after debug and check, finally I found the issue is I missed some flink hadoop jar packages:
commons-cli-1.4.jar
flink-shaded-hadoop-3-uber-3.1.1.7.2.1.0-327-9.0.jar
hadoop-yarn-api-3.3.1.jar

Related

Apache Flink - streaming app doesn't start from checkpoint after stop and start

I have the following Flink streaming application running locally, written with the SQL API:
object StreamingKafkaJsonsToCsvLocalFs {
val brokers = "localhost:9092"
val topic = "test-topic"
val consumerGroupId = "test-consumer"
val kafkaTableName = "KafKaTable"
val targetTable = "TargetCsv"
val targetPath = f"file://${new java.io.File(".").getCanonicalPath}/kafka-to-fs-csv"
def generateKafkaTableDDL(): String = {
s"""
|CREATE TABLE $kafkaTableName (
| `kafka_offset` BIGINT METADATA FROM 'offset',
| `seller_id` STRING
|) WITH (
| 'connector' = 'kafka',
| 'topic' = '$topic',
| 'properties.bootstrap.servers' = 'localhost:9092',
| 'properties.group.id' = '$consumerGroupId',
| 'scan.startup.mode' = 'earliest-offset',
| 'format' = 'json'
|)
|""".stripMargin
}
def generateTargetTableDDL(): String = {
s"""
|CREATE TABLE $targetTable (
| `kafka_offset` BIGINT,
| `seller_id` STRING
| )
|WITH (
| 'connector' = 'filesystem',
| 'path' = '$targetPath',
| 'format' = 'csv',
| 'sink.rolling-policy.rollover-interval' = '10 seconds',
| 'sink.rolling-policy.check-interval' = '1 seconds'
|)
|""".stripMargin
}
def main(args: Array[String]): Unit = {
val env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI()
env.enableCheckpointing(1000)
env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
env.getCheckpointConfig.setCheckpointStorage(s"$targetPath/checkpoints")
val settings = EnvironmentSettings.newInstance()
.inStreamingMode()
.build()
val tblEnv = StreamTableEnvironment.create(env, settings)
tblEnv.executeSql(generateKafkaTableDDL())
tblEnv.executeSql(generateTargetTableDDL())
tblEnv.from(kafkaTableName).executeInsert(targetTable).await()
tblEnv.executeSql("kafka-json-to-fs")
}
}
As you can see, the checkpointing is enabled and when I execute this application I see that the checkpoint folder is created and populated.
The problem that I am facing with is -- when I stop&start my application (from the IDE) I expect it to start from the same point it stopped in the previous execution but instead I see that it consumes all the offsets from the earliest offset in the topic (I see it from the new generated output files that contain zero offset although the previous run processed those offsets).
What am I missing about checkpointing in Flink? I would expect it to be exactly once.
Flink only restarts from a checkpoint when recovering from a failure, or when explicitly restarted from a retained checkpoint via the command line or REST API. Otherwise, the KafkaSource starts from the offsets configured in the code, which defaults to the earliest offsets.
If you have no other state, you could instead rely on the committed offsets as the source of truth, and configure the Kafka connector to use the committed offsets as the starting position.
Flink's fault tolerance via checkpointing isn't designed to support mini-cluster deployments like the one used when running in an IDE. Normally the job manager and task managers are running in separate processes, and the job manager can detect that a task manager has failed, and can arrange for a restart.

PyFlink 14.2 - Table API DDL - Semantic Exactly Once

I've had the scenario where I define a kafka source, UDF | UDTF for processing and sink to a Kafka sink. Doesn't matter what I do, if I run the job, the output is flood with the processed output of a single input record. For illustrative purposes, this is what's output on the defined kafka sink topic:
distinct timestamps, showing that the UDF is entered for each respective input record, but the same input record was processed:
By trying to figure out the problem I've read through whatever flink documentation I could find (and rabbit hole of links) in terms of enforcing 'semantic EXACTLY ONCE' processing of records. As far as I can gather it comes down to these following settings:
This guy presented the best visual representation for me to understand semantic_once_video
Kafka source (consumer)
kafka source property of isolation level = read_committed
Kafka sinks (producer)
Kafka sink property of processing mode = exactly_once
Kafka sink property of idempotence = true
Utilizing checkpointing
Also referencing stackoverflow questions I could find on the topic (mainly discussing in terms of Java implementations)... needless to say, still not resolved. Here's my code for reference:
import os
from pyflink.datastream.stream_execution_environment import StreamExecutionEnvironment
from pyflink.table import TableEnvironment, EnvironmentSettings, DataTypes, StreamTableEnvironment
from pyflink.table.udf import ScalarFunction, TableFunction, udf, udtf
from pyflink.datastream.checkpointing_mode import CheckpointingMode
KAFKA_SERVERS = os.getenv('KAFKA_BS_SERVERS',"localhost:9094").split(',')
KAFKA_USERNAME = "xxx"
KAFKA_PASSWORD = "_pass_"
KAFKA_SOURCE_TOPIC = 'source_topic'
KAFKA_SINK_TOPIC = 'sink_topic'
KAFKA_GROUP_ID = 'testgroup12'
JAR_DEPENDENCIES = os.getenv('JAR_DEPENDENCIES', '/opt/flink/lib_py')
class tbl_function(TableFunction):
def open(self, function_context):
pass
def eval(self, *args):
import json
from datetime import datetime
res = {
'time': str(datetime.utcnow()),
'input': json.loads(args[0])
}
yield json.dumps(res)
def pipeline():
env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
for file in os.listdir(JAR_DEPENDENCIES):
if file.find('.jar') != -1:
env.add_jars(f"file://{JAR_DEPENDENCIES}/{file}")
print(f"added jar dep: {JAR_DEPENDENCIES}/{file}")
env.enable_checkpointing(60000, CheckpointingMode.EXACTLY_ONCE)
env.get_checkpoint_config().set_min_pause_between_checkpoints(120000)
env.get_checkpoint_config().enable_unaligned_checkpoints()
env.get_checkpoint_config().set_checkpoint_interval(30000)
settings = EnvironmentSettings.new_instance()\
.in_streaming_mode()\
.use_blink_planner()\
.build()
t_env = StreamTableEnvironment.create(stream_execution_environment= env, environment_settings=settings)
source_ddl = f"""
CREATE TABLE source_table(
entry STRING
) WITH (
'connector' = 'kafka',
'topic' = '{KAFKA_SOURCE_TOPIC}',
'properties.bootstrap.servers' = '{','.join(KAFKA_SERVERS)}',
'properties.isolation_level' = 'read_committed',
'properties.group.id' = '{KAFKA_GROUP_ID}',
'properties.sasl.mechanism' = 'PLAIN',
'properties.security.protocol' = 'SASL_PLAINTEXT',
'properties.sasl.jaas.config' = 'org.apache.kafka.common.security.plain.PlainLoginModule required username=\"{KAFKA_USERNAME}\" password=\"{KAFKA_PASSWORD}\";',
'scan.startup.mode' = 'earliest-offset',
'format' = 'raw'
)
"""
sink_ddl = f"""
CREATE TABLE sink_table(
entry STRING
) WITH (
'connector' = 'kafka',
'topic' = '{KAFKA_SINK_TOPIC}',
'properties.bootstrap.servers' = '{','.join(KAFKA_SERVERS)}',
'properties.group.id' = '{KAFKA_GROUP_ID}',
'properties.processing.mode' = 'exactly_once',
'properties.enable.idempotence' = 'true',
'properties.sasl.mechanism' = 'PLAIN',
'properties.security.protocol' = 'SASL_PLAINTEXT',
'properties.sasl.jaas.config' = 'org.apache.kafka.common.security.plain.PlainLoginModule required username=\"{KAFKA_USERNAME}\" password=\"{KAFKA_PASSWORD}\";',
'format' = 'raw'
)
"""
t_env.execute_sql(source_ddl).wait()
t_env.execute_sql(sink_ddl).wait()
f = tbl_function()
table_udf = udtf(f, result_types=[DataTypes.STRING()])
t_env.create_temporary_function("table_f", table_udf)
table = t_env.from_path('source_table')
table = table.join_lateral('table_f(entry) as (content)')
table = table.select('content').alias('entry')
table.insert_into('sink_table')
from datetime import datetime
t_env.execute(f"dummy_test_{str(datetime.now())}")
if __name__ == '__main__':
pipeline()
jar dependencies:
added jar dep: /opt/flink/lib_py/flink-sql-connector-kafka_2.12-1.14.2.jar
added jar dep: /opt/flink/lib_py/flink-connector-kafka_2.12-1.14.2.jar
added jar dep: /opt/flink/lib_py/kafka-clients-2.4.1.jar
After a whole bunch of trial-error, and still not precisely knowing why this resolved the issue (or underlying pyflink issue?), I found that if you utilize a table meta-data field in your source definition, that would somehow initialize or synchronize your pipeline to produce a semantic.EXACTLY_ONCE data flow (1 record in = 1 record out, no duplicates).
The only change that I had to made is 1 line of meta data code in the DDL source definition. (Again providing my full script for reference):
import os
from pyflink.datastream.stream_execution_environment import StreamExecutionEnvironment
from pyflink.table import TableEnvironment, EnvironmentSettings, DataTypes, StreamTableEnvironment
from pyflink.table.udf import ScalarFunction, TableFunction, udf, udtf
from pyflink.datastream.checkpointing_mode import CheckpointingMode
KAFKA_SERVERS = os.getenv('KAFKA_BS_SERVERS',"localhost:9094").split(',')
KAFKA_USERNAME = "xxx"
KAFKA_PASSWORD = "_pass_"
KAFKA_SOURCE_TOPIC = 'source_topic'
KAFKA_SINK_TOPIC = 'sink_topic'
KAFKA_GROUP_ID = 'testgroup12'
JAR_DEPENDENCIES = os.getenv('JAR_DEPENDENCIES', '/opt/flink/lib_py')
class tbl_function(TableFunction):
def open(self, function_context):
pass
def eval(self, *args):
import json
from datetime import datetime
res = {
'time': str(datetime.utcnow()),
'input': json.loads(args[0])
}
yield json.dumps(res)
def pipeline():
env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
for file in os.listdir(JAR_DEPENDENCIES):
if file.find('.jar') != -1:
env.add_jars(f"file://{JAR_DEPENDENCIES}/{file}")
print(f"added jar dep: {JAR_DEPENDENCIES}/{file}")
env.enable_checkpointing(60000, CheckpointingMode.EXACTLY_ONCE)
env.get_checkpoint_config().set_min_pause_between_checkpoints(120000)
env.get_checkpoint_config().enable_unaligned_checkpoints()
env.get_checkpoint_config().set_checkpoint_interval(30000)
settings = EnvironmentSettings.new_instance()\
.in_streaming_mode()\
.use_blink_planner()\
.build()
t_env = StreamTableEnvironment.create(stream_execution_environment= env, environment_settings=settings)
# this sneaky bugger line -> with 'event_time'
source_ddl = f"""
CREATE TABLE source_table(
entry STRING,
event_time TIMESTAMP(3) METADATA FROM 'timestamp'
) WITH (
'connector' = 'kafka',
'topic' = '{KAFKA_SOURCE_TOPIC}',
'properties.bootstrap.servers' = '{','.join(KAFKA_SERVERS)}',
'properties.isolation_level' = 'read_committed',
'properties.group.id' = '{KAFKA_GROUP_ID}',
'properties.sasl.mechanism' = 'PLAIN',
'properties.security.protocol' = 'SASL_PLAINTEXT',
'properties.sasl.jaas.config' = 'org.apache.kafka.common.security.plain.PlainLoginModule required username=\"{KAFKA_USERNAME}\" password=\"{KAFKA_PASSWORD}\";',
'scan.startup.mode' = 'earliest-offset',
'format' = 'raw'
)
"""
sink_ddl = f"""
CREATE TABLE sink_table(
entry STRING
) WITH (
'connector' = 'kafka',
'topic' = '{KAFKA_SINK_TOPIC}',
'properties.bootstrap.servers' = '{','.join(KAFKA_SERVERS)}',
'properties.group.id' = '{KAFKA_GROUP_ID}',
'properties.processing.mode' = 'exactly_once',
'properties.enable.idempotence' = 'true',
'properties.sasl.mechanism' = 'PLAIN',
'properties.security.protocol' = 'SASL_PLAINTEXT',
'properties.sasl.jaas.config' = 'org.apache.kafka.common.security.plain.PlainLoginModule required username=\"{KAFKA_USERNAME}\" password=\"{KAFKA_PASSWORD}\";',
'format' = 'raw'
)
"""
t_env.execute_sql(source_ddl).wait()
t_env.execute_sql(sink_ddl).wait()
f = tbl_function()
table_udf = udtf(f, result_types=[DataTypes.STRING()])
t_env.create_temporary_function("table_f", table_udf)
table = t_env.from_path('source_table')
table = table.join_lateral('table_f(entry) as (content)')
table = table.select('content').alias('entry')
table.insert_into('sink_table')
from datetime import datetime
t_env.execute(f"dummy_test_{str(datetime.now())}")
if __name__ == '__main__':
pipeline()
Hope this saves someone time, unlike the 3 days I spent in trial-error #Sigh :(

pyflink use kafka_source_ddl an error occurred

question:
i want to use pyflink read kafka msg
run commad./bin/flink run -m 10.0.24.13:8081 -py /usr/local/project/cdn_flink/cdn_demo.py ,show error:
File "/usr/local/flink-1.14.2/opt/python/pyflink.zip/pyflink/table/table.py", line 1108, in execute_insert
File "/usr/local/flink-1.14.2/opt/python/py4j-0.10.8.1-src.zip/py4j/java_gateway.py", line 1286, in __call__
File "/usr/local/flink-1.14.2/opt/python/pyflink.zip/pyflink/util/exceptions.py", line 158, in deco
pyflink.util.exceptions.TableException: org.apache.flink.table.api.TableException: findAndCreateTableSink failed.
at org.apache.flink.table.factories.TableFactoryUtil.findAndCreateTableSink(TableFactoryUtil.java:88)
at org.apache.flink.table.factories.TableFactoryUtil.lambda$findAndCreateTableSink$0(TableFactoryUtil.java:116)
at java.util.Optional.orElseGet(Optional.java:267)
at org.apache.flink.table.factories.TableFactoryUtil.findAndCreateTableSink(TableFactoryUtil.java:116)
at org.apache.flink.table.planner.delegation.PlannerBase.getTableSink(PlannerBase.scala:379)
at org.apache.flink.table.planner.delegation.PlannerBase.translateToRel(PlannerBase.scala:222)
at org.apache.flink.table.planner.delegation.PlannerBase$$anonfun$1.apply(PlannerBase.scala:182)
at org.apache.flink.table.planner.delegation.PlannerBase$$anonfun$1.apply(PlannerBase.scala:182)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at org.apache.flink.table.planner.delegation.PlannerBase.translate(PlannerBase.scala:182)
at org.apache.flink.table.api.internal.TableEnvironmentImpl.translate(TableEnvironmentImpl.java:1665)
at org.apache.flink.table.api.internal.TableEnvironmentImpl.executeInternal(TableEnvironmentImpl.java:752)
at org.apache.flink.table.api.internal.TableImpl.executeInsert(TableImpl.java:574)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.flink.api.python.shaded.py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at org.apache.flink.api.python.shaded.py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at org.apache.flink.api.python.shaded.py4j.Gateway.invoke(Gateway.java:282)
at org.apache.flink.api.python.shaded.py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at org.apache.flink.api.python.shaded.py4j.commands.CallCommand.execute(CallCommand.java:79)
at org.apache.flink.api.python.shaded.py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.flink.table.api.TableException: Could not load service provider for table factories.
at org.apache.flink.table.factories.TableFactoryService.discoverFactories(TableFactoryService.java:212)
my env
flink-1.14.2
kafka_2.12-3.0.0
zookeeper-3.7.0
apache-flink 1.14.2
python 3.6.8
all process is running:
7600 StandaloneSessionClusterEntrypoint
13316 TaskManagerRunner # flink
23878 QuorumPeerMain # zk
15705 ConsoleProducer
29721 Jps
31454 Kafka
my code
cdn_connector_ddl.py
# --coding=utf8 --
kafka_source_ddl = """
CREATE TABLE cdn_access_log (
uuid VARCHAR,
client_ip VARCHAR,
request_time BIGINT,
response_size BIGINT
) WITH (
'connector' = 'kafka',
'topic' = 'cdn_access_log',
'properties.bootstrap.servers' = '10.0.24.13:9091',
'scan.startup.mode' = 'earliest-offset',
'format' = 'csv',
'csv.field-delimiter' = ','
)
"""
mysql_sink_ddl = """
CREATE TABLE cdn_access_statistic (
province VARCHAR,
access_count BIGINT,
total_download BIGINT,
download_speed DOUBLE
) WITH (
'connector.type' = 'jdbc',
'connector.url' = 'jdbc:mysql://localhost:3306/hive?autoReconnect=true&failOverReadOnly=false&useUnicode=true&characterEncoding=utf-8&useSSL=false&serverTimezone=GMT%2B8',
'connector.table' = 'cdn_access_statistic',
'connector.username' = 'hive',
'connector.password' = 'hive1234',
'connector.write.flush.interval' = '1s'
)
cdn_demo.py
# --coding=utf8 --
import os
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, EnvironmentSettings, TableConfig
from cdn_connector_ddl import kafka_source_ddl, mysql_sink_ddl
def start():
env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
# add jar
_path = "file:////usr/local/lib64/python3.6/site-packages/pyflink/lib"
env.add_jars(os.path.join(_path, 'mysql-connector-java-8.0.27.jar'))
env.add_jars(os.path.join(_path, 'flink-sql-connector-kafka_2.12-1.14.2.jar'))
env.add_jars(os.path.join(_path, 'kafka-clients-3.0.0.jar'))
env.add_jars(os.path.join(_path, 'flink-csv-1.14.2-sql-jar.jar'))
env.add_jars(os.path.join(_path, 'flink-connector-kafka_2.12-1.14.2.jar'))
# t_env = StreamTableEnvironment.create(env, TableConfig())
t_env = StreamTableEnvironment.create(
env,
environment_settings=EnvironmentSettings.new_instance().use_blink_planner().build())
# set source table
t_env.execute_sql(kafka_source_ddl)
t_env.execute_sql(mysql_sink_ddl)
t_env.from_path("cdn_access_log") \
.select("uuid, "
"client_ip as province, "
"response_size, request_time")\
.group_by("province")\
.select(
"province, count(uuid) as access_count, "
"sum(response_size) as total_download, "
"sum(response_size) * 1.0 / sum(request_time) as download_speed") \
.execute_insert("cdn_access_statistic")
t_env.execute("cdn_access_log")
if __name__=='__main__':
start()
i don't know how to solve,maybe use old flink version? pls help me,thanks
The error shows that it can't find suitable table sink
pyflink.util.exceptions.TableException: org.apache.flink.table.api.TableException: findAndCreateTableSink failed.
Two ideas for reference
Check if flink-connector-jdbc.jar is loaded,I see you just loaded mysql-connector-java-8.0.27.jar
Check jdbc connector option, don't use connector.xxx, may be you can reference to https://nightlies.apache.org/flink/flink-docs-release-1.14/docs/connectors/table/jdbc/

pyflink configuration error SQL parse fail

I want to be sure that flink works in terms of setting and then try to complicate the usage. In the most simple example I tryed to do this stuff.
The input contains
column_a,column_b
1,2
The output exists.
In order to download the pyflink version of 1.10 using docker for my app I use the following snippet of code :
jobmanager:
image: pyflink/playgrounds:1.10.0
volumes:
- ./examples:/opt/examples
hostname: "jobmanager"
expose:
- "6123"
ports:
- "8088:8088"
command: jobmanager
environment:
- |
FLINK_PROPERTIES=
jobmanager.rpc.address: jobmanager
taskmanager:
image: pyflink/playgrounds:1.10.0
volumes:
- ./examples:/opt/examples
expose:
- "6121"
- "6122"
depends_on:
- jobmanager
command: taskmanager
links:
- jobmanager:jobmanager
environment:
- |
FLINK_PROPERTIES=
jobmanager.rpc.address: jobmanager
taskmanager.numberOfTaskSlots: 2
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import StreamingFileSink
from pyflink.table import EnvironmentSettings, StreamTableEnvironment, BatchTableEnvironment
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem
from pyflink.table.expressions import lit
import pandas as pd
from inspect import getmembers, isfunction
import os
# create a blink batch TableEnvironment
env_settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
table_env = BatchTableEnvironment.create(environment_settings=env_settings)
source_ddl = """
CREATE TABLE MyUserTable (
column_a INT PRIMARY KEY,
column_b INT,
) WITH (
'connector' = 'filesystem',
'path' = 'file:///Users//code/examples/input.csv ',
'format' = 'csv'
)"""
#connector for data output/sink
sink_ddl = """
CREATE TABLE table2 (
score INT)
WITH (
'connector' = 'filesystem',
'path' = 'file:///Users//code/examples/output.csv',
'format' = 'csv'
)"""
#make the table corresponding to the schema mentioned
source_table = table_env.execute_sql(source_ddl)
sink_table = table_env.execute_sql(sink_ddl)
#convert the sql table to table API
table_path = table_env.from_path("MyUserTable")
# execute SELECT statement
table_result2 = table_env.execute_sql("SELECT * FROM MyUserTable")
table_result2.print()
The error generated is the following
File ".\flink1.py", line 41, in <module>
source_table = table_env.execute_sql(source_ddl)
File "C:\Users\landr\AppData\Local\Programs\Python\Python38\lib\site-packages\pyflink\table\table_environment.py", line 804, in execute_sql
return TableResult(self._j_tenv.executeSql(stmt))
File "C:\Users\landr\AppData\Local\Programs\Python\Python38\lib\site-packages\py4j\java_gateway.py", line 1285, in __call__
return_value = get_return_value(
File "C:\Users\landr\AppData\Local\Programs\Python\Python38\lib\site-packages\pyflink\util\exceptions.py",
line 147, in deco
return f(*a, **kw)
File "C:\Users\landr\AppData\Local\Programs\Python\Python38\lib\site-packages\py4j\protocol.py", line 326,
in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o10.executeSql.
: org.apache.flink.table.api.SqlParserException: SQL parse failed. Encountered ")" at line 6, column 25.
Was expecting one of:
"CONSTRAINT" ...
"PRIMARY" ...
"UNIQUE" ...
"WATERMARK" ...
<BRACKET_QUOTED_IDENTIFIER> ...
<QUOTED_IDENTIFIER> ...
<BACK_QUOTED_IDENTIFIER> ...
<HYPHENATED_IDENTIFIER> ...
<IDENTIFIER> ...
<UNICODE_QUOTED_IDENTIFIER> ...
at org.apache.flink.table.planner.parse.CalciteParser.parse(CalciteParser.java:56)
at org.apache.flink.table.planner.delegation.ParserImpl.parse(ParserImpl.java:96)
at org.apache.flink.table.api.internal.TableEnvironmentImpl.executeSql(TableEnvironmentImpl.java:722) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at org.apache.flink.api.python.shaded.py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at org.apache.flink.api.python.shaded.py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at org.apache.flink.api.python.shaded.py4j.Gateway.invoke(Gateway.java:282)
at org.apache.flink.api.python.shaded.py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at org.apache.flink.api.python.shaded.py4j.commands.CallCommand.execute(CallCommand.java:79)
at org.apache.flink.api.python.shaded.py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.calcite.sql.parser.SqlParseException: Encountered ")" at line 6, column 25.
Was expecting one of:
"CONSTRAINT" ...
"PRIMARY" ...
"UNIQUE" ...
"WATERMARK" ...
<BRACKET_QUOTED_IDENTIFIER> ...
<QUOTED_IDENTIFIER> ...
<BACK_QUOTED_IDENTIFIER> ...
<HYPHENATED_IDENTIFIER> ...
<IDENTIFIER> ...
<UNICODE_QUOTED_IDENTIFIER> ...
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.convertException(FlinkSqlParserImpl.java:450)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.normalizeException(FlinkSqlParserImpl.java:213)
at org.apache.calcite.sql.parser.SqlParser.handleException(SqlParser.java:140)
at org.apache.calcite.sql.parser.SqlParser.parseQuery(SqlParser.java:155)
at org.apache.calcite.sql.parser.SqlParser.parseStmt(SqlParser.java:180)
at org.apache.flink.table.planner.parse.CalciteParser.parse(CalciteParser.java:54)
... 13 more
Caused by: org.apache.flink.sql.parser.impl.ParseException: Encountered ")" at line 6, column 25.
Was expecting one of:
"CONSTRAINT" ...
"PRIMARY" ...
"UNIQUE" ...
"WATERMARK" ...
<BRACKET_QUOTED_IDENTIFIER> ...
<QUOTED_IDENTIFIER> ...
<BACK_QUOTED_IDENTIFIER> ...
<HYPHENATED_IDENTIFIER> ...
<IDENTIFIER> ...
<UNICODE_QUOTED_IDENTIFIER> ...
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.generateParseException(FlinkSqlParserImpl.java:39782)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.jj_consume_token(FlinkSqlParserImpl.java:39593)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.TableColumn(FlinkSqlParserImpl.java:4835)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.SqlCreateTable(FlinkSqlParserImpl.java:5209)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.SqlCreateExtended(FlinkSqlParserImpl.java:6233)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.SqlCreate(FlinkSqlParserImpl.java:20934)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.SqlStmt(FlinkSqlParserImpl.java:3415)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.SqlStmtEof(FlinkSqlParserImpl.java:3918)
at org.apache.flink.sql.parser.impl.FlinkSqlParserImpl.parseSqlStmtEof(FlinkSqlParserImpl.java:261)
at org.apache.calcite.sql.parser.SqlParser.parseQuery(SqlParser.java:153)
... 15 more
Remove the comma behind "column_b INT,".
This should work.

pyflink winerror 2 get_gateway()

Following a script on stackOverflow link as described by the link with the only change of the data in the sense I would like to evaluate the simple usage of pyflink in order to analyze if the installation of it works.You can see below the system and in order to install it I use pip -m install apache-flink into a virtual environment with python3.8.4. In order to install flink itself I used docker and I import the images of pyflink/playgrounds:1.10.0. Below is there the only the part taken from link that I modify
source_ddl = """
CREATE TABLE MyUserTable (
column_a INT,
column_b INT,
) WITH (
'connector' = 'filesystem',
'path' = 'file:///Users//code/examples/input.csv ',
'format' = 'csv'
)"""
#connector for data output/sink
sink_ddl = """
CREATE TABLE results (
score INT)
WITH (
'connector' = 'filesystem',
'path' = 'file:///Users//code/examples/output.csv',
'format' = 'csv'
)"""
#make the table corresponding to the schema mentioned
source_table = table_env.execute_sql(source_ddl)
sink_table = table_env.execute_sql(sink_ddl)
#convert the sql table to table API
table_path = table_env.from_path("MyUserTable")
# execute SELECT statement
table_result2 = table_env.execute_sql("SELECT column_a FROM MyUserTable")
table_result2.print()
The error is the following:
Traceback (most recent call last):
File ".\flink1.py", line 15, in <module>
env_settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
File "C:\Users\landr\AppData\Local\Programs\Python\Python38\lib\site-packages\pyflink\table\environment_settings.py", line 214, in new_instance
return EnvironmentSettings.Builder()
File "C:\Users\landr\AppData\Local\Programs\Python\Python38\lib\site-packages\pyflink\table\environment_settings.py", line 48, in __init__
gateway = get_gateway()
File "C:\Users\landr\AppData\Local\Programs\Python\Python38\lib\site-packages\pyflink\java_gateway.py", line 62, in get_gateway
_gateway = launch_gateway()
File "C:\Users\landr\AppData\Local\Programs\Python\Python38\lib\site-packages\pyflink\java_gateway.py", line 106, in launch_gateway
p = launch_gateway_server_process(env, args)
File "C:\Users\landr\AppData\Local\Programs\Python\Python38\lib\site-packages\pyflink\pyflink_gateway_server.py", line 221, in launch_gateway_server_process
return Popen(command, stdin=PIPE, preexec_fn=preexec_fn, env=env)
File "C:\Users\landr\AppData\Local\Programs\Python\Python38\lib\subprocess.py", line 854, in __init__
self._execute_child(args, executable, preexec_fn, close_fds,
File "C:\Users\landr\AppData\Local\Programs\Python\Python38\lib\subprocess.py", line 1307, in _execute_child
hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
FileNotFoundError: [WinError 2] Impossibile trovare il file specificato
I encountered this error today. I put a print statement into pyflink_gateway_server.py to show the command being passed. I found that the call was to a version of Java that no longer existed on my computer (I’d upgraded it).
The fix was to ensure that JAVA_HOME pointed to a valid Java installation.

Resources