I have a scenario where I define a Kafka source, a UDF/UDTF for processing, and a Kafka sink. No matter what I do, when I run the job the output is flooded with the processed output of a single input record. For illustration, this is what is written to the defined Kafka sink topic:
The output entries carry distinct timestamps, showing that the UDF is entered once for each input record, yet the same input record is processed every time:
While trying to figure out the problem, I read through whatever Flink documentation I could find (and a rabbit hole of links) on enforcing exactly-once processing semantics for records. As far as I can gather, it comes down to the following settings:
This guy presented the best visual representation for me to understand semantic_once_video
Kafka source (consumer)
kafka source property of isolation level = read_committed
Kafka sinks (producer)
Kafka sink property of processing mode = exactly_once
Kafka sink property of idempotence = true
Utilizing checkpointing
Also referencing stackoverflow questions I could find on the topic (mainly discussing in terms of Java implementations)... needless to say, still not resolved. Here's my code for reference:
import os
from pyflink.datastream.stream_execution_environment import StreamExecutionEnvironment
from pyflink.table import TableEnvironment, EnvironmentSettings, DataTypes, StreamTableEnvironment
from pyflink.table.udf import ScalarFunction, TableFunction, udf, udtf
from pyflink.datastream.checkpointing_mode import CheckpointingMode
KAFKA_SERVERS = os.getenv('KAFKA_BS_SERVERS',"localhost:9094").split(',')
KAFKA_USERNAME = "xxx"
KAFKA_PASSWORD = "_pass_"
KAFKA_SOURCE_TOPIC = 'source_topic'
KAFKA_SINK_TOPIC = 'sink_topic'
KAFKA_GROUP_ID = 'testgroup12'
JAR_DEPENDENCIES = os.getenv('JAR_DEPENDENCIES', '/opt/flink/lib_py')
class tbl_function(TableFunction):
    """Table function that wraps a JSON input string with the current UTC time."""

    def open(self, function_context):
        # No per-task initialisation is needed.
        pass

    def eval(self, *args):
        """Yield one JSON string containing the time of the call and the parsed first argument."""
        import json
        from datetime import datetime

        stamp = str(datetime.utcnow())
        payload = json.loads(args[0])
        yield json.dumps({'time': stamp, 'input': payload})
def pipeline():
    """Build and run the Kafka -> UDTF -> Kafka table pipeline.

    Registers every jar found in JAR_DEPENDENCIES, enables exactly-once
    checkpointing, declares the Kafka source and sink tables, and routes each
    source row through ``table_f`` into ``sink_table``.
    """
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)

    # Register the connector jars. endswith() skips names that merely contain
    # ".jar" somewhere in the middle (e.g. "x.jar.bak").
    for jar_name in os.listdir(JAR_DEPENDENCIES):
        if jar_name.endswith('.jar'):
            env.add_jars(f"file://{JAR_DEPENDENCIES}/{jar_name}")
            print(f"added jar dep: {JAR_DEPENDENCIES}/{jar_name}")

    env.enable_checkpointing(60000, CheckpointingMode.EXACTLY_ONCE)
    checkpoint_config = env.get_checkpoint_config()
    checkpoint_config.set_min_pause_between_checkpoints(120000)
    checkpoint_config.enable_unaligned_checkpoints()
    # NOTE: this call replaces the 60000 ms interval passed to enable_checkpointing().
    checkpoint_config.set_checkpoint_interval(30000)

    settings = (
        EnvironmentSettings.new_instance()
        .in_streaming_mode()
        .use_blink_planner()
        .build()
    )
    t_env = StreamTableEnvironment.create(stream_execution_environment=env, environment_settings=settings)

    # 'properties.*' options are handed to the Kafka client with the prefix
    # stripped, so the consumer key has to be spelled 'isolation.level'.
    source_ddl = f"""
        CREATE TABLE source_table(
            entry STRING
        ) WITH (
            'connector' = 'kafka',
            'topic' = '{KAFKA_SOURCE_TOPIC}',
            'properties.bootstrap.servers' = '{','.join(KAFKA_SERVERS)}',
            'properties.isolation.level' = 'read_committed',
            'properties.group.id' = '{KAFKA_GROUP_ID}',
            'properties.sasl.mechanism' = 'PLAIN',
            'properties.security.protocol' = 'SASL_PLAINTEXT',
            'properties.sasl.jaas.config' = 'org.apache.kafka.common.security.plain.PlainLoginModule required username=\"{KAFKA_USERNAME}\" password=\"{KAFKA_PASSWORD}\";',
            'scan.startup.mode' = 'earliest-offset',
            'format' = 'raw'
        )
    """

    # NOTE(review): 'processing.mode' does not look like a Kafka producer
    # setting; the connector's own option for transactional writes appears to
    # be 'sink.delivery-guarantee' (with 'sink.transactional-id-prefix').
    # Confirm against the connector version in use before relying on it.
    sink_ddl = f"""
        CREATE TABLE sink_table(
            entry STRING
        ) WITH (
            'connector' = 'kafka',
            'topic' = '{KAFKA_SINK_TOPIC}',
            'properties.bootstrap.servers' = '{','.join(KAFKA_SERVERS)}',
            'properties.group.id' = '{KAFKA_GROUP_ID}',
            'properties.processing.mode' = 'exactly_once',
            'properties.enable.idempotence' = 'true',
            'properties.sasl.mechanism' = 'PLAIN',
            'properties.security.protocol' = 'SASL_PLAINTEXT',
            'properties.sasl.jaas.config' = 'org.apache.kafka.common.security.plain.PlainLoginModule required username=\"{KAFKA_USERNAME}\" password=\"{KAFKA_PASSWORD}\";',
            'format' = 'raw'
        )
    """

    t_env.execute_sql(source_ddl).wait()
    t_env.execute_sql(sink_ddl).wait()

    # Expose the table function to the Table API under the name "table_f".
    f = tbl_function()
    table_udf = udtf(f, result_types=[DataTypes.STRING()])
    t_env.create_temporary_function("table_f", table_udf)

    # Apply the function to every source row and keep only its output column,
    # renamed to match the sink schema.
    table = t_env.from_path('source_table')
    table = table.join_lateral('table_f(entry) as (content)')
    table = table.select('content').alias('entry')
    table.insert_into('sink_table')

    from datetime import datetime
    t_env.execute(f"dummy_test_{str(datetime.now())}")


if __name__ == '__main__':
    pipeline()
jar dependencies:
added jar dep: /opt/flink/lib_py/flink-sql-connector-kafka_2.12-1.14.2.jar
added jar dep: /opt/flink/lib_py/flink-connector-kafka_2.12-1.14.2.jar
added jar dep: /opt/flink/lib_py/kafka-clients-2.4.1.jar
After a whole bunch of trial-error, and still not precisely knowing why this resolved the issue (or underlying pyflink issue?), I found that if you utilize a table meta-data field in your source definition, that would somehow initialize or synchronize your pipeline to produce a semantic.EXACTLY_ONCE data flow (1 record in = 1 record out, no duplicates).
The only change I had to make was one line of metadata in the source DDL definition. (Again, here is my full script for reference):
import os
from pyflink.datastream.stream_execution_environment import StreamExecutionEnvironment
from pyflink.table import TableEnvironment, EnvironmentSettings, DataTypes, StreamTableEnvironment
from pyflink.table.udf import ScalarFunction, TableFunction, udf, udtf
from pyflink.datastream.checkpointing_mode import CheckpointingMode
KAFKA_SERVERS = os.getenv('KAFKA_BS_SERVERS',"localhost:9094").split(',')
KAFKA_USERNAME = "xxx"
KAFKA_PASSWORD = "_pass_"
KAFKA_SOURCE_TOPIC = 'source_topic'
KAFKA_SINK_TOPIC = 'sink_topic'
KAFKA_GROUP_ID = 'testgroup12'
JAR_DEPENDENCIES = os.getenv('JAR_DEPENDENCIES', '/opt/flink/lib_py')
class tbl_function(TableFunction):
    """Table function that wraps a JSON input string with the current UTC time."""

    def open(self, function_context):
        # No per-task initialisation is needed.
        pass

    def eval(self, *args):
        """Yield one JSON string containing the time of the call and the parsed first argument."""
        import json
        from datetime import datetime

        stamp = str(datetime.utcnow())
        payload = json.loads(args[0])
        yield json.dumps({'time': stamp, 'input': payload})
def pipeline():
    """Build and run the Kafka -> UDTF -> Kafka table pipeline.

    Registers every jar found in JAR_DEPENDENCIES, enables exactly-once
    checkpointing, declares the Kafka source (including the record timestamp
    as a metadata column) and sink tables, and routes each source row through
    ``table_f`` into ``sink_table``.
    """
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)

    # Register the connector jars. endswith() skips names that merely contain
    # ".jar" somewhere in the middle (e.g. "x.jar.bak").
    for jar_name in os.listdir(JAR_DEPENDENCIES):
        if jar_name.endswith('.jar'):
            env.add_jars(f"file://{JAR_DEPENDENCIES}/{jar_name}")
            print(f"added jar dep: {JAR_DEPENDENCIES}/{jar_name}")

    env.enable_checkpointing(60000, CheckpointingMode.EXACTLY_ONCE)
    checkpoint_config = env.get_checkpoint_config()
    checkpoint_config.set_min_pause_between_checkpoints(120000)
    checkpoint_config.enable_unaligned_checkpoints()
    # NOTE: this call replaces the 60000 ms interval passed to enable_checkpointing().
    checkpoint_config.set_checkpoint_interval(30000)

    settings = (
        EnvironmentSettings.new_instance()
        .in_streaming_mode()
        .use_blink_planner()
        .build()
    )
    t_env = StreamTableEnvironment.create(stream_execution_environment=env, environment_settings=settings)

    # The 'event_time' metadata column is the one-line change this script was
    # posted to demonstrate.
    # 'properties.*' options are handed to the Kafka client with the prefix
    # stripped, so the consumer key has to be spelled 'isolation.level'.
    source_ddl = f"""
        CREATE TABLE source_table(
            entry STRING,
            event_time TIMESTAMP(3) METADATA FROM 'timestamp'
        ) WITH (
            'connector' = 'kafka',
            'topic' = '{KAFKA_SOURCE_TOPIC}',
            'properties.bootstrap.servers' = '{','.join(KAFKA_SERVERS)}',
            'properties.isolation.level' = 'read_committed',
            'properties.group.id' = '{KAFKA_GROUP_ID}',
            'properties.sasl.mechanism' = 'PLAIN',
            'properties.security.protocol' = 'SASL_PLAINTEXT',
            'properties.sasl.jaas.config' = 'org.apache.kafka.common.security.plain.PlainLoginModule required username=\"{KAFKA_USERNAME}\" password=\"{KAFKA_PASSWORD}\";',
            'scan.startup.mode' = 'earliest-offset',
            'format' = 'raw'
        )
    """

    # NOTE(review): 'processing.mode' does not look like a Kafka producer
    # setting; the connector's own option for transactional writes appears to
    # be 'sink.delivery-guarantee' (with 'sink.transactional-id-prefix').
    # Confirm against the connector version in use before relying on it.
    sink_ddl = f"""
        CREATE TABLE sink_table(
            entry STRING
        ) WITH (
            'connector' = 'kafka',
            'topic' = '{KAFKA_SINK_TOPIC}',
            'properties.bootstrap.servers' = '{','.join(KAFKA_SERVERS)}',
            'properties.group.id' = '{KAFKA_GROUP_ID}',
            'properties.processing.mode' = 'exactly_once',
            'properties.enable.idempotence' = 'true',
            'properties.sasl.mechanism' = 'PLAIN',
            'properties.security.protocol' = 'SASL_PLAINTEXT',
            'properties.sasl.jaas.config' = 'org.apache.kafka.common.security.plain.PlainLoginModule required username=\"{KAFKA_USERNAME}\" password=\"{KAFKA_PASSWORD}\";',
            'format' = 'raw'
        )
    """

    t_env.execute_sql(source_ddl).wait()
    t_env.execute_sql(sink_ddl).wait()

    # Expose the table function to the Table API under the name "table_f".
    f = tbl_function()
    table_udf = udtf(f, result_types=[DataTypes.STRING()])
    t_env.create_temporary_function("table_f", table_udf)

    # Apply the function to every source row and keep only its output column,
    # renamed to match the sink schema.
    table = t_env.from_path('source_table')
    table = table.join_lateral('table_f(entry) as (content)')
    table = table.select('content').alias('entry')
    table.insert_into('sink_table')

    from datetime import datetime
    t_env.execute(f"dummy_test_{str(datetime.now())}")


if __name__ == '__main__':
    pipeline()
Hope this saves someone time, unlike the 3 days I spent in trial-error #Sigh :(
Related
I have the following Flink streaming application running locally, written with the SQL API:
/** Streams JSON records from a Kafka topic into CSV files on the local file system via the SQL API. */
object StreamingKafkaJsonsToCsvLocalFs {
  val brokers = "localhost:9092"
  val topic = "test-topic"
  val consumerGroupId = "test-consumer"
  val kafkaTableName = "KafKaTable"
  val targetTable = "TargetCsv"
  // Output directory under the current working directory.
  val targetPath = f"file://${new java.io.File(".").getCanonicalPath}/kafka-to-fs-csv"

  /** DDL for the Kafka-backed source table; exposes the record offset as a metadata column. */
  def generateKafkaTableDDL(): String = {
    s"""
       |CREATE TABLE $kafkaTableName (
       |  `kafka_offset` BIGINT METADATA FROM 'offset',
       |  `seller_id` STRING
       |) WITH (
       |  'connector' = 'kafka',
       |  'topic' = '$topic',
       |  'properties.bootstrap.servers' = '$brokers',
       |  'properties.group.id' = '$consumerGroupId',
       |  'scan.startup.mode' = 'earliest-offset',
       |  'format' = 'json'
       |)
       |""".stripMargin
  }

  /** DDL for the CSV filesystem sink table written under `targetPath`. */
  def generateTargetTableDDL(): String = {
    s"""
       |CREATE TABLE $targetTable (
       |  `kafka_offset` BIGINT,
       |  `seller_id` STRING
       |  )
       |WITH (
       |  'connector' = 'filesystem',
       |  'path' = '$targetPath',
       |  'format' = 'csv',
       |  'sink.rolling-policy.rollover-interval' = '10 seconds',
       |  'sink.rolling-policy.check-interval' = '1 seconds'
       |)
       |""".stripMargin
  }

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI()
    // Checkpoint every 1000 ms in exactly-once mode, storing checkpoints under the target path.
    env.enableCheckpointing(1000)
    env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
    env.getCheckpointConfig.setCheckpointStorage(s"$targetPath/checkpoints")

    val settings = EnvironmentSettings.newInstance()
      .inStreamingMode()
      .build()
    val tblEnv = StreamTableEnvironment.create(env, settings)

    tblEnv.executeSql(generateKafkaTableDDL())
    tblEnv.executeSql(generateTargetTableDDL())

    // executeInsert() submits the job and await() waits on it.
    // executeSql() only accepts SQL statements, so a job name cannot be passed
    // to it; the call that did so was removed.
    tblEnv.from(kafkaTableName).executeInsert(targetTable).await()
  }
}
As you can see, the checkpointing is enabled and when I execute this application I see that the checkpoint folder is created and populated.
The problem I am facing is this: when I stop and start my application (from the IDE), I expect it to resume from the point where it stopped in the previous execution. Instead, I see that it consumes all the offsets from the earliest offset in the topic (the newly generated output files contain offset zero, although the previous run already processed those offsets).
What am I missing about checkpointing in Flink? I would expect it to be exactly once.
Flink only restarts from a checkpoint when recovering from a failure, or when explicitly restarted from a retained checkpoint via the command line or REST API. Otherwise, the KafkaSource starts from the offsets configured in the code, which defaults to the earliest offsets.
If you have no other state, you could instead rely on the committed offsets as the source of truth, and configure the Kafka connector to use the committed offsets as the starting position.
Flink's fault tolerance via checkpointing isn't designed to support mini-cluster deployments like the one used when running in an IDE. Normally the job manager and task managers are running in separate processes, and the job manager can detect that a task manager has failed, and can arrange for a restart.
I am converting some legacy Java code written for Flink version 1.5 to Flink version 1.13.1. Specifically, I'm working with Table API. I have to read data from CSV file, perform some basic SQL and then write results back to a file.
For Flink version 1.5, I used the following code to perform above actions
// Flink 1.5 version: batch execution environment plus the table environment obtained from it.
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
BatchTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);
// CSV-backed table source; each field() call declares one string column
// (the remaining field declarations are elided in the post).
TableSource tableSrc = CsvTableSource.builder()
.path("<CSV_PATH>")
.fieldDelimiter(",")
.field("date", Types.STRING)
.field("month", Types.STRING)
...
.build();
// Register the source as table "CatalogTable".
tableEnv.registerTableSource("CatalogTable", tableSrc);
String sql = "...";
Table result = tableEnv.sqlQuery(sql);
// Convert the query result to a DataSet of Row1 and write it out as text.
DataSet<Row1> resultSet = tableEnv.toDataSet(result, Row1.class);
resultSet.writeAsText("<OUT_PATH>");
// Trigger execution of the job.
env.execute("Flink Table-Sql Example");
In order to convert above code to Flink version 1.13.1, I wrote the following code
import org.apache.flink.table.api.Table;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.BatchTableEnvironment;
// Flink 1.13 attempt: a batch-mode TableEnvironment created from settings, alongside a
// separately obtained DataSet ExecutionEnvironment.
EnvironmentSettings settings = EnvironmentSettings
.newInstance()
.inBatchMode()
.build();
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
TableEnvironment tableEnv = TableEnvironment.create(settings);
// The CSV input is declared through DDL with the filesystem connector
// (the remaining columns are elided in the post).
final String tableDDL = "CREATE TEMPORARY TABLE CatalogTable (" +
"date STRING, " +
"month STRING, " +
"..." +
") WITH (" +
"'connector' = 'filesystem', " +
"'path' = 'file:///CSV_PATH', " +
"'format' = 'csv'" +
")";
tableEnv.executeSql(tableDDL);
String sql = "...";
Table result = tableEnv.sqlQuery(sql);
// DEPRECATED - BatchTableEnvironment required to convert Table to Dataset
// NOTE(review): bTableEnv is a different environment from the tableEnv that produced
// `result`; confirm that a Table can be converted across environments.
BatchTableEnvironment bTableEnv = BatchTableEnvironment.create(env);
DataSet<Row1> resultSet = bTableEnv.toDataSet(result, Row1.class);
resultSet.writeAsText("<OUT_PATH>");
// Trigger execution of the DataSet job.
env.execute("Flink Table-Sql Example");
However, BatchTableEnvironment is marked as "Deprecated" in Flink version 1.13. Is there any alternative to convert Table to Dataset or to directly write a Table to a file?
I am using an AWS Glue job to write a data pipeline. I took the code from the community; it is as follows:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from py4j.java_gateway import java_import
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"
#args = getResolvedOptions(sys.argv, ['JOB_NAME'])
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'URL', 'ACCOUNT', 'WAREHOUSE', 'DB', 'SCHEMA', 'USERNAME', 'PASSWORD', 'ROLE'])
sparkContext = SparkContext()
glueContext = GlueContext(sparkContext)
sparkSession = glueContext.spark_session
glueJob = Job(glueContext)
glueJob.init(args['JOB_NAME'], args)
## Import the Snowflake Spark connector classes into the JVM view and enable
## query pushdown for the Spark session.
java_import(sparkSession._jvm, SNOWFLAKE_SOURCE_NAME)
sparkSession._jvm.net.snowflake.spark.snowflake.SnowflakeConnectorUtils.enablePushdownSession(sparkSession._jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate())
# NOTE(review): 'TempDir' is not in the option list passed to getResolvedOptions
# above; confirm the key is present when the job is started without --TempDir.
tmp_dir=args["TempDir"]
# Connection options handed to the Snowflake connector on write.
sfOptions = {
    "sfURL" : args['URL'],
    "sfAccount" : args['ACCOUNT'],
    "sfUser" : args['USERNAME'],
    "sfPassword" : args['PASSWORD'],
    "sfDatabase" : args['DB'],
    "sfSchema" : args['SCHEMA'],
    "sfRole" : args['ROLE'],
    "sfWarehouse" : args['WAREHOUSE'],
    "preactions" : "USE DATABASE dev_lz;",
}
#"tempDir" : tmp_dir,
# Never write the password to the job log: print a copy with the secret masked.
loggable_options = {key: ('****' if key == 'sfPassword' else value) for key, value in sfOptions.items()}
print('=========DB Connection details ================== ', loggable_options)
# Read the catalog table, then map, project and resolve column types.
# `mappings` and `columns` are placeholders: the poster removed the real lists.
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "aws-nonprod-datalake-glue-catalog", table_name = "nm_s_amaster", transformation_ctx = "datasource0")
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [ mappings], transformation_ctx = "applymapping1")
selectfields2 = SelectFields.apply(frame = applymapping1, paths = [columns], transformation_ctx = "selectfields2")
resolvechoice3 = ResolveChoice.apply(frame = selectfields2, choice = "MATCH_CATALOG", database = "aws-nonprod-datalake-glue-catalog", table_name = "NM_TEMP", transformation_ctx = "resolvechoice3")
resolvechoice4 = ResolveChoice.apply(frame = resolvechoice3, choice = "make_cols", transformation_ctx = "resolvechoice4")
## Convert the DynamicFrame to a Spark DataFrame and overwrite the Snowflake table with it.
resolvechoice4.toDF().write.format(SNOWFLAKE_SOURCE_NAME).options(**sfOptions).option("preactions","USE DATABASE dev_lz").option("dbtable", "nm_temp").mode("overwrite").save()
glueJob.commit()
But after running the code I get the following error:
net.snowflake.client.jdbc.SnowflakeSQLException: SQL compilation error: Table 'NM_TEMP_STAGING_1100952600' does not exist
please let me know if I am missing anything.
I have permission for create, select stage, create, select table and create future tables.
In the code above I have removed the columns and mappings, but they are present in the original code.
resolvechoice4.toDF().write.format(SNOWFLAKE_SOURCE_NAME).options(**sfOptions).option("preactions","USE DATABASE dev_lz").option("dbtable", "nm_temp").mode("overwrite").save()
After adding the following option ahead of the dbtable option in the line above, it started working:
.option("preactions","USE ROLE DEVELOPER;USE DATABASE dev_db;USE SCHEMA aws_test")
as follows:
resolvechoice4.toDF().write.format(SNOWFLAKE_SOURCE_NAME).options(**sfOptions).option("preactions","USE DATABASE dev_lz").option("preactions","USE ROLE DEVELOPER;USE DATABASE dev_db;USE SCHEMA aws_test").option("dbtable", "nm_temp").mode("overwrite").save()
I'm really new to Flume. I prefer Flume to Sqoop because, in my case, data is continuously being imported into MS SQL Server, so I think Flume is the better choice since it can transfer data in real time.
I followed some online examples and then edited my own Flume config file, which describes the source, channel, and sink. However, Flume did not seem to work: no data was transferred to HBase.
mssql-hbase.conf
# source, channel, sink
agent1.sources = src1
agent1.channels = ch1
agent1.sinks = sk1
# declare source type
agent1.sources.src1.type = org.keedio.flume.source.SQLSource
agent1.sources.src1.hibernate.connection.url = jdbc:sqlserver://xx.xx.xx.xx:1433;DatabaseName=xxxx
agent1.sources.src1.hibernate.connection.user = xxxx
agent1.sources.src1.hibernate.connection.password = xxxx
agent1.sources.src1.table = xxxx
agent1.sources.src1.hibernate.connection.autocommit = true
# declare the SQL Server hibernate dialect and JDBC driver
agent1.sources.src1.hibernate.dialect = org.hibernate.dialect.SQLServerDialect
agent1.sources.src1.hibernate.connection.driver_class = com.microsoft.sqlserver.jdbc.SQLServerDriver
#agent1.sources.src1.hibernate.provider_class=org.hibernate.connection.C3P0ConnectionProvider
#agent1.sources.src1.columns.to.select = *
#agent1.sources.src1.incremental.column.name = PK, name, machine, time
#agent1.sources.src1.start.from=0
#agent1.sources.src1.incremental.value = 0
# query time interval
agent1.sources.src1.run.query.delay = 5000
# declare the folder location where flume state is saved
agent1.sources.src1.status.file.path = /home/user/flume-source-state
agent1.sources.src1.status.file.name = src1.status
agent1.sources.src1.batch.size = 1000
agent1.sources.src1.max.rows = 1000
# field delimiter of the emitted entries; the sink's serializer regex has to use the same character
agent1.sources.src1.delimiter.entry = |
# set the channel to memory mode
agent1.channels.ch1.type = memory
agent1.channels.ch1.capacity = 10000
agent1.channels.ch1.transactionCapacity = 10000
agent1.channels.ch1.byteCapacityBufferPercentage = 20
agent1.channels.ch1.byteCapacity = 800000
# declare sink type
agent1.sinks.sk1.type = org.apache.flume.sink.hbase.HBaseSink
agent1.sinks.sk1.table = yyyy
agent1.sinks.sk1.columnFamily = yyyy
# HBaseSink reads "batchSize"; a key with the "hdfs." prefix is not picked up by this sink
agent1.sinks.sk1.batchSize = 100
agent1.sinks.sk1.serializer = org.apache.flume.sink.hbase.RegexHbaseEventSerializer
# One capture group per entry in colNames: the serializer writes nothing for an
# event when the group count and the column count differ.
# The separator between fields is "|" to match agent1.sources.src1.delimiter.entry.
agent1.sinks.sk1.serializer.regex = ^\"(.*?)\"[|]\"(.*?)\"[|]\"(.*?)\"[|]\"(.*?)\"$
# no spaces after the commas: the names are split on "," without trimming
agent1.sinks.sk1.serializer.colNames = PK,name,machine,time
# bind source, channel, sink
agent1.sources.src1.channels = ch1
agent1.sinks.sk1.channel = ch1
However, I used a similar config file to transfer data from MySQL to HBase, and that one worked.
mysql-hbase.conf
# source, channel, sink
agent1.sources = src1
agent1.channels = ch1
agent1.sinks = sk1
# declare source type
agent1.sources.src1.type = org.keedio.flume.source.SQLSource
agent1.sources.src1.hibernate.connection.url = jdbc:mysql://xxxx:3306/userdb
agent1.sources.src1.hibernate.connection.user = xxxx
agent1.sources.src1.hibernate.connection.password = xxxx
agent1.sources.src1.table = xxxx
agent1.sources.src1.hibernate.connection.autocommit = true
# declare the MySQL hibernate dialect and JDBC driver
agent1.sources.src1.hibernate.dialect = org.hibernate.dialect.MySQL5Dialect
agent1.sources.src1.hibernate.connection.driver_class = com.mysql.jdbc.Driver
#agent1.sources.src1.hibernate.provider_class=org.hibernate.connection.C3P0ConnectionProvider
#agent1.sources.src1.columns.to.select = *
#agent1.sources.src1.incremental.column.name = id
#agent1.sources.src1.incremental.value = 0
# query time interval
agent1.sources.src1.run.query.delay = 5000
# declare the folder location where flume state is saved
agent1.sources.src1.status.file.path = /home/user/flume-source-state
agent1.sources.src1.status.file.name = src1.status
#agent1.sources.src1.interceptors=i1
#agent1.sources.src1.interceptors.i1.type=search_replace
#agent1.sources.src1.interceptors.i1.searchPattern="
#agent1.sources.src1.interceptors.i1.replaceString=,
# Set the channel to memory mode
agent1.channels.ch1.type = memory
agent1.channels.ch1.capacity = 10000
agent1.channels.ch1.transactionCapacity = 10000
agent1.channels.ch1.byteCapacityBufferPercentage = 20
agent1.channels.ch1.byteCapacity = 800000
# declare sink type
agent1.sinks.sk1.type = org.apache.flume.sink.hbase.HBaseSink
agent1.sinks.sk1.table = user_test_2
agent1.sinks.sk1.columnFamily = user_hobby
# HBaseSink reads "batchSize"; a key with the "hdfs." prefix is not picked up by this sink
agent1.sinks.sk1.batchSize = 100
agent1.sinks.sk1.serializer = org.apache.flume.sink.hbase.RegexHbaseEventSerializer
# one capture group per entry in colNames
agent1.sinks.sk1.serializer.regex = ^\"(.*?)\",\"(.*?)\",\"(.*?)\",\"(.*?)\"$
agent1.sinks.sk1.serializer.colNames = id,name,age,hobby
# bind source, channel, sink
agent1.sources.src1.channels = ch1
agent1.sinks.sk1.channel = ch1
Does anyone know whether there is something wrong in the config file? Thanks.
I want to add some extra information to an existing abc file or, if possible, include that extra information while creating the Alembic cache in Maya or any other CG application, using Python.
I would appreciate it if anyone could help me edit the Alembic file.
input example
meta_data = {'name': 'Hero', 'tag': 'test_show'}
abc_file = '/show/test_show/scene/hero.abc'
set meta data ?
from alembic import Abc
get meta data
from alembic import Abc
# Open the archive for reading and fetch the metadata attached to its top object.
archive = Abc.IArchive(abc_file)
top = archive.getTop()
meta_data = top.getMetaData()
# Print the string form of the metadata (the posted line was not a valid print call).
print(str(meta_data))
Here's a complete script that does the job of copying the source alembic file and inserting some metadata:
import os
import alembic
def copy_props(i_props, o_props):
    '''
    Mirror every property of the input compound onto the output compound.

    Array and scalar properties are recreated with the same name, data type
    and metadata, given the same time sampling, and filled sample by sample.
    Compound properties are recreated and then copied recursively.
    '''
    for idx in range(i_props.getNumProperties()):
        hdr = i_props.getPropertyHeader(idx)

        # Pick the reader/writer pair for sampled properties; compounds
        # recurse instead, and any other kind is left untouched.
        if hdr.isArray():
            reader_cls = alembic.Abc.IArrayProperty
            writer_cls = alembic.Abc.OArrayProperty
        elif hdr.isScalar():
            reader_cls = alembic.Abc.IScalarProperty
            writer_cls = alembic.Abc.OScalarProperty
        elif hdr.isCompound():
            src = alembic.Abc.ICompoundProperty(i_props, hdr.getName())
            name = src.getName()
            meta = src.getMetaData()
            dst = alembic.Abc.OCompoundProperty(o_props, name, meta)
            copy_props(src, dst)
            continue
        else:
            continue

        src = reader_cls(i_props, hdr.getName())
        name = src.getName()
        meta = src.getMetaData()
        dst = writer_cls(o_props, name, src.getDataType(), meta, 0)
        dst.setTimeSampling(src.getTimeSampling())
        for sample in range(src.getNumSamples()):
            dst.setValue(src.getValue(sample))
def copy_object(i_obj, o_obj):
    '''
    Copy the properties of i_obj onto o_obj, then do the same for each child.

    Does nothing when o_obj is None.
    '''
    if o_obj is None:
        return

    copy_props(i_obj.getProperties(), o_obj.getProperties())

    for child_index in range(i_obj.getNumChildren()):
        src_child = i_obj.getChild(child_index)
        child_name = src_child.getName()
        child_meta = src_child.getMetaData()
        # Create the matching output child before descending into it.
        dst_child = alembic.Abc.OObject(o_obj, child_name, child_meta)
        copy_object(src_child, dst_child)
def copy_abc(i_path, o_path, app, description):
    '''
    Copy alembic file from i_path to o_path, stamping the copy with archive info.

    i_path: path of the archive to read.
    o_path: path of the archive to create.
    app: application name stored in the new archive's info.
    description: user description stored in the new archive's info.
    '''
    arc_in = alembic.Abc.IArchive(i_path)
    # Open the output path exactly once: a second writer created on the same
    # path would compete with this one for the file.
    # NOTE(review): the dropped OArchive call passed asOgawa=True; confirm that
    # CreateArchiveWithInfo writes the archive format you expect by default.
    arc_out = alembic.Abc.CreateArchiveWithInfo(o_path, app, description)
    top_in = arc_in.getTop()
    top_out = arc_out.getTop()
    copy_object(top_in, top_out)
def read(abc_file):
    '''Open abc_file for reading and return its archive info.'''
    return alembic.Abc.GetArchiveInfo(alembic.Abc.IArchive(abc_file))
if __name__ == '__main__':
    # Input and output archives sit next to this script.
    here = os.path.dirname(__file__)
    i_path = os.path.join(here, 'itest.abc')
    o_path = os.path.join(here, 'otest.abc')

    copy_abc(i_path, o_path, 'Cool app', 'Cool description')
    print('Created archive: ' + o_path)

    # Read the info back from the copy and print each field with its label.
    archive_info = read(o_path)
    labelled_keys = (
        ('App name: ', 'appName'),
        ('Description: ', 'userDescription'),
        ('Written: ', 'whenWritten'),
    )
    for label, key in labelled_keys:
        print(label + archive_info.get(key))
You can't write just arbitrary data but you can set description and application strings:
from alembic import Abc
MY_APP = 'My cool application'
def write(abc_file, description):
    """Create abc_file as a new archive tagged with MY_APP and description."""
    Abc.CreateArchiveWithInfo(abc_file, MY_APP, description)
def read(abc_file):
    """Open abc_file for reading and return its archive info."""
    archive = Abc.IArchive(abc_file)
    return Abc.GetArchiveInfo(archive)
# Write an archive with a description, then read the info back and print it.
abc_file = 'alembic.abc'
write(abc_file, 'An abc file cool description')
archive_info = read(abc_file)
for info_key in ('appName', 'userDescription', 'whenWritten'):
    print(archive_info.get(info_key))