I am currently deploying Azure Data Factory IaC with Terraform and DevOps Pipelines. When trying to deploy a new Delimited Text Dataset, I run into the following error:
│ Error: Unsupported block type
│
│ on ds_test.tf line 7, in resource “azurerm_data_factory_dataset_delimited_text” “test_dataset”:
│ 7: azure_blob_fs_location {
│
│ Blocks of type “azure_blob_fs_location” are not expected here.
│
##[error]Bash exited with code ‘1’.
This is my .tf file:
resource "azurerm_data_factory_dataset_delimited_text" "test_dataset" {
name = "test_dataset"
resource_group_name = "test-rsg"
data_factory_name = "test-adf"
linked_service_name = "AzureDataLakeStorage1"
azure_blob_fs_location {
file_system = "csv-dump-demo"
path = ""
filename = "personal_customer_data.csv"
}
column_delimiter = ","
row_delimiter = "\r\n"
encoding = "UTF-8"
quote_character = "\""
escape_character = "\\"
first_row_as_header = true
null_value = "NULL"
}
The Terraform documentation for Delimited Text Dataset states, that exactly one of the following location blocks need to be defined, in order to make the Dataset work:
azure_blob_fs_location
azure_blob_storage_location
http_server_location
Why is Terraform plan telling me that it is a unsupported block type? Am I missing something?
Seems like the Terraform documentation is deprecated because when I removed the block and tried to deploy the Data Set again, Terraform apply gave me the following output:
│ Error: One of http_server_location, azure_blob_storage_location must be specified to create a DataFactory Delimited Text Dataset
After I tried it with the use of azure_blob_storage_location instead of azure_blob_fs_location it worked. Maybe there are only two location blocks available anymore and the documentation is not up to date.
I have a local cluster running in Minikube. My pipeline job is written in python and is a basic consumer of Kafka. My pipeline looks as follows:
def run():
import apache_beam as beam
options = PipelineOptions([
"--runner=FlinkRunner",
"--flink_version=1.10",
"--flink_master=localhost:8081",
"--environment_type=EXTERNAL",
"--environment_config=localhost:50000",
"--streaming",
"--flink_submit_uber_jar"
])
options.view_as(SetupOptions).save_main_session = True
options.view_as(StandardOptions).streaming = True
with beam.Pipeline(options=options) as p:
(p
| 'Create words' >> ReadFromKafka(
topics=['mullerstreamer'],
consumer_config={
'bootstrap.servers': '192.168.49.1:9092,192.168.49.1:9093',
'auto.offset.reset': 'earliest',
'enable.auto.commit': 'true',
'group.id': 'BEAM-local'
}
)
| 'print' >> beam.Map(print)
)
if __name__ == "__main__":
run()
The Flink runner shows no records passing through in "Records received"
Am I missing something basic?
--environment_type=EXTERNAL means you are starting up the workers manually, and is primarily for internal testing. Does it work if you don't specify an environment_type/config at all?
def run(bootstrap_servers, topic, pipeline_args):
bootstrap_servers = 'localhost:9092'
topic = 'wordcount'
pipeline_args = pipeline_args.append('--flink_submit_uber_jar')
pipeline_options = PipelineOptions([
"--runner=FlinkRunner",
"--flink_master=localhost:8081",
"--flink_version=1.12",
pipeline_args
],
save_main_session=True, streaming=True)
with beam.Pipeline(options=pipeline_options) as pipeline:
_ = (
pipeline
| ReadFromKafka(
consumer_config={'bootstrap.servers': bootstrap_servers},
topics=[topic])
| beam.FlatMap(lambda kv: log_ride(kv[1])))
I'm facing another issue with latest apache Beam 2.30.0, Flink 1.12.4
2021/06/10 17:39:42 Initializing python harness: /opt/apache/beam/boot --id=1-2 --provision_endpoint=localhost:42353
2021/06/10 17:39:50 Failed to retrieve staged files: failed to retrieve /tmp/staged in 3 attempts: failed to retrieve chunk for /tmp/staged/pickled_main_session
caused by:
rpc error: code = Unknown desc = ; failed to retrieve chunk for /tmp/staged/pickled_main_session
caused by:
rpc error: code = Unknown desc = ; failed to retrieve chunk for /tmp/staged/pickled_main_session
caused by:
rpc error: code = Unknown desc = ; failed to retrieve chunk for /tmp/staged/pickled_main_session
caused by:
rpc error: code = Unknown desc =
2021-06-10 17:39:53,076 WARN org.apache.flink.runtime.taskmanager.Task [] - [3]ReadFromKafka(beam:external:java:kafka:read:v1)/{KafkaIO.Read, Remove Kafka Metadata} -> [1]FlatMap(<lambda at kafka-taxi.py:88>) (1/1)#0 (9d941b13ae9f28fd1460bc242b7f6cc9) switched from RUNNING to FAILED.
org.apache.beam.vendor.guava.v26_0_jre.com.google.common.util.concurrent.UncheckedExecutionException: java.lang.IllegalStateException: No container running for id d727ca3c0690d949f9ed1da9c3435b3ab3af70b6b422dc82905eed2f74ec7a15
While playing around with Flink, I have been trying to upsert data into Elasticsearch. I'm having this error on my STDOUT:
Caused by: org.apache.flink.table.api.NoMatchingTableFactoryException: Could not find a suitable table factory for 'org.apache.flink.table.factories.TableSinkFactory' in
the classpath.
Reason: Required context properties mismatch.
The following properties are requested:
connector.hosts=http://elasticsearch-elasticsearch-coordinating-only.default.svc.cluster.local:9200
connector.index=transfers-sum
connector.key-null-literal=n/a
connector.property-version=1
connector.type=elasticsearch
connector.version=6
format.json-schema={ \"curr_careUnit\": {\"type\": \"text\"}, \"sum\": {\"type\": \"float\"} }
format.property-version=1
format.type=json
schema.0.data-type=VARCHAR(2147483647)
schema.0.name=curr_careUnit
schema.1.data-type=FLOAT
schema.1.name=sum
update-mode=upsert
The following factories have been considered:
org.apache.flink.streaming.connectors.kafka.Kafka09TableSourceSinkFactory
org.apache.flink.table.sinks.CsvBatchTableSinkFactory
org.apache.flink.table.sinks.CsvAppendTableSinkFactory
at org.apache.flink.table.factories.TableFactoryService.filterByContext(TableFactoryService.java:322)
...
Here is what I have in my scala Flink code:
def main(args: Array[String]) {
// Create streaming execution environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setParallelism(1)
// Set properties per KafkaConsumer API
val properties = new Properties()
properties.setProperty("bootstrap.servers", "kafka.kafka:9092")
properties.setProperty("group.id", "test")
// Add Kafka source to environment
val myKConsumer = new FlinkKafkaConsumer010[String]("raw.data4", new SimpleStringSchema(), properties)
// Read from beginning of topic
myKConsumer.setStartFromEarliest()
val streamSource = env
.addSource(myKConsumer)
// Transform CSV (with a header row per Kafka event into a Transfers object
val streamTransfers = streamSource.map(new TransfersMapper())
// create a TableEnvironment
val tEnv = StreamTableEnvironment.create(env)
// register a Table
val tblTransfers: Table = tEnv.fromDataStream(streamTransfers)
tEnv.createTemporaryView("transfers", tblTransfers)
tEnv.connect(
new Elasticsearch()
.version("6")
.host("elasticsearch-elasticsearch-coordinating-only.default.svc.cluster.local", 9200, "http")
.index("transfers-sum")
.keyNullLiteral("n/a")
.withFormat(new Json().jsonSchema("{ \"curr_careUnit\": {\"type\": \"text\"}, \"sum\": {\"type\": \"float\"} }"))
.withSchema(new Schema()
.field("curr_careUnit", DataTypes.STRING())
.field("sum", DataTypes.FLOAT())
)
.inUpsertMode()
.createTemporaryTable("transfersSum")
val result = tEnv.sqlQuery(
"""
|SELECT curr_careUnit, sum(los)
|FROM transfers
|GROUP BY curr_careUnit
|""".stripMargin)
result.insertInto("transfersSum")
env.execute("Flink Streaming Demo Dump to Elasticsearch")
}
}
I am creating a fat jar and uploading it to my remote flink instance. Here is my build.gradle dependencies:
compile 'org.scala-lang:scala-library:2.11.12'
compile 'org.apache.flink:flink-scala_2.11:1.10.0'
compile 'org.apache.flink:flink-streaming-scala_2.11:1.10.0'
compile 'org.apache.flink:flink-connector-kafka-0.10_2.11:1.10.0'
compile 'org.apache.flink:flink-table-api-scala-bridge_2.11:1.10.0'
compile 'org.apache.flink:flink-connector-elasticsearch6_2.11:1.10.0'
compile 'org.apache.flink:flink-json:1.10.0'
compile 'com.fasterxml.jackson.core:jackson-core:2.10.1'
compile 'com.fasterxml.jackson.module:jackson-module-scala_2.11:2.10.1'
compile 'org.json4s:json4s-jackson_2.11:3.7.0-M1'
Here is how the farJar command is built for gradle:
jar {
from {
(configurations.compile).collect {
it.isDirectory() ? it : zipTree(it)
}
}
manifest {
attributes("Main-Class": "main" )
}
}
task fatJar(type: Jar) {
zip64 true
manifest {
attributes 'Main-Class': "flinkNamePull.Demo"
}
baseName = "${rootProject.name}"
from { configurations.compile.collect { it.isDirectory() ? it : zipTree(it) } }
with jar
}
Could anybody please help me to see what I am missing? I'm fairly new to Flink and data streaming in general. Hehe
Thank you in advance!
Is the list in The following factories have been considered: complete? Does it contain Elasticsearch6UpsertTableSinkFactory? If not as far as I can tell there is a problem with the service discovery dependencies.
How do you submit your job? Can you check if you have a file META-INF/services/org.apache.flink.table.factories.TableFactory in the uber jar with an entry for Elasticsearch6UpsertTableSinkFactory?
When using maven you have to add a transformer to properly merge service files:
<!-- The service transformer is needed to merge META-INF/services files -->
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
I don't know how do you do it in gradle.
EDIT:
Thanks to Arvid Heise
In gradle when using shadowJar plugin you can merge service files via:
// Merging Service Files
shadowJar {
mergeServiceFiles()
}
You should use the shadow plugin to create the fat jar instead of doing it manually.
In particular, you want to merge service descriptors.
I am new in Play Framework (Scala) and need some advise.
I use Scala 2.12 and Play Framework 2.6.20. I need to use several databases in my project. Right now I connected MySQL database as it says in documentation. How correctly connect project to remote Oracle 12g database?
application.conf:
db {
mysql.driver = com.mysql.cj.jdbc.Driver
mysql.url = "jdbc:mysql://host:port/database?characterEncoding=UTF-8"
mysql.username = "username"
mysql.password = "password"
}
First of all to lib folder I put ojdbc8.jar file from oracle website.
Then add libraryDependencies += "com.oracle" % "ojdbc8" % "12.1.0.1" code to sbt.build file. Finally I wrote settings to aplication.conf file.
After that step I notice error in terminal:
[error] (*:update) sbt.ResolveException: unresolved dependency: com.oracle#ojdbc8;12.1.0.1: not found
[error] Total time: 6 s, completed 10.11.2018 16:48:30
Java HotSpot(TM) 64-Bit Server VM warning: ignoring option MaxPermSize=256M; support was removed in 8.0
EDIT:
application.conf:
db {
mysql.driver = com.mysql.cj.jdbc.Driver
mysql.url = "jdbc:mysql://#host:#port/#database?characterEncoding=UTF-8"
mysql.username = "#username"
mysql.password = "#password"
oracle.driver = oracle.jdbc.driver.OracleDriver
oracle.url = "jdbc:oracle:thin:#host:#port/#sid"
oracle.username = "#username"
oracle.password = "#password"
}
ERROR:
play.api.UnexpectedException: Unexpected exception[CreationException: Unable to create injector, see the following errors:
1) No implementation for play.api.db.Database was bound.
while locating play.api.db.Database
for the 1st parameter of controllers.GetMarkersController.<init>(GetMarkersController.scala:14)
while locating controllers.GetMarkersController
for the 7th parameter of router.Routes.<init>(Routes.scala:45)
at play.api.inject.RoutesProvider$.bindingsFromConfiguration(BuiltinModule.scala:121):
Binding(class router.Routes to self) (via modules: com.google.inject.util.Modules$OverrideModule -> play.api.inject.guice.GuiceableModuleConversions$$anon$1)
GetMarkersController.scala:
package controllers
import javax.inject._
import akka.actor.ActorSystem
import play.api.Configuration
import play.api.mvc.{AbstractController, ControllerComponents}
import play.api.libs.ws._
import scala.concurrent.duration._
import scala.concurrent.{ExecutionContext, Future, Promise}
import services._
import play.api.db.Database
class GetMarkersController #Inject()(db: Database, conf: Configuration, ws: WSClient, cc: ControllerComponents, actorSystem: ActorSystem)(implicit exec: ExecutionContext) extends AbstractController(cc) {
def getMarkersValues(start_date: String, end_date: String) = Action.async {
getValues(1.second, start_date: String, end_date: String).map {
message => Ok(message)
}
}
private def getValues(delayTime: FiniteDuration, start_date: String, end_date: String): Future[String] = {
val promise: Promise[String] = Promise[String]()
val service: GetMarkersService = new GetMarkersService(db)
actorSystem.scheduler.scheduleOnce(delayTime) {
promise.success(service.get_markers(start_date, end_date))
}(actorSystem.dispatcher)
promise.future
}
}
You cannot access Oracle without credentials. You need to have an account with Oracle. Then add something like the following to your build.sbt file
resolvers += "Oracle" at "https://maven.oracle.com"
credentials += Credentials("Oracle", "maven.oracle.com", "username", "password")
More information about accessing the OTN: https://docs.oracle.com/middleware/1213/core/MAVEN/config_maven_repo.htm#MAVEN9012
If you have the hard coded jar, you don't need to include as a dependency. See unmanagedDependencies https://www.scala-sbt.org/1.x/docs/Library-Dependencies.html
I am trying to create files in a Dropbox.com folder from a GAE application.
I have done all the steps the register a Dropbox application and installed the Python SDK from Dropbox locally on my development machine. (see dropbox.com API).
It all works perfectly when I use the cli_client.py test script in the dropbox SDK on my local machine to access dropbox - can 'put' files etc.
I now want to start working in GAE environment, so things get a bit tricky.
Some help would be useful.
For those familiar with the Dropbox API code, I had the following issues thus far:
Issue 1
The rest.py Dropbox API module uses pkg_resources to get the certs installed in site-packages of a local machine installation.
I replaced
TRUSTED_CERT_FILE = pkg_resources.resource_filename(__name__, 'trusted-certs.crt')
with
TRUSTED_CERT_FILE = file('trusted-certs.crt')
and placed the cert file in my GAE application directory. Perhaps this is not quite right; see my authentication error code below.
Issue 2
The session.py Dropbox API module uses oauth module, so I changed the include to appengine oauth.
But raised an exception that GAE's oauth does not have OAuthConsumer method used by the Dropbox session.py module. So i downloaded oauth 1.0 and added to my application an now import this instead of GAE oauth.
Issue 3
GAE ssl module does not seem to have CERT_REQUIRED property.
This is a constant, so I changed
self.cert_reqs = ssl.CERT_REQUIRED
to
self.cert_reqs = 2
This is used when calling
ssl.wrap_socket(sock, cert_reqs=self.cert_reqs, ca_certs=self.ca_certs)
Authentication Error
But I still can't connect to Dropbox:
Status: 401
Reason: Unauthorized
Body: {"error": "Authentication failed"}
Headers: [('date', 'Sun, 19 Feb 2012 15:11:12 GMT'), ('transfer-encoding', 'chunked'), ('connection', 'keep-alive'), ('content-type', 'application/json'), ('server', 'dbws')]
Here's my patched version of Dropbox Python SDK 1.4 which works well for me with Python 2.7 GAE: dropbox_python_sdk_gae_patched.7z.base64. No extra third-party libraries needed, only those provided by GAE environment.
Only file uploading (put_file) is tested. Here're setup steps:
Unpack archive to the root folder of GAE application (if main app is in the root folder). You can decode BASE64 using Base64 Encoder/Decoder: base64.exe -d dropbox_python_sdk_gae_patched.7z.base64 dropbox_python_sdk_gae_patched.7z.
Setup APP_KEY, APP_SECRET, ACCESS_TYPE, ACCESS_TOKEN_KEY, ACCESS_TOKEN_SECRET. First three are configured at dropbox application creation time. Last two are obtained when granting application access to specific dropbox account, you can get them through cli_client.py (from DB Python SDK) from token_store.txt file.
Use in the code like this:
import dropbox
# ...
def DropboxUpload(path, data):
sess = dropbox.session.DropboxSession(APP_KEY, APP_SECRET, ACCESS_TYPE)
sess.set_token(ACCESS_TOKEN_KEY, ACCESS_TOKEN_SECRET)
cli = dropbox.client.DropboxClient(sess)
data_file = StringIO.StringIO(data)
return cli.put_file(path, data_file)
# ...
import json
class DropboxUploadHandlerExample(webapp2.RequestHandler):
def get(self):
url = "http://www.google.com/"
result = urlfetch.fetch(url)
self.response.headers['Content-Type'] = 'application/json'
self.response.out.write(json.dumps(DropboxUpload('/fetch_result.dat', result.content)))
I successfully uploaded from Google Appengine to Dropbox with my own patched version
of the Dropbox SDK: https://github.com/cklein/dropbox-client-python
The usage of urllib2 was replaced by huTools.http: https://github.com/hudora/huTools/
This is the code that is called in a request handler:
db_client = dropbox.get_dropbox_client(consumer_key='', consumer_secret='', access_token_key='', access_token_secret='')
fileobj = StringIO.StringIO(data)
path = '/some/path/filename'
resp = db_client.put_file(path, fileobj)
fileobj.close()
As of April 2016, none of the other suggestions work. (Dropbox API version 2, Python SDK version 6.2).
If you only need a few of the SDK functions, I found it easiest to just use the HTTP API directly:
def files_upload(f, path, mode='add', autorename=False, mute=False):
args = {
'path': path,
'mode': mode,
'autorename': autorename,
'mute': mute,
}
headers = {
'Authorization': 'Bearer {}'.format(ACCESS_TOKEN),
'Dropbox-API-Arg': json.dumps(args),
'Content-Type': 'application/octet-stream',
}
request = urllib2.Request('https://content.dropboxapi.com/2/files/upload', f, headers=headers)
r = urllib2.urlopen(request)
I have patched the Dropbox Python SDK version 2.2 to work on Google App Engine. Please find the relevant code here:
https://github.com/duncanhawthorne/gae-dropbox-python
The relevant code patch (copied from github) for rest.py is here:
import io
import pkg_resources
-import socket
+#import socket
import ssl
import sys
import urllib
+import urllib2
+def mock_urlopen(method,url,body,headers,preload_content):
+ request = urllib2.Request(url, body, headers=headers)
+ r = urllib2.urlopen(request)
+ return r
+
try:
import json
except ImportError:
## -23,7 +29,10 ##
SDK_VERSION = "2.2.0"
-TRUSTED_CERT_FILE = pkg_resources.resource_filename(__name__, 'trusted-certs.crt')
+try:
+ TRUSTED_CERT_FILE = pkg_resources.resource_filename(__name__, 'trusted-certs.crt')
+except:
+ TRUSTED_CERT_FILE = file('trusted-certs.crt')
class RESTResponse(io.IOBase):
## -125,6 +134,7 ## def flush(self):
pass
def create_connection(address):
+ return
host, port = address
err = None
for res in socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM):
## -152,7 +162,7 ## def json_loadb(data):
class RESTClientObject(object):
- def __init__(self, max_reusable_connections=8, mock_urlopen=None):
+ def __init__(self, max_reusable_connections=8, mock_urlopen=mock_urlopen):
"""
Parameters
max_reusable_connections
## -206,7 +216,7 ## def request(self, method, url, post_params=None, body=None, headers=None, raw_re
raise ValueError("headers should not contain newlines (%s: %s)" %
(key, value))
- try:
+ if True:
# Grab a connection from the pool to make the request.
# We return it to the pool when caller close() the response
urlopen = self.mock_urlopen if self.mock_urlopen else self.pool_manager.urlopen
## -217,14 +227,14 ## def request(self, method, url, post_params=None, body=None, headers=None, raw_re
headers=headers,
preload_content=False
)
- r = RESTResponse(r) # wrap up the urllib3 response before proceeding
- except socket.error as e:
- raise RESTSocketError(url, e)
- except urllib3.exceptions.SSLError as e:
- raise RESTSocketError(url, "SSL certificate error: %s" % e)
+ #r = RESTResponse(r) # wrap up the urllib3 response before proceeding
+ #except socket.error as e:
+ # raise RESTSocketError(url, e)
+ #except urllib3.exceptions.SSLError as e:
+ # raise RESTSocketError(url, "SSL certificate error: %s" % e)
- if r.status not in (200, 206):
- raise ErrorResponse(r, r.read())
+ #if r.status not in (200, 206):
+ # raise ErrorResponse(r, r.read())
return self.process_response(r, raw_response)
## -321,10 +331,11 ## def PUT(cls, *n, **kw):
return cls.IMPL.PUT(*n, **kw)
-class RESTSocketError(socket.error):
+class RESTSocketError():
"""A light wrapper for ``socket.error`` that adds some more information."""
def __init__(self, host, e):
+ return
msg = "Error connecting to \"%s\": %s" % (host, str(e))
socket.error.__init__(self, msg)