build.sbt
name := "BigData"
version := "0.1"
scalaVersion := "2.12.7"
libraryDependencies += "com.github.tototoshi" %% "scala-csv" % "1.3.5"
// https://mvnrepository.com/artifact/org.apache.spark/spark-core
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.4.0"
// https://mvnrepository.com/artifact/org.apache.spark/spark-sql
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.0"
// https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc
libraryDependencies += "com.microsoft.sqlserver" % "mssql-jdbc" % "6.1.0.jre8" % Test
SparkMSSQL.scala
import org.apache.spark.sql.SparkSession

object SparkMSSQL {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local[*]")
      .appName("Simple Application")
      .getOrCreate()

    val url = "jdbc:sqlserver://localhost;databaseName=scalatest;integratedSecurity=true"

    // Database table to load into a DataFrame
    val jdbcDbTable = "dbo.user_profiles"

    val df = spark
      .read
      .format("jdbc")
      .option("url", url)
      .option("dbtable", "dbo.user_profiles")
      .load()

    df.printSchema()
  }
}
Error at runtime
Exception in thread "main" java.sql.SQLException: No suitable driver
at java.sql.DriverManager.getDriver(DriverManager.java:315)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$2(JDBCOptions.scala:105)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:105)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:35)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:32)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:318)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:167)
Please advise, what is wrong with my code?
First, you have your JDBC driver in the Test scope, so the jar is probably not on the classpath at runtime. Also, Spark needs the driver class name to open the JDBC connection, so try adding the following option to the DataFrame reader:
.option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
I want to run SQL expressions locally over data streams from Kafka, Kinesis, etc.
I've tried running the following code, which creates a data stream source from Kafka, registers it as a table, runs a select * on it, and gets the result back as a data stream. CollectSink is just a utility sink I use to be able to debug those messages.
val env: StreamExecutionEnvironment = StreamExecutionEnvironment.createLocalEnvironment()
val tableEnv: StreamTableEnvironment = StreamTableEnvironment.create(env, EnvironmentSettings.inStreamingMode())

val source = KafkaSource.builder<String>()
    .setBootstrapServers(defaultKafkaProperties["bootstrap.servers"] as String)
    .setTopics(topicName)
    .setGroupId("my-group-test" + Math.random())
    .setStartingOffsets(OffsetsInitializer.earliest())
    .setValueOnlyDeserializer(SimpleStringSchema())
    .build()

val stream = env.fromSource(source, WatermarkStrategy.noWatermarks(), "Kafka Source")

val table = tableEnv.fromDataStream(stream)
tableEnv.createTemporaryView("Source", table)

val resultTable = tableEnv.sqlQuery("select * from Source")
val resultStream = tableEnv.toDataStream(resultTable)
resultStream.addSink(CollectSink())

env.execute()
I always get the following error, and I don't know why. I'm not using Scala in my application code, so I assume I'm missing a dependency somewhere?
java.lang.NoClassDefFoundError: scala/Serializable
at java.base/java.lang.ClassLoader.defineClass1(Native Method)
at java.base/java.lang.ClassLoader.defineClass(ClassLoader.java:1017)
at java.base/java.security.SecureClassLoader.defineClass(SecureClassLoader.java:151)
at java.base/jdk.internal.loader.BuiltinClassLoader.defineClass(BuiltinClassLoader.java:821)
at java.base/jdk.internal.loader.BuiltinClassLoader.findClassOnClassPathOrNull(BuiltinClassLoader.java:719)
at java.base/jdk.internal.loader.BuiltinClassLoader.loadClassOrNull(BuiltinClassLoader.java:642)
at java.base/jdk.internal.loader.BuiltinClassLoader.loadClass(BuiltinClassLoader.java:600)
at java.base/jdk.internal.loader.ClassLoaders$AppClassLoader.loadClass(ClassLoaders.java:178)
at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:522)
at org.apache.flink.table.planner.expressions.PlannerTypeInferenceUtilImpl.<clinit>(PlannerTypeInferenceUtilImpl.java:51)
at org.apache.flink.table.planner.delegation.PlannerBase.<init>(PlannerBase.scala:92)
at org.apache.flink.table.planner.delegation.StreamPlanner.<init>(StreamPlanner.scala:52)
at org.apache.flink.table.planner.delegation.DefaultPlannerFactory.create(DefaultPlannerFactory.java:61)
at org.apache.flink.table.factories.PlannerFactoryUtil.createPlanner(PlannerFactoryUtil.java:50)
at org.apache.flink.table.api.bridge.java.internal.StreamTableEnvironmentImpl.create(StreamTableEnvironmentImpl.java:151)
at org.apache.flink.table.api.bridge.java.StreamTableEnvironment.create(StreamTableEnvironment.java:128)
These are the dependencies I'm using
val flinkVersion = "1.14.3"
val flinkScalaVersion = "2.12"
implementation("org.apache.flink:flink-clients_$flinkScalaVersion:$flinkVersion")
implementation("org.apache.flink:flink-table-api-java-bridge_$flinkScalaVersion:$flinkVersion")
implementation("org.apache.flink:flink-table-planner_$flinkScalaVersion:$flinkVersion")
implementation("org.apache.flink:flink-streaming-scala_$flinkScalaVersion:$flinkVersion")
implementation("org.apache.flink:flink-table-common:$flinkVersion")
implementation("org.apache.flink:flink-connector-kafka_$flinkScalaVersion:$flinkVersion")
implementation("org.apache.flink:flink-avro-confluent-registry:$flinkVersion")
implementation("org.apache.flink:flink-avro:$flinkVersion")
I'm trying to register a Scala UDF in PyFlink using an external JAR as follows, but I get the error below.
Scala UDF:
package com.dummy

import org.apache.flink.table.functions.ScalarFunction

class dummyTransform(factor: Int) extends ScalarFunction {
  def eval(s: String): Int = {
    s.hashCode()
  }
}
build.sbt:
name := "hello_scala_for_flink"
version := "0.1"
scalaVersion := "2.12.11"
libraryDependencies += "org.apache.flink" % "flink-table-common" % "1.11.2" % "provided"
assembly.sbt:
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5")
Scala project structure:
hello_scala_for_flink/
  project/
  src/
    main/
      resources/
      scala/
        com.dummy/
          dummyTransform
          hello_scala
    test
  target/
  build.sbt
shell:
cd hello_scala_for_flink
sbt assembly
cp ./target/scala-2.12/hello_scala_for_flink-assembly-0.1.jar /Users/py-r/opt/anaconda3/envs/venv_pyflink_37/lib/python3.7/site-packages/pyflink/lib/
Python:
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, TableConfig

exec_env = ExecutionEnvironment.get_execution_environment()
#exec_env = StreamExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
table_env = BatchTableEnvironment.create(exec_env, t_config)
table_env.register_java_function("hash_code", "com.dummy.dummyTransform")
Error (incl. after restarting Anaconda):
Py4JJavaError: An error occurred while calling o12.newInstance.
: java.lang.InstantiationException: com.dummy.dummyTransform
at java.base/java.lang.Class.newInstance(Class.java:598)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:564)
at org.apache.flink.api.python.shaded.py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at org.apache.flink.api.python.shaded.py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at org.apache.flink.api.python.shaded.py4j.Gateway.invoke(Gateway.java:282)
at org.apache.flink.api.python.shaded.py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at org.apache.flink.api.python.shaded.py4j.commands.CallCommand.execute(CallCommand.java:79)
at org.apache.flink.api.python.shaded.py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.base/java.lang.Thread.run(Thread.java:832)
Caused by: java.lang.NoSuchMethodException: com.dummy.dummyTransform.<init>()
at java.base/java.lang.Class.getConstructor0(Class.java:3427)
at java.base/java.lang.Class.newInstance(Class.java:585)
... 11 more
Versions:
jdk = 1.8.0_151.jdk
scala = 2.12.11
python = 3.7
apache-beam = 2.19.0
apache-flink = 1.11.2
Any idea what the issue might be?
Thanks for your support
Looks like I found the issue myself. Apparently a no-argument constructor was required in the above code:
class dummyTransform(factor: Int) extends ScalarFunction {
  def eval(s: String): Int = {
    s.hashCode() * factor
  }

  def this() = this(1)
}
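For context, here is a rough sketch (not Flink's exact code) of why the extra constructor matters: register_java_function instantiates the UDF class reflectively on the Java side, roughly as below, and that reflective call fails with NoSuchMethodException when only the Int-argument constructor exists.
// Roughly what happens when the UDF is registered from PyFlink;
// it requires a public no-argument constructor on the UDF class.
val udf = Class.forName("com.dummy.dummyTransform")
  .getDeclaredConstructor()
  .newInstance()
  .asInstanceOf[org.apache.flink.table.functions.ScalarFunction]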
Also, I changed to Scala 2.11.12 due to another error. Now everything seems to work. Awesome!
I would like to use Slick (3.2.3) to connect to an MSSQL database.
Currently, my project is set up as follows.
In application.conf, I have
somedbname = {
  driver = "slick.jdbc.SQLServerProfile$"
  db {
    host = "somehost"
    port = "someport"
    databaseName = "Recupel.Datawarehouse"
    url = "jdbc:sqlserver://"${somedbname.db.host}":"${somedbname.db.port}";databaseName="${somedbname.db.databaseName}";"
    user = "someuser"
    password = "somepassword"
  }
}
The "somehost" looks like XX.X.XX.XX where X's are numbers.
My build.sbt contains
name := "test-slick"
version := "0.1"
scalaVersion in ThisBuild := "2.12.7"
libraryDependencies ++= Seq(
  "com.typesafe.slick" %% "slick" % "3.2.3",
  "com.typesafe.slick" %% "slick-hikaricp" % "3.2.3",
  "org.slf4j" % "slf4j-nop" % "1.6.4",
  "com.microsoft.sqlserver" % "mssql-jdbc" % "7.0.0.jre10"
)
The file with the "main" object contains
import slick.basic.DatabaseConfig
import slick.jdbc.JdbcProfile
import slick.jdbc.SQLServerProfile.api._
import scala.concurrent.Await
import scala.concurrent.duration._

// The enclosing object declaration was missing from this snippet; "Main" is an assumed name.
object Main {
val dbConfig: DatabaseConfig[JdbcProfile] = DatabaseConfig.forConfig("somedbname")
val db: JdbcProfile#Backend#Database = dbConfig.db
def main(args: Array[String]): Unit = {
try {
val future = db.run(sql"SELECT * FROM somettable".as[(Int, String, String, String, String,
String, String, String, String, String, String, String)])
println(Await.result(future, 10.seconds))
} finally {
db.close()
}
}
}
This, according to all the documentation that I could find, should be enough to connect to the database. However, when I run this, I get
[error] (run-main-0) java.sql.SQLTransientConnectionException: somedbname.db - Connection is not available, request timed out after 1004ms.
[error] java.sql.SQLTransientConnectionException: somedbname.db - Connection is not available, request timed out after 1004ms.
[error] at com.zaxxer.hikari.pool.HikariPool.createTimeoutException(HikariPool.java:548)
[error] at com.zaxxer.hikari.pool.HikariPool.getConnection(HikariPool.java:186)
[error] at com.zaxxer.hikari.pool.HikariPool.getConnection(HikariPool.java:145)
[error] at com.zaxxer.hikari.HikariDataSource.getConnection(HikariDataSource.java:83)
[error] at slick.jdbc.hikaricp.HikariCPJdbcDataSource.createConnection(HikariCPJdbcDataSource.scala:14)
[error] at slick.jdbc.JdbcBackend$BaseSession.<init>(JdbcBackend.scala:453)
[error] at slick.jdbc.JdbcBackend$DatabaseDef.createSession(JdbcBackend.scala:46)
[error] at slick.jdbc.JdbcBackend$DatabaseDef.createSession(JdbcBackend.scala:37)
[error] at slick.basic.BasicBackend$DatabaseDef.acquireSession(BasicBackend.scala:249)
[error] at slick.basic.BasicBackend$DatabaseDef.acquireSession$(BasicBackend.scala:248)
[error] at slick.jdbc.JdbcBackend$DatabaseDef.acquireSession(JdbcBackend.scala:37)
[error] at slick.basic.BasicBackend$DatabaseDef$$anon$2.run(BasicBackend.scala:274)
[error] at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1135)
[error] at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
[error] at java.base/java.lang.Thread.run(Thread.java:844)
[error] Nonzero exit code: 1
Perhaps related, and also annoying: when I run this code a second (and subsequent) time, I get the following error instead:
Failed to get driver instance for jdbcUrl=jdbc:sqlserver://[...]
which forces me to kill and reload sbt each time.
What am I doing wrong? Worth noting: I can connect to the database with the same credentials from software like Valentina.
As suggested by @MarkRotteveel, and following this link, I found a solution.
First, I explicitly set the driver, adding the line
driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver"
in the db dictionary, after password = "somepassword".
Second, the default timeout (one second) appears to be too short for my purposes, so I added the line
connectionTimeout = "30 seconds"
after the previous driver line, still in the db dictionary.
Now it works.
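Putting it together, the db section of the application.conf from the question looks roughly like this with both additions in place:
db {
  host = "somehost"
  port = "someport"
  databaseName = "Recupel.Datawarehouse"
  url = "jdbc:sqlserver://"${somedbname.db.host}":"${somedbname.db.port}";databaseName="${somedbname.db.databaseName}";"
  user = "someuser"
  password = "somepassword"
  driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver"
  connectionTimeout = "30 seconds"
}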
I'm trying to consume some Kafka topics through Flink streams.
Below is the code for the stream.
object VehicleFuelEventStream {
def sink[IN](hosts: String, table: String, ds: DataStream[IN]): CassandraSink[IN] = {
CassandraSink.addSink(ds)
.setQuery(s"INSERT INTO global.$table values (id, vehicle_id, consumption, from, to, version) values (?, ?, ?, ?, ?, ?);")
.setClusterBuilder(new ClusterBuilder() {
override def buildCluster(builder: Cluster.Builder): Cluster = {
builder.addContactPoints(hosts.split(","): _*).build()
}
}).build()
}
def dataStream[T: TypeInformation](env: StreamExecutionEnvironment,
source: SourceFunction[String],
flinkConfigs: FlinkStreamProcessorConfigs,
windowFunction: WindowFunction[VehicleFuelEvent, (String, Double, SortedMap[Date, Double], Long, Long), String, TimeWindow]): DataStream[(String, Double, SortedMap[Date, Double], Long, Long)] = {
val fuelEventStream = env.addSource(source)
.map(e => EventSerialiserRepo.fromJsonStr[VehicleFuelEvent](e))
.filter(_.isSuccess)
.map(_.get)
.assignTimestampsAndWatermarks(VehicleFuelEventTimestampExtractor.withWatermarkDelay(flinkConfigs.windowWatermarkDelay))
.keyBy(_.vehicleId) // key by VehicleId
.timeWindow(Time.minutes(flinkConfigs.windowTimeWidth.toMinutes))
.apply(windowFunction)
fuelEventStream
}
}
The stream is triggered when the Play framework creates its dependencies on startup via Google Guice, as below.
@Singleton
class VehicleEventKafkaConsumer @Inject()(conf: Configuration,
lifecycle: ApplicationLifecycle,
repoFactory: StorageFactory,
cache: CacheApi,
cassandra: Cassandra,
fleetConfigs: FleetConfigManager) {
private val kafkaConfigs = KafkaConsumerConfigs(conf)
private val flinkConfigs = FlinkStreamProcessorConfigs(conf)
private val topicsWithClassTags = getClassTagsForTopics
private val cassandraConfigs = CassandraConfigs(conf)
private val repoCache = mutable.HashMap.empty[String, CachedSubjectStatePersistor]
private val props = new Properties()
// add props
private val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
env.enableCheckpointing(flinkConfigs.checkpointingInterval.toMillis)
if (flinkConfigs.enabled) {
topicsWithClassTags.toList
.map {
case (topic, tag) if tag.runtimeClass.isAssignableFrom(classOf[VehicleFuelEvent]) =>
Logger.info(s"starting for - $topic and $tag")
val source = new FlinkKafkaConsumer09[String](topic, new SimpleStringSchema(), props)
val fuelEventStream = VehicleFuelEventStream.dataStream[String](env, source, flinkConfigs, new VehicleFuelEventWindowFunction)
VehicleFuelEventStream.sink(cassandraConfigs.hosts, flinkConfigs.cassandraTable, fuelEventStream)
case (topic, _) =>
Logger.info(s"no stream processor found for topic $topic")
}
Logger.info("starting flink stream processors")
env.execute("flink vehicle event processors")
} else
Logger.info("Flink stream processor is disabled!")
}
I get the errors below on application startup.
03/13/2018 05:47:23 TriggerWindow(TumblingEventTimeWindows(1800000), ListStateDescriptor{serializer=org.apache.flink.api.common.typeutils.base.ListSerializer#e899e41f}, EventTimeTrigger(), WindowedStream.apply(WindowedStream.scala:582)) -> Sink: Cassandra Sink(4/4) switched to RUNNING
2018-03-13 05:47:23,262 - [info] o.a.f.r.t.Task - TriggerWindow(TumblingEventTimeWindows(1800000), ListStateDescriptor{serializer=org.apache.flink.api.common.typeutils.base.ListSerializer#e899e41f}, EventTimeTrigger(), WindowedStream.apply(WindowedStream.scala:582)) -> Sink: Cassandra Sink (3/4) (d5124be9bcef94bd0e305c4b4546b055) switched from RUNNING to FAILED.
org.apache.flink.streaming.runtime.tasks.StreamTaskException: Cannot load user class: streams.fuelevent.VehicleFuelEventWindowFunction
ClassLoader info: URL ClassLoader:
Class not resolvable through given classloader.
at org.apache.flink.streaming.api.graph.StreamConfig.getStreamOperator(StreamConfig.java:232)
at org.apache.flink.streaming.runtime.tasks.OperatorChain.<init>(OperatorChain.java:95)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:231)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:718)
at java.lang.Thread.run(Thread.java:745)
2018-03-13 05:47:23,262 - [info] o.a.f.r.t.Task - TriggerWindow(TumblingEventTimeWindows(1800000), ListStateDescriptor{serializer=org.apache.flink.api.common.typeutils.base.ListSerializer#e899e41f}, EventTimeTrigger(), WindowedStream.apply(WindowedStream.scala:582)) -> Sink: Cassandra Sink (2/4) (6b3de15a4f6
.....
org.apache.flink.streaming.runtime.tasks.StreamTaskException: Could not instantiate outputs in order.
at org.apache.flink.streaming.api.graph.StreamConfig.getOutEdgesInOrder(StreamConfig.java:394)
at org.apache.flink.streaming.runtime.tasks.OperatorChain.<init>(OperatorChain.java:103)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:231)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:718)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassNotFoundException: streams.fuelevent.VehicleFuelEventStream$$anonfun$4
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at org.apache.flink.runtime.execution.librarycache.FlinkUserCodeClassLoaders$ChildFirstClassLoader.loadClass(FlinkUserCodeClassLoaders.java:128)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at java.lang.Class.forName0(Native Method)
2018-03-13 05:47:23,336 - [info] o.a.f.r.t.Task - Source: Custom Source -> Map -> Filter -> Map -> Timestamps/Watermarks (4/4) (97b2313c985592fdec0ac4f7fba8062f) switched from RUNNING to FAILED.
dependencies
// kafka
libraryDependencies += "org.apache.kafka" %% "kafka" % "1.0.0"
//flink
libraryDependencies += "org.apache.flink" %% "flink-streaming-scala" % "1.4.2"
libraryDependencies += "org.apache.flink" %% "flink-connector-kafka-0.9" % "1.4.0"
libraryDependencies += "org.apache.flink" %% "flink-connector-cassandra" % "1.4.2"
Any help solving this issue is appreciated.
Looks like Flink can't find the class streams.fuelevent.VehicleFuelEventStream.
Is that class on the Flink classpath or inside your jar file?
The Flink docs could shed some light on this: https://ci.apache.org/projects/flink/flink-docs-release-1.4/monitoring/debugging_classloading.html
I am trying to run the tests of a project I cloned. I am new to Scala and Activator and I cannot figure out what is happening.
In the root folder I have this build.sbt:
name := "xxxxx"
version := "2.0-SNAPSHOT"
scalaVersion := "2.11.5"
doc in Compile <<= target.map(_ / "none")
scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature")
lazy val root = (project in file(".")).enablePlugins(PlayScala)
libraryDependencies ++= Seq(
  jdbc,
  anorm,
  cache,
  "org.scalatest" % "scalatest_2.11" % "2.2.4" % "test",
  "json-module" %% "json-module" % "2.0.0"
)
instrumentSettings
ScoverageKeys.excludedPackages in ScoverageCompile := "<empty>;views*;views.*;views*html;Routes*;controllers*routes*;controllers*Reverse*;controllers*javascript*;controller*ref*"
ScoverageKeys.highlighting := true
This is an example test file:
class TestExampleSpec extends AbstractSpec {

  "TestExample#get(:testId)" must {
    val url = controllers.routes.TestExample.get(1, 1, 10).url

    "return 400 without X-Client-Id header" in {
      val fakeRequest = FakeRequest("GET", url).withTextBody("")
      val result: Future[Result] = controllers.TestExample.get(1, 1, 10)(fakeRequest)
      status(result) mustBe 400
      contentType(result) mustBe Some("application/json")
      val body = contentAsString(result)
      val bodyJson = Json.parse(body)
      (bodyJson \ "error").asOpt[Boolean] mustBe Some(true)
      (bodyJson \ "message").asOpt[String] mustBe Some("Missing X-Client-Id header")
    }
I tried these commands:
$ activator reload
$ activator clean
$ activator test
But none of this worked; I get this kind of message:
home/username/code/testing-project/test/TestExampleSpec.scala:10: value in is not a member of String
"return 400 without X-Client-Id header" in {
Thank you
I think the problem was related to the project being changed from Play 2.2 to Play 2.3.8.
I added this dependency:
"org.scalatestplus" %% "play" % "1.2.0" % "test"
which seems to have done the trick.
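For completeness, here is a sketch of the libraryDependencies block from the build.sbt above with that dependency added (everything else unchanged):
libraryDependencies ++= Seq(
  jdbc,
  anorm,
  cache,
  "org.scalatest" % "scalatest_2.11" % "2.2.4" % "test",
  "org.scalatestplus" %% "play" % "1.2.0" % "test",
  "json-module" %% "json-module" % "2.0.0"
)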