JSON sample data:
{"location": {"city": "Lexington","zip": "40503"},"dimensions": {"sq_ft": "1000"},"type": "Residential","sale_date": "2016-02-6","price": "75836"},
{"location": {"city": "Belmont","zip": "02478"},"dimensions": {"sq_ft": "1103"},"type": "Residential","sale_date": "2016-10-16","price": "92567"},
{"location": {"city": "Winchester","zip": "01890"},"dimensions": {"sq_ft": "1122"},"type": "Condo","sale_date": "2016-11-26","price": "89921"},
{"location": {"city": "Winchester","zip": "01890"},"dimensions": {"sq_ft": "1122"},"type": "Condo","sale_date": "","price": "89921"}
DDL:
create or replace table sales_json_details(
dimensions varchar(1000),
location varchar(1000),
price number,
sale_date timestamp,
type varchar(100)
)
Query to load the data:
copy into sales_json_details from @internal_stage_json_demo
file_format = (type = json NULL_IF='' )
MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE;
Exception:
Failed to cast variant value "" to TIMESTAMP_NTZ
How can I load NULL instead of the empty string "" with the copy option MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE?
It seems Snowflake tries to cast the value to TIMESTAMP before applying NULL_IF, so you can use the NULLIF function in a COPY transformation:
create or replace file format JSONFORMAT type = JSON;
copy into sales_json_details from (
SELECT $1:dimensions::VARCHAR,
$1:location::VARCHAR,
$1:price::VARCHAR,
NULLIF($1:sale_date::varchar,'' ),
$1:type::VARCHAR
from @internal_stage_json_demo (FILE_FORMAT => JSONFORMAT));
It loads the data as expected.
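As a quick illustrative check (not part of the original answer), the row loaded from the record whose sale_date was "" should now show NULL:
select sale_date, price
from sales_json_details
order by sale_date nulls first;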
When we transform the columns, MATCH_BY_COLUMN_NAME is neither needed nor supported. If the primary aim is to use MATCH_BY_COLUMN_NAME, you should define the sale_date column as VARCHAR instead:
create or replace table sales_json_details(
dimensions varchar(1000),
location varchar(1000),
price number,
sale_date varchar,
type varchar(100)
);
copy into sales_json_details from @internal_stage_json_demo
file_format = (type = json NULL_IF='' )
MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE;
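If the timestamps are still needed downstream, one possible follow-up (a sketch, not part of the original answer) is to convert the VARCHAR column on read; TRY_TO_TIMESTAMP returns NULL for empty or unparsable values:
create or replace view sales_json_details_v as
select dimensions,
location,
price,
try_to_timestamp(sale_date) as sale_date,
type
from sales_json_details;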
Related
I am copying data into a Snowflake table which has three columns: ID, DATA and ETL_LOAD_TIMESTAMP.
The ETL_LOAD_TIMESTAMP column is of type TIMESTAMP_TZ(9) and I have set its default value to CURRENT_TIMESTAMP().
I get my data from a CSV file, which looks like this:
ID, DATA
1, Dummy
I download the CSV file to a local tmpdir location and load its data into Snowflake as follows:
create_cmd = "CREATE TEMPORARY STAGE temp123 COMMENT = 'TEMPORARY STAGE FOR TEST_TABLE1 DATA LOAD'"
self.connection.execute("ALTER SESSION SET TIMEZONE = 'UTC';")
self.connection.execute(create_cmd)
self.connection.execute(f"put file://tmpdir/* #temp123 PARALLEL=8")
self.connection.execute("COPY INTO TEST_TABLE1 FROM #temp123 PURGE = TRUE FILE_FORMAT = (TYPE = 'CSV' field_delimiter = ',' FIELD_OPTIONALLY_ENCLOSED_BY = '\"' ESCAPE_UNENCLOSED_FIELD = None error_on_column_count_mismatch=false SKIP_HEADER = 1)")
I get the values of ID and Data but the ETL_LOAD_TIMESTAMP is null.
How do I modify this copy command so that I get the default value of ETL_LOAD_TIMESTAMP which is current timestamp instead of null?
You can use the DEFAULT CURRENT_TIMESTAMP() defined on the column, or an explicit TO_TIMESTAMP, as described here:
https://docs.snowflake.com/en/user-guide/data-load-transform.html#current-time-current-timestamp-default-column-values
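Following that documentation page, a minimal sketch (assuming the stage and table names from the question) is to list only the file columns in the COPY, so the omitted ETL_LOAD_TIMESTAMP column picks up its DEFAULT CURRENT_TIMESTAMP():
COPY INTO TEST_TABLE1 (ID, DATA)
FROM (SELECT $1, $2 FROM @temp123)
PURGE = TRUE
FILE_FORMAT = (TYPE = 'CSV' FIELD_DELIMITER = ',' FIELD_OPTIONALLY_ENCLOSED_BY = '"' SKIP_HEADER = 1);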
I am trying to load a CSV file into Snowflake. The sample format of the input CSV table in the S3 location is as follows (with two columns: ID, Location_count):
Input csv table
I need to transform it into the below format (with three columns: ID, Location, Count):
Output csv table
However, when I try to load the input file using the following statements after creating the database, external stage and file format, it returns LOAD_FAILED:
create or replace table table_name
(
id integer,
Location_count variant
);
select parse_json(Location_count) as c;
list @stage_name;
copy into table_name from @stage_name file_format = 'fileformatname' on_error = 'continue';
You will probably need to PARSE_JSON that second column as part of a COPY transformation. For example:
create file format myformat
type = csv field_delimiter = ','
FIELD_OPTIONALLY_ENCLOSED_BY = '"';
create or replace stage csv_stage file_format = (format_name = myformat);
copy into @csv_stage from
( select '1',
'{"SHS-TRN":654738,"PRN-UTN":78956,"NCT-JHN":96767}') ;
create or replace table blah (id integer, something variant);
copy into blah from (select $1, parse_json($2) from @csv_stage);
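Once loaded, the variant column can be queried directly, for example (using the illustrative names from the sketch above):
select id, something:"SHS-TRN"::number as shs_trn
from blah;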
I'm creating an external table using a CSV stored in Azure Data Lake Storage and populating the table using PolyBase in SQL Server.
However, I ran into the problem below, and figured it may be due to the fact that one particular column contains double quotes within the string, while the string delimiter in PolyBase has been specified as " (STRING_DELIMITER = '"').
HdfsBridge::recordReaderFillBuffer - Unexpected error encountered filling record reader buffer: HadoopExecutionException: Could not find a delimiter after string delimiter
Example:
I have done quite extensive research on this and found that the issue has been around for years, but I have yet to see any solution given.
Any help will be appreciated.
I think the easiest way to fix this up, since you are in charge of the .csv creation, is to use a delimiter which is not a comma and leave off the string delimiter. Use a separator which you know will not appear in the file. I've used a pipe in my example, and I clean up the string once it has been imported into the database.
A simple example:
IF EXISTS ( SELECT * FROM sys.external_tables WHERE name = 'delimiterWorking' )
DROP EXTERNAL TABLE delimiterWorking
GO
IF EXISTS ( SELECT * FROM sys.tables WHERE name = 'cleanedData' )
DROP TABLE cleanedData
GO
IF EXISTS ( SELECT * FROM sys.external_file_formats WHERE name = 'ff_delimiterWorking' )
DROP EXTERNAL FILE FORMAT ff_delimiterWorking
GO
CREATE EXTERNAL FILE FORMAT ff_delimiterWorking
WITH (
FORMAT_TYPE = DELIMITEDTEXT,
FORMAT_OPTIONS (
FIELD_TERMINATOR = '|',
--STRING_DELIMITER = '"',
FIRST_ROW = 2,
ENCODING = 'UTF8'
)
);
GO
CREATE EXTERNAL TABLE delimiterWorking (
id INT NOT NULL,
body VARCHAR(8000) NULL
)
WITH (
LOCATION = 'yourLake/someFolder/delimiterTest6.txt',
DATA_SOURCE = ds_azureDataLakeStore,
FILE_FORMAT = ff_delimiterWorking,
REJECT_TYPE = VALUE,
REJECT_VALUE = 0
);
GO
SELECT *
FROM delimiterWorking
GO
-- Fix up the data
CREATE TABLE cleanedData
WITH (
CLUSTERED COLUMNSTORE INDEX,
DISTRIBUTION = ROUND_ROBIN
)
AS
SELECT
id,
body AS originalCol,
SUBSTRING ( body, 2, LEN(body) - 2 ) cleanBody
FROM delimiterWorking
GO
SELECT *
FROM cleanedData
My results:
The string delimiter issue can be avoided if you convert the Data Lake flat file to Parquet format.
Input:
"ID"
"NAME"
"COMMENTS"
"1"
"DAVE"
"Hi "I am Dave" from"
"2"
"AARO"
"AARO"
Steps:
1. Convert the flat file to Parquet format [using Azure Data Factory]
2. Create an external file format in the data lake [assuming the master key and scoped credentials are available]:
CREATE EXTERNAL FILE FORMAT PARQUET_CONV
WITH (FORMAT_TYPE = PARQUET,
DATA_COMPRESSION = 'org.apache.hadoop.io.compress.SnappyCodec'
);
3. Create an external table with FILE_FORMAT = PARQUET_CONV, as sketched below.
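A sketch of step 3, reusing the data source name from the earlier answer and an illustrative location; adjust the names to your environment:
CREATE EXTERNAL TABLE commentsParquet (
id INT,
name VARCHAR(100),
comments VARCHAR(8000)
)
WITH (
LOCATION = 'yourLake/someFolder/parquetOutput/',
DATA_SOURCE = ds_azureDataLakeStore,
FILE_FORMAT = PARQUET_CONV
);
GO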
Output:
I believe this is the best option, as Microsoft currently doesn't have a solution for handling a string delimiter occurring within the data for external tables.
I use the Snowflake COPY command below, which returns a file with JSON content.
copy into @elasticsearch/product/sf_index
from (select object_construct('id', id, 'alpha', alpha) from table limit 1)
file_format = (type = json COMPRESSION = NONE) overwrite = TRUE single = TRUE max_file_size = 5368709120;
The data is:
id alpha
1 null
The output file is:
{
"id" :1
}
but I need the null values to be kept:
{
"id" : 1,
"alpha" : null
}
You can use the function OBJECT_CONSTRUCT_KEEP_NULL.
Documentation: https://docs.snowflake.com/en/sql-reference/functions/object_construct_keep_null.html
Example:
select OBJECT_CONSTRUCT_KEEP_NULL('id', id, 'alpha', alpha)
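Applied to the COPY from the question, that would look roughly like this (same stage path and options as above):
copy into @elasticsearch/product/sf_index
from (select object_construct_keep_null('id', id, 'alpha', alpha) from table limit 1)
file_format = (type = json COMPRESSION = NONE) overwrite = TRUE single = TRUE max_file_size = 5368709120;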
Would it be possible for you to check programmatically if the value is null, and if it is, use the below?
select object_construct('id',1,'alpha',parse_json('null'));
Per Snowflake documentation:
If the key or value is NULL (i.e. SQL NULL), the key-value pair will be omitted from the resulting object. A key-value pair consisting of a not-null string as key and a JSON NULL as value (i.e. PARSE_JSON('NULL')) will not be omitted.
The other option is to just send it without the null attribute to Elasticsearch and then take care of it on retrieval from Elasticsearch.
How about this:
select object_construct('id',id, 'alpha', case when alpha is not null then alpha else 'null' end )from table limit 1;
CASE should be supported by the COPY command.
"null" is valid in a JSON document, as per this SO question:
Is null valid JSON (4 bytes, nothing else)
OK, another possible way is to use UNION:
select object_construct('id',id, 'alpha', parse_json('NULL') )from table where alpha is null
union
select object_construct('id',id, 'alpha', alpha )from table where alpha is not null;
select object_construct('id', id,'alpha', IFNULL(alpha, PARSE_JSON('null'))) from table limit 1
Use IFNULL to check if the value is null and replace it with JSON null.
I have a table with a JSON text field:
create table breaches(breach_id int, detail text);
insert into breaches values
( 1,'[{"breachedState": null},
{"breachedState": "PROCESS_APPLICATION",}]')
I'm trying to use MSSQL's built-in JSON parsing functions to test whether ANY object in a JSON array has a matching member value.
If the detail field was a single JSON object, I could use:
select * from breaches
where JSON_VALUE(detail,'$.breachedState') = 'PROCESS_APPLICATION'
but it's an Array, and I want to know if ANY Object has breachedState = 'PROCESS_APPLICATION'
Is this possible using MSSQL's JSON functions?
You can use the OPENJSON function to check each object; try this query:
select * from breaches
where exists
(
select *
from
OPENJSON (detail) d
where JSON_VALUE(value,'$.breachedState') = 'PROCESS_APPLICATION'
)
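If you prefer, the same check can be written with an OPENJSON WITH clause and CROSS APPLY (a variant of the query above, not from the original answer):
select b.*
from breaches b
cross apply openjson(b.detail)
with (breachedState varchar(50) '$.breachedState') j
where j.breachedState = 'PROCESS_APPLICATION'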
By the way, there is an extra "," in your insert query; it should be:
insert into breaches values
( 1,'[{"breachedState": null},
{"breachedState": "PROCESS_APPLICATION"}]')