select max(primary_key) bad performance in clickhouse with partition by toYearWeek - query-optimization

The following ClickHouse table is partitioned by week of the year, PARTITION BY toYearWeek(receive_time_utc), and ordered by a primary key whose first element is the time column: ORDER BY (receive_time_utc, seqno, rowno).
However, SELECT max(receive_time_utc) FROM roq_bbo_okx seems to do a full scan of the table instead of reading only the last partition.
Could it be that the optimizer does not understand the toYearWeek function? I have also seen people using the toYYYYMM function and intDiv.
CREATE TABLE default.roq_bbo_okx
(
`gateway` LowCardinality(String),
`session_id` UUID CODEC(LZ4),
`seqno` UInt64 CODEC(DoubleDelta, LZ4),
`rowno` UInt32 DEFAULT 0 CODEC(DoubleDelta, LZ4),
`receive_time_utc` DateTime64(9, 'UTC') CODEC(DoubleDelta, LZ4),
`exchange_time_utc` DateTime64(9, 'UTC') CODEC(DoubleDelta, LZ4),
`origin_create_time_utc` DateTime64(9, 'UTC') DEFAULT receive_time_utc CODEC(DoubleDelta, LZ4),
`snapshot` Int8 CODEC(LZ4),
`exchange` LowCardinality(String) CODEC(LZ4),
`symbol` LowCardinality(String) CODEC(LZ4),
`type` Enum8('UNDEFINED' = 0, 'BID' = 48, 'ASK' = 49, 'TRADE' = 50, 'IMPLIED_BID' = 69, 'IMPLIED_ASK' = 70, 'BOOK_RESET' = 74) DEFAULT 'UNDEFINED' CODEC(LZ4),
`action` Enum8('UNDEFINED' = 0, 'NEW' = 48, 'CHANGE' = 49, 'DELETE' = 50) DEFAULT 'UNDEFINED' CODEC(LZ4),
`side` Enum8('UNDEFINED' = 0, 'BUY' = 48, 'SELL' = 49) DEFAULT 'UNDEFINED' CODEC(LZ4),
`price_tick` Float64 DEFAULT toFloat64(nan) CODEC(Gorilla, LZ4),
`price_i` Int64 DEFAULT -9223372036854775808 CODEC(DoubleDelta, LZ4),
`qty_tick` Float64 DEFAULT toFloat64(nan) CODEC(Gorilla, LZ4),
`qty_i` Int64 DEFAULT -9223372036854775808 CODEC(DoubleDelta, LZ4),
`posno` UInt32 DEFAULT 0 CODEC(DoubleDelta, LZ4),
`num_orders` UInt16 CODEC(Gorilla, LZ4),
`trade_id` String CODEC(LZ4)
)
ENGINE = MergeTree
PARTITION BY toYearWeek(receive_time_utc)
PRIMARY KEY (receive_time_utc, seqno, rowno)
ORDER BY (receive_time_utc, seqno, rowno)
SETTINGS index_granularity = 8192

There is no such optimization yet. CH is unable to use the primary index for max(). In some simple cases CH is able to use a virtual projection based on partition.idx (the partition columns).
You can do the optimization manually with ORDER BY ... DESC LIMIT 1:
SELECT receive_time_utc
FROM roq_bbo_okx
ORDER BY receive_time_utc DESC
LIMIT 1
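If the table is guaranteed to hold recent data, another workaround is to add a filter on the partition key column so that partition pruning kicks in; ClickHouse keeps min/max values of the partitioning columns per part, so old partitions get skipped. This is only a sketch and assumes rows newer than 14 days always exist, otherwise the result could differ from a plain max():
-- Sketch: the WHERE clause lets ClickHouse prune old parts via the min/max
-- values stored for the partition key column before aggregating.
SELECT max(receive_time_utc)
FROM roq_bbo_okx
WHERE receive_time_utc >= now() - INTERVAL 14 DAY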

Related

Apache Flink issue with JOINs on Kinesis Streams: "Rowtime attributes must not be in the input rows of a regular join"

I am attempting a simple exercise.
I have two Kinesis data streams:
order-stream
shipment-stream
SQL 1: Orders
%flink.ssql
CREATE TABLE orders (
orderid VARCHAR(6),
orders VARCHAR,
ts TIMESTAMP(3),
WATERMARK FOR ts AS ts - INTERVAL '5' SECOND
)
WITH (
'connector' = 'kinesis',
'stream' = 'order-stream',
'aws.region' = 'us-east-1',
'scan.stream.initpos' = 'TRIM_HORIZON',
'format' = 'json',
'json.timestamp-format.standard' = 'ISO-8601'
);
SQL 2: Shipment
CREATE TABLE shipment (
orderid VARCHAR(6),
shipments VARCHAR(6),
ts TIMESTAMP(3),
WATERMARK FOR ts AS ts - INTERVAL '5' SECOND
)
WITH (
'connector' = 'kinesis',
'stream' = 'shipment-stream',
'aws.region' = 'us-east-1',
'scan.stream.initpos' = 'TRIM_HORIZON',
'format' = 'json',
'json.timestamp-format.standard' = 'ISO-8601'
);
Generating Fake Data into Kinesis Via Python
try:
    import datetime
    import json
    import random
    import boto3
    import os
    import uuid
    import time
    from dotenv import load_dotenv
    load_dotenv(".env")
except Exception as e:
    pass
STREAM_NAME_Order = "order-stream"
STREAM_NAME_Shipments = "shipment-stream"
def send_data(kinesis_client):
    order_items_number = random.randrange(1, 10000)
    order_items = {
        "orderid": order_items_number,
        "orders": "1",
        'ts': datetime.datetime.now().isoformat()
    }
    shipping_data = {
        "orderid": order_items_number,
        "shipments": random.randrange(1, 10000),
        'ts': datetime.datetime.now().isoformat()
    }
    partition_key = uuid.uuid4().__str__()
    res = kinesis_client.put_record(
        StreamName=STREAM_NAME_Order,
        Data=json.dumps(order_items),
        PartitionKey=partition_key)
    print(res)
    time.sleep(2)
    res = kinesis_client.put_record(
        StreamName=STREAM_NAME_Shipments,
        Data=json.dumps(shipping_data),
        PartitionKey=partition_key)
    print(res)
if __name__ == '__main__':
    kinesis_client = boto3.client('kinesis',
                                  aws_access_key_id=os.getenv("DEV_ACCESS_KEY"),
                                  aws_secret_access_key=os.getenv("DEV_SECRET_KEY"),
                                  region_name="us-east-1",
                                  )
    for i in range(1, 10):
        send_data(kinesis_client)
%flink.ssql(type=update)
SELECT DISTINCT oo.orderid , TUMBLE_START(oo.ts, INTERVAL '10' MINUTE) as event_time
FROM orders as oo
GROUP BY orderid , TUMBLE(oo.ts, INTERVAL '10' MINUTE);
Issue with joining
%flink.ssql(type=update)
SELECT DISTINCT oo.orderid , TUMBLE_START(oo.ts, INTERVAL '10' MINUTE) as event_time , ss.shipments
FROM orders as oo
JOIN shipment AS ss ON oo.orderid = ss.orderid
GROUP BY oo.orderid , TUMBLE(oo.ts, INTERVAL '10' MINUTE) , ss.shipments
Error Messages
TableException: Rowtime attributes must not be in the input rows of a regular join. As a workaround you can cast the time attributes of input tables to TIMESTAMP before.
java.io.IOException: Fail to run stream sql job
at org.apache.zeppelin.flink.sql.AbstractStreamSqlJob.run(AbstractStreamSqlJob.java:172)
at org.apache.zeppelin.flink.sql.AbstractStreamSqlJob.run(AbstractStreamSqlJob.java:105)
at org.apache.zeppelin.flink.FlinkStreamSqlInterpreter.callInnerSelect(FlinkStreamSqlInterpreter.java:89)
at org.apache.zeppelin.flink.FlinkSqlInterrpeter.callSelect(FlinkSqlInterrpeter.java:503)
at org.apache.zeppelin.flink.FlinkSqlInterrpeter.callCommand(FlinkSqlInterrpeter.java:266)
at org.apache.zeppelin.flink.FlinkSqlInterrpeter.runSqlList(FlinkSqlInterrpeter.java:160)
at org.apache.zeppelin.flink.FlinkSqlInterrpeter.internalInterpret(FlinkSqlInterrpeter.java:112)
at org.apache.zeppelin.interpreter.AbstractInterpreter.interpret(AbstractInterpreter.java:47)
at org.apache.zeppelin.interpreter.LazyOpenInterpreter.interpret(LazyOpenInterpreter.java:110)
at org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:852)
at org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:744)
at org.apache.zeppelin.scheduler.Job.run(Job.java:172)
at org.apache.zeppelin.scheduler.AbstractScheduler.runJob(AbstractScheduler.java:132)
at org.apache.zeppelin.scheduler.ParallelScheduler.lambda$runJobInScheduler$0(ParallelScheduler.java:46)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.flink.table.api.TableException: Cannot generate a valid execution plan for the given query:
Also tried
%flink.ssql(type=update)
SELECT DISTINCT oo.orderid ,
TUMBLE_START( oo.ts, INTERVAL '1' MINUTE),
ss.shipments
FROM orders as oo
JOIN shipment AS ss ON oo.orderid = ss.orderid
GROUP BY oo.orderid ,
TUMBLE(CAST(oo.ts AS TIME) ,INTERVAL '1' MINUTE) ,
ss.shipments
Error Message :
SQL validation failed. From line 2, column 17 to line 2, column 57: Call to auxiliary group function 'TUMBLE_START' must have matching call to group function '$TUMBLE' in GROUP BY clause
I am not sure what exactly needs to be done here; any help would be great. Looking forward to hearing back from an expert.
TableException: Rowtime attributes must not be in the input rows of a regular join.
As a workaround you can cast the time attributes of input tables to TIMESTAMP before.
It is not possible for a regular join to have time attributes in its results, because the time attribute could not be well-defined. This is because the rows in a dynamic table must be at least roughly ordered by the time attribute, and there's no way to guarantee this for the result of a regular join (as opposed to an interval join, temporal join, or lookup join).
In versions of Flink before 1.14, the implementation dealt with this by not allowing regular joins to have time attributes in the input tables. While this avoided the problem, it was overly restrictive.
In your case I suggest you rewrite the join as an interval join, so that the output of the join will have a time attribute, making it possible to apply windowing to it.
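For example, a minimal sketch of such an interval join (the assumption that a shipment arrives within one hour of its order is mine, purely for illustration):
-- The BETWEEN condition on ss.ts makes this an interval join, so oo.ts stays a
-- time attribute in the join output and the TUMBLE window can still be applied.
SELECT oo.orderid ,
       TUMBLE_START(oo.ts, INTERVAL '10' MINUTE) AS event_time ,
       ss.shipments
FROM orders AS oo
JOIN shipment AS ss
  ON oo.orderid = ss.orderid
  AND ss.ts BETWEEN oo.ts AND oo.ts + INTERVAL '1' HOUR
GROUP BY oo.orderid , TUMBLE(oo.ts, INTERVAL '10' MINUTE) , ss.shipments;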
In the second query, I'm not 100% sure what the problem is, but I suspect it is that TUMBLE_START uses oo.ts while the GROUP BY uses CAST(oo.ts AS TIME). I think the two expressions need to be the same; Flink's SQL planner isn't smart enough to figure out what's going on here.
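Concretely, once the join is rewritten as an interval join, keeping both TUMBLE_START and TUMBLE on the same expression (plain oo.ts, no CAST) should satisfy the validator. Roughly (again, the one-hour bound is only an assumption for illustration):
SELECT oo.orderid ,
       TUMBLE_START(oo.ts, INTERVAL '1' MINUTE) ,
       ss.shipments
FROM orders AS oo
JOIN shipment AS ss
  ON oo.orderid = ss.orderid
  AND ss.ts BETWEEN oo.ts AND oo.ts + INTERVAL '1' HOUR
GROUP BY oo.orderid ,
         TUMBLE(oo.ts, INTERVAL '1' MINUTE) ,
         ss.shipments;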

Snowflake external table partition by "Defining expression for partition column year is invalid"

I have a Parquet asset in S3 and wish to make an external table from this asset.
The asset is partitioned by year, month, day and hour.
My DDL is below:
CREATE OR REPLACE external TABLE abc (
"year" int as (value:"partition_0"::int),
"month" int as (value:"partition_1"::int),
"day" int as (value:"partition_2"::int),
"hour" int as (value:"partition_3"::int),
"partition_key" varchar as (METADATA$EXTERNAL_TABLE_PARTITION)
)
PARTITION BY ("year", "month", "day", "hour")
PARTITION_TYPE = USER_SPECIFIED
WITH location = @abc
auto_refresh = true
file_format = (type = parquet);
When I try to partition as shown below, I get this error:
PARTITION BY ("year", "month", "day", "hour")
>>>Error: Defining expression for partition column year is invalid.
When I try to partition by partition_key as below, I don't get an error, but the external table is now empty
PARTITION BY ("partition_key")
>>> empty table
Anyone know what's going on here and how I can rectify this?
OK, so using the CITIBIKE data, which has Parquet files in an external AWS stage in us-east-1:
s3://snowflake-workshop-lab/citibike-trips-parquet/
I can recreate the error using the filename parts as year/month/day:
create or replace external table cb (
trip_id int as (value:TRIPID::int),
filename char as (metadata$FILENAME),
year int as (split_part(metadata$FILENAME,'/', 2)::int),
month int as (split_part(metadata$FILENAME,'/', 3)::int),
day int as (split_part(metadata$FILENAME,'/', 4)::int)
)
partition by (year, month, day)
partition_type = USER_SPECIFIED
with location = @CITIBIKE_TRIPS_PARQUET
auto_refresh = true
file_format = (type=parquet);
Defining expression for partition column YEAR is invalid.
If I comment out:
--partition by (year, month, day)
--partition_type = USER_SPECIFIED
It creates and I can read rows:
select * from cb limit 2;
VALUE | TRIP_ID | FILENAME | YEAR | MONTH | DAY
{ "BIKEID": "2013-268", ... | 813124 | citibike-trips-parquet/2013/06/10/data_01a19496-0601-8b21-003d-9b03003c624a_1106_0_0.snappy.parquet | 2,013 | 6 | 10
{ "BIKEID": "2013-220", ... | 813161 | citibike-trips-parquet/2013/06/10/data_01a19496-0601-8b21-003d-9b03003c624a_1106_0_0.snappy.parquet | 2,013 | 6 | 10
Reading the create-external-table docs for the partitioning parameters, the AWS example pulls the path apart, but turns it back into a single date field:
create external table et1(
date_part date as to_date(split_part(metadata$filename, '/', 3)
|| '/' || split_part(metadata$filename, '/', 4)
|| '/' || split_part(metadata$filename, '/', 5), 'YYYY/MM/DD'),
timestamp bigint as (value:timestamp::bigint),
col2 varchar as (value:col2::varchar))
partition by (date_part)
thus:
create or replace external table cb (
trip_id int as (value:TRIPID::int),
filename char as (metadata$FILENAME),
date_part date as to_date(
split_part(metadata$FILENAME,'/', 2) || '/' ||
split_part(metadata$FILENAME,'/', 3) || '/' ||
split_part(metadata$FILENAME,'/', 4), 'YYYY/MM/DD')
)
partition by (date_part)
--partition_type = USER_SPECIFIED
with location = @CITIBIKE_TRIPS_PARQUET
auto_refresh = true
file_format = (type=parquet);
and then add a filter to my query:
select * from cb where date_part > '2020-04-01' limit 2;
and the query profile shows that only a limited set of partitions were scanned (as one would expect).
if I add partition_type = USER_SPECIFIED back into the code I get the error:
Defining expression for partition column DATE_PART is invalid.
which makes me think the original example would have worked if this was dropped as well, and testing shows that it does:
create or replace external table cb (
trip_id int as (value:TRIPID::int),
filename char as (metadata$FILENAME),
year int as (split_part(metadata$FILENAME,'/', 2)::int),
month int as (split_part(metadata$FILENAME,'/', 3)::int),
day int as (split_part(metadata$FILENAME,'/', 4)::int)
)
partition by (year, month, day)
--partition_type = USER_SPECIFIED
with location = @CITIBIKE_TRIPS_PARQUET
auto_refresh = true
file_format = (type=parquet);
select * from cb where year = 2018 limit 2;
TRIP_ID | FILENAME | YEAR | MONTH | DAY
145400 | citibike-trips-parquet/2018/01/06/data_01a19496-0601-8b21-003d-9b03003c624a_2906_6_0.snappy.parquet | 2,018 | 1 | 6
145545 | citibike-trips-parquet/2018/01/06/data_01a19496-0601-8b21-003d-9b03003c624a_2906_6_0.snappy.parquet | 2,018 | 1 | 6
So reading the docs again, after all that:
Defines the partition type for the external table as user-defined. The owner of the external table (i.e. the role that has the OWNERSHIP privilege on the external table) must add partitions to the external metadata manually by executing ALTER EXTERNAL TABLE … ADD PARTITION statements.
Do not set this parameter if partitions are added to the external table metadata automatically upon evaluation of expressions in the partition columns.
I suspect the first partition by example is exactly what this note is referring to with its "don't use this if you did that".
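For completeness, here is a rough, untested sketch of what partition_type = USER_SPECIFIED seems to be intended for, based on those docs: the partition columns parse METADATA$EXTERNAL_TABLE_PARTITION and the owner registers every partition by hand (the table name, partition values and location below are made up for illustration):
create or replace external table cb_manual (
trip_id int as (value:TRIPID::int),
year int as (parse_json(metadata$external_table_partition):YEAR::int),
month int as (parse_json(metadata$external_table_partition):MONTH::int)
)
partition by (year, month)
partition_type = USER_SPECIFIED
with location = @CITIBIKE_TRIPS_PARQUET
file_format = (type = parquet);
-- auto_refresh is omitted; I believe it cannot be combined with user-specified partitions.
-- Each partition must then be registered manually:
alter external table cb_manual add partition (year = '2018', month = '01') location '2018/01/';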

The relation between the structure of a compound primary key and query performance in Cassandra

I have an application similar to a weather-recording application, represented as follows:
(cityID, sensorID, StartReadingTime, EndReadingTime, AverageValue)
Each city (cityID) has many sensors (sensorID) which read temperature values.
I have a composite key on (cityID, sensorID, StartReadingTime).
My application has three main queries:
1- Basic selection (Key lookup)
e.g : SELECT * FROM weather WHERE cityID = ? AND sensorID= ? AND StartReadingTime = ? ;
2- Range search
e.g: SELECT * FROM weather WHERE AverageValue > ? AND AverageValue < ?
3- Aggregation with range search
e.g: SELECT count(*) FROM weather WHERE AverageValue > ? AND AverageValue < ?
I created the table using the Cassandra CQL shell with the following primary key:
PRIMARY KEY ((cityID, sensorID, StartReadingTime), AverageValue)
This is the only primary key combination I found with which I can run all my queries without errors.
With this primary key structure, Cassandra partitions the data by the first element of the PRIMARY KEY, which in this case is (cityID, sensorID, StartReadingTime), and clusters the data within each partition by AverageValue. Note that I added AverageValue to the primary key so I can use the greater-than and less-than operators.
My problem with this structure is that when I run the range-search and aggregation queries, performance is very slow compared with MySQL. That is because (as I understand it) Cassandra does a full scan across all partitions to get the results, given this primary key structure. I also tried creating a secondary index on the AverageValue column, but saw no performance improvement.
My question: where is the problem with this combination of primary, partition and clustering keys? Are there any suggestions for getting benefit from secondary indexing?
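For reference, with this key the range search can only be expressed with ALLOW FILTERING, which (as far as I understand) is exactly what forces the full scan; a sketch of how the query has to be written against the table described below:
SELECT * FROM weather WHERE AverageValue > ? AND AverageValue < ? ALLOW FILTERING;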
The describe of the table:
CREATE TABLE quote.weather (
cityid int,
sensorid int,
startreadingtime double,
averagevalue double,
endreadingtime double,
PRIMARY KEY ((cityid, sensorid, startreadingtime), averagevalue)
) WITH CLUSTERING ORDER BY (averagevalue ASC)
AND bloom_filter_fp_chance = 0.01
AND caching = '{"keys":"ALL", "rows_per_partition":"NONE"}'
AND comment = ''
AND compaction = {'min_threshold': '4', 'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32'}
AND compression = {'sstable_compression': 'org.apache.cassandra.io.compress.LZ4Compressor'}
AND dclocal_read_repair_chance = 0.1
AND default_time_to_live = 0
AND gc_grace_seconds = 864000
AND max_index_interval = 2048
AND memtable_flush_period_in_ms = 0
AND min_index_interval = 128
AND read_repair_chance = 0.0
AND speculative_retry = '99.0PERCENTILE';
CREATE INDEX av_index ON quote.weather (averagevalue);

auto increment ID in H2 database

Is there a way to have an auto-incrementing BIGINT ID for a table?
It can be defined like so:
id bigint auto_increment
but that has no effect (it does not increment automatically).
I would like to insert all fields but the ID field - the ID field should be provided by the DBMS.
Or do I need to call something to increment the ID counter?
It works for me. JDBC URL: jdbc:h2:~/temp/test2
drop table test;
create table test(id bigint auto_increment, name varchar(255));
insert into test(name) values('hello');
insert into test(name) values('world');
select * from test;
result:
ID NAME
1 hello
2 world
IDENTITY
The modern approach uses the IDENTITY type, for automatically generating an incrementing 64-bit long integer.
This single-word syntax used in H2 is an abbreviated variation of GENERATED … AS IDENTITY defined in the SQL:2003 standard. See summary in PDF document SQL:2003 Has Been Published. Other databases are implementing this, such as Postgres.
CREATE TABLE event_
(
pkey_ IDENTITY NOT NULL PRIMARY KEY , -- ⬅ `identity` = auto-incrementing long integer.
name_ VARCHAR NOT NULL ,
start_ TIMESTAMP WITH TIME ZONE NOT NULL ,
duration_ VARCHAR NOT NULL
)
;
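For reference, a sketch of the longer SQL-standard spelling that this IDENTITY shorthand abbreviates; recent versions of H2 accept this form as well (the table name and the BIGINT type here are my own choices for illustration):
CREATE TABLE event_standard_
(
pkey_ BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY , -- standard-SQL equivalent of H2's `IDENTITY` shorthand
name_ VARCHAR NOT NULL
)
;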
Example usage. No need to pass a value for our pkey_ column, as it is generated automatically by H2.
INSERT INTO event_ ( name_ , start_ , duration_ )
VALUES ( ? , ? , ? )
;
And Java.
ZoneId z = ZoneId.of( "America/Montreal" ) ;
OffsetDateTime start = ZonedDateTime.of( 2021 , Month.JANUARY , 23 , 19 , 0 , 0 , 0 , z ).toOffsetDateTime() ;
Duration duration = Duration.ofHours( 2 ) ;
myPreparedStatement.setString( 1 , "Java User Group" ) ;
myPreparedStatement.setObject( 2 , start ) ;
myPreparedStatement.setString( 3 , duration.toString() ) ;
Returning generated keys
Statement.RETURN_GENERATED_KEYS
You can capture the value generated during that insert command execution. Two steps are needed. First, pass the flag Statement.RETURN_GENERATED_KEYS when getting your prepared statement.
PreparedStatement pstmt = conn.prepareStatement( sql , Statement.RETURN_GENERATED_KEYS ) ;
Statement::getGeneratedKeys
Second step is to call Statement::getGeneratedKeys after executing your prepared statement. You get a ResultSet whose rows are the identifiers generated for the created row(s).
Example app
Here is an entire example app. Running on Java 14 with Text Blocks preview feature enabled for fun. Using H2 version 1.4.200.
package work.basil.example;

import org.h2.jdbcx.JdbcDataSource;

import java.sql.*;
import java.time.*;
import java.util.Objects;

public class H2ExampleIdentity
{
    public static void main ( String[] args )
    {
        H2ExampleIdentity app = new H2ExampleIdentity();
        app.doIt();
    }

    private void doIt ( )
    {
        JdbcDataSource dataSource = Objects.requireNonNull( new JdbcDataSource() ); // Implementation of `DataSource` bundled with H2.
        dataSource.setURL( "jdbc:h2:mem:h2_identity_example_db;DB_CLOSE_DELAY=-1" ); // Set `DB_CLOSE_DELAY` to `-1` to keep in-memory database in existence after connection closes.
        dataSource.setUser( "scott" );
        dataSource.setPassword( "tiger" );

        String sql = null;

        try (
                Connection conn = dataSource.getConnection() ;
        )
        {
            sql = """
                  CREATE TABLE event_
                  (
                      id_ IDENTITY NOT NULL PRIMARY KEY, -- ⬅ `identity` = auto-incrementing integer number.
                      title_ VARCHAR NOT NULL ,
                      start_ TIMESTAMP WITHOUT TIME ZONE NOT NULL ,
                      duration_ VARCHAR NOT NULL
                  )
                  ;
                  """;
            System.out.println( "sql: \n" + sql );
            try ( Statement stmt = conn.createStatement() ; )
            {
                stmt.execute( sql );
            }

            // Insert row.
            sql = """
                  INSERT INTO event_ ( title_ , start_ , duration_ )
                  VALUES ( ? , ? , ? )
                  ;
                  """;
            try (
                    PreparedStatement pstmt = conn.prepareStatement( sql , Statement.RETURN_GENERATED_KEYS ) ;
            )
            {
                ZoneId z = ZoneId.of( "America/Montreal" );
                ZonedDateTime start = ZonedDateTime.of( 2021 , 1 , 23 , 19 , 0 , 0 , 0 , z );
                Duration duration = Duration.ofHours( 2 );
                pstmt.setString( 1 , "Java User Group" );
                pstmt.setObject( 2 , start.toOffsetDateTime() );
                pstmt.setString( 3 , duration.toString() );
                pstmt.executeUpdate();
                try (
                        ResultSet rs = pstmt.getGeneratedKeys() ;
                )
                {
                    while ( rs.next() )
                    {
                        int id = rs.getInt( 1 );
                        System.out.println( "generated key: " + id );
                    }
                }
            }

            // Query all.
            sql = "SELECT * FROM event_ ;";
            try (
                    Statement stmt = conn.createStatement() ;
                    ResultSet rs = stmt.executeQuery( sql ) ;
            )
            {
                while ( rs.next() )
                {
                    // Retrieve by column name.
                    int id = rs.getInt( "id_" );
                    String title = rs.getString( "title_" );
                    OffsetDateTime odt = rs.getObject( "start_" , OffsetDateTime.class ); // Ditto, pass class for type-safety.
                    Instant instant = odt.toInstant(); // If you want to see the moment in UTC.
                    Duration duration = Duration.parse( rs.getString( "duration_" ) );

                    // Display values.
                    ZoneId z = ZoneId.of( "America/Montreal" );
                    System.out.println( "id_" + id + " | start_: " + odt + " | duration: " + duration + " ➙ running from: " + odt.atZoneSameInstant( z ) + " to: " + odt.plus( duration ).atZoneSameInstant( z ) );
                }
            }
        }
        catch ( SQLException e )
        {
            e.printStackTrace();
        }
    }
}
Next, see results when run.
Instant, OffsetDateTime, & ZonedDateTime
At the time of this execution, my JVM’s current default time zone is America/Los_Angeles. At the point in time of the stored moment (January 23, 2021 at 7 PM in Québec), the zone America/Los_Angeles had an offset-from-UTC of eight hours behind. So the OffsetDateTime object returned by the H2 JDBC driver is set to an offset of -08:00. This is a distraction really, so in real work I would immediately convert that OffsetDateTime to either an Instant for UTC or a ZonedDateTime for a specific time zone I had in mind. Be clear in understanding that the Instant, OffsetDateTime, and ZonedDateTime objects would all represent the same simultaneous moment, the same point on the timeline. Each views that same moment through a different wall-clock time. Imagine 3 people in California, Québec, and Iceland (whose zone is UTC, an offset of zero) all talking in a conference call, and they each looked up at the clock on their respective wall at the same coincidental moment.
generated key: 1
id_1 | start_: 2021-01-23T16:00-08:00 | duration: PT2H ➙ running from: 2021-01-23T19:00-05:00[America/Montreal] to: 2021-01-23T21:00-05:00[America/Montreal]
By the way, in real work on an app booking future appointments, we would use a different data type in Java and in the database.
We would have used LocalDateTime and ZoneId in Java. In the database, we would have used a data type akin to the SQL standard type TIMESTAMP WITHOUT TIME ZONE with a second column for the name of the intended time zone. When retrieving values from the database to build a scheduling calendar, we would apply the time zone to the stored date-time to get a ZonedDateTime object. This would allow us to book appointments for a certain time-of-day regardless of changes to the offset-from-UTC made by the politicians in that jurisdiction.
Very simple:
id int auto_increment primary key
H2 will create a Sequence object automatically.
You can also use default:
create table if not exists my(id int auto_increment primary key,s text);
insert into my values(default,'foo');
id bigint(size) zerofill not null auto_increment,

SQL Server GUID sort algorithm. Why?

Problem with UniqueIdentifiers
We have an existing database which uses uniqueidentifiers extensively (unfortunately!) both as primary keys and some nullable columns of some tables. We came across a situation where some reports that run on these tables sort on these uniqueidentifiers because there is no other column in the table that would give a meaningful sort (isn't that ironic!). The intent was to sort so that it shows the items in the order they were inserted but they were not inserted using NewSequentialId() - hence a waste of time.
Fact about the Sort Algorithm
Anyway, SQL Server sorts uniqueidentifiers based on byte groups, starting from the trailing 5th byte group (6 bytes) and moving towards the 1st byte group (4 bytes), reversing the within-group byte order from right-to-left to left-to-right at the 3rd byte group (2 bytes).
My Question
I was curious to know if there is any real life situation that this kind of sort helps at all.
How does SQL Server store the uniqueidentifier internally which might provide insight on
why it has this whacky sort algorithm?
Reference:
Alberto Ferrari's discovery of the SQL Server GUID sort
Example
Uniqueidentifiers are sorted as shown below when you use an ORDER BY on a uniqueidentifier column containing the data below.
Please note that the data below is sorted ascending, and the highest sort precedence goes from the 5th byte group towards the 1st byte group (backwards).
-- 1st byte group of 4 bytes sorted in the reverse (left-to-right) order below --
01000000-0000-0000-0000-000000000000
10000000-0000-0000-0000-000000000000
00010000-0000-0000-0000-000000000000
00100000-0000-0000-0000-000000000000
00000100-0000-0000-0000-000000000000
00001000-0000-0000-0000-000000000000
00000001-0000-0000-0000-000000000000
00000010-0000-0000-0000-000000000000
-- 2nd byte group of 2 bytes sorted in the reverse (left-to-right) order below --
00000000-0100-0000-0000-000000000000
00000000-1000-0000-0000-000000000000
00000000-0001-0000-0000-000000000000
00000000-0010-0000-0000-000000000000
-- 3rd byte group of 2 bytes sorted in the reverse (left-to-right) order below --
00000000-0000-0100-0000-000000000000
00000000-0000-1000-0000-000000000000
00000000-0000-0001-0000-000000000000
00000000-0000-0010-0000-000000000000
-- 4th byte group of 2 bytes sorted in the straight (right-to-left) order below --
00000000-0000-0000-0001-000000000000
00000000-0000-0000-0010-000000000000
00000000-0000-0000-0100-000000000000
00000000-0000-0000-1000-000000000000
-- 5th byte group of 6 bytes sorted in the straight (right-to-left) order below --
00000000-0000-0000-0000-000000000001
00000000-0000-0000-0000-000000000010
00000000-0000-0000-0000-000000000100
00000000-0000-0000-0000-000000001000
00000000-0000-0000-0000-000000010000
00000000-0000-0000-0000-000000100000
00000000-0000-0000-0000-000001000000
00000000-0000-0000-0000-000010000000
00000000-0000-0000-0000-000100000000
00000000-0000-0000-0000-001000000000
00000000-0000-0000-0000-010000000000
00000000-0000-0000-0000-100000000000
Code:
Alberto's code extended to denote that sorting is on the bytes and not on the individual bits.
With Test_UIDs As (-- 0 1 2 3 4 5 6 7 8 9 A B C D E F
Select ID = 1, UID = cast ('00000000-0000-0000-0000-100000000000' as uniqueidentifier)
Union Select ID = 2, UID = cast ('00000000-0000-0000-0000-010000000000' as uniqueidentifier)
Union Select ID = 3, UID = cast ('00000000-0000-0000-0000-001000000000' as uniqueidentifier)
Union Select ID = 4, UID = cast ('00000000-0000-0000-0000-000100000000' as uniqueidentifier)
Union Select ID = 5, UID = cast ('00000000-0000-0000-0000-000010000000' as uniqueidentifier)
Union Select ID = 6, UID = cast ('00000000-0000-0000-0000-000001000000' as uniqueidentifier)
Union Select ID = 7, UID = cast ('00000000-0000-0000-0000-000000100000' as uniqueidentifier)
Union Select ID = 8, UID = cast ('00000000-0000-0000-0000-000000010000' as uniqueidentifier)
Union Select ID = 9, UID = cast ('00000000-0000-0000-0000-000000001000' as uniqueidentifier)
Union Select ID = 10, UID = cast ('00000000-0000-0000-0000-000000000100' as uniqueidentifier)
Union Select ID = 11, UID = cast ('00000000-0000-0000-0000-000000000010' as uniqueidentifier)
Union Select ID = 12, UID = cast ('00000000-0000-0000-0000-000000000001' as uniqueidentifier)
Union Select ID = 13, UID = cast ('00000000-0000-0000-0001-000000000000' as uniqueidentifier)
Union Select ID = 14, UID = cast ('00000000-0000-0000-0010-000000000000' as uniqueidentifier)
Union Select ID = 15, UID = cast ('00000000-0000-0000-0100-000000000000' as uniqueidentifier)
Union Select ID = 16, UID = cast ('00000000-0000-0000-1000-000000000000' as uniqueidentifier)
Union Select ID = 17, UID = cast ('00000000-0000-0001-0000-000000000000' as uniqueidentifier)
Union Select ID = 18, UID = cast ('00000000-0000-0010-0000-000000000000' as uniqueidentifier)
Union Select ID = 19, UID = cast ('00000000-0000-0100-0000-000000000000' as uniqueidentifier)
Union Select ID = 20, UID = cast ('00000000-0000-1000-0000-000000000000' as uniqueidentifier)
Union Select ID = 21, UID = cast ('00000000-0001-0000-0000-000000000000' as uniqueidentifier)
Union Select ID = 22, UID = cast ('00000000-0010-0000-0000-000000000000' as uniqueidentifier)
Union Select ID = 23, UID = cast ('00000000-0100-0000-0000-000000000000' as uniqueidentifier)
Union Select ID = 24, UID = cast ('00000000-1000-0000-0000-000000000000' as uniqueidentifier)
Union Select ID = 25, UID = cast ('00000001-0000-0000-0000-000000000000' as uniqueidentifier)
Union Select ID = 26, UID = cast ('00000010-0000-0000-0000-000000000000' as uniqueidentifier)
Union Select ID = 27, UID = cast ('00000100-0000-0000-0000-000000000000' as uniqueidentifier)
Union Select ID = 28, UID = cast ('00001000-0000-0000-0000-000000000000' as uniqueidentifier)
Union Select ID = 29, UID = cast ('00010000-0000-0000-0000-000000000000' as uniqueidentifier)
Union Select ID = 30, UID = cast ('00100000-0000-0000-0000-000000000000' as uniqueidentifier)
Union Select ID = 31, UID = cast ('01000000-0000-0000-0000-000000000000' as uniqueidentifier)
Union Select ID = 32, UID = cast ('10000000-0000-0000-0000-000000000000' as uniqueidentifier)
)
Select * From Test_UIDs Order By UID, ID
The algorithm is documented by the SQL Server team here: How are GUIDs compared in SQL Server 2005? I quote it here (since it's an old article that may be gone forever in a few years):
In general, equality comparisons make a lot of sense with
uniqueidentifier values. However, if you find yourself needing general
ordering, then you might be looking at the wrong data type and should
consider various integer types instead.
If, after careful thought, you decide to order on a uniqueidentifier
column, you might be surprised by what you get back.
Given these two uniqueidentifier values:
#g1= '55666BEE-B3A0-4BF5-81A7-86FF976E763F' #g2 =
'8DD5BCA5-6ABE-4F73-B4B7-393AE6BBB849'
Many people think that #g1 is less than #g2, since '55666BEE' is
certainly smaller than '8DD5BCA5'. However, this is not how SQL Server
2005 compares uniqueidentifier values.
The comparison is made by looking at byte "groups" right-to-left, and
left-to-right within a byte "group". A byte group is what is delimited
by the '-' character. More technically, we look at bytes {10 to 15}
first, then {8-9}, then {6-7}, then {4-5}, and lastly {0 to 3}.
In this specific example, we would start by comparing '86FF976E763F'
with '393AE6BBB849'. Immediately we see that #g2 is indeed greater
than #g1.
Note that in .NET languages, Guid values have a different default sort
order than in SQL Server. If you find the need to order an array or
list of Guid using SQL Server comparison semantics, you can use an
array or list of SqlGuid instead, which implements IComparable in a
way which is consistent with SQL Server semantics.
Plus, the sort follows the byte groups' endianness (see here: Globally unique identifier). Byte groups 10-15 and 8-9 are stored big-endian (corresponding to Data4 in the Wikipedia article), so they are compared as big-endian. The other groups are compared as little-endian.
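As an illustration, the same ordering can be reproduced by sorting on the stored bytes rearranged into that comparison order. This is only a sketch: it assumes CAST(uid AS binary(16)) exposes the internal byte layout in which uniqueidentifier values are persisted, and it reuses the Test_UIDs CTE from the code above (replace its final Select):
-- Sort key rebuilt in the {10-15}, {8-9}, {6-7}, {4-5}, {0-3} byte-group order
-- quoted above (SUBSTRING positions are 1-based); should match Order By UID.
Select UID From Test_UIDs
Order By SUBSTRING(CAST(UID AS binary(16)), 11, 6),
         SUBSTRING(CAST(UID AS binary(16)), 9, 2),
         SUBSTRING(CAST(UID AS binary(16)), 7, 2),
         SUBSTRING(CAST(UID AS binary(16)), 5, 2),
         SUBSTRING(CAST(UID AS binary(16)), 1, 4)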
A special service for those who find the accepted answer a bit vague. The code speaks for itself; the magical parts are:
System.Guid g
g.ToByteArray();
int[] m_byteOrder = new int[16] // 16 Bytes = 128 Bit
    {10, 11, 12, 13, 14, 15, 8, 9, 6, 7, 4, 5, 0, 1, 2, 3};

public int Compare(Guid x, Guid y)
{
    byte byte1, byte2;
    //Swap to the correct order to be compared
    for (int i = 0; i < NUM_BYTES_IN_GUID; i++)
    {
        byte1 = x.ToByteArray()[m_byteOrder[i]];
        byte2 = y.ToByteArray()[m_byteOrder[i]];
        if (byte1 != byte2)
            return (byte1 < byte2) ? (int)EComparison.LT : (int)EComparison.GT;
    } // Next i
    return (int)EComparison.EQ;
}
Full code:
namespace BlueMine.Data
{
    public class SqlGuid
        : System.IComparable
        , System.IComparable<SqlGuid>
        , System.Collections.Generic.IComparer<SqlGuid>
        , System.IEquatable<SqlGuid>
    {
        private const int NUM_BYTES_IN_GUID = 16;

        // Comparison orders.
        private static readonly int[] m_byteOrder = new int[16] // 16 Bytes = 128 Bit
            {10, 11, 12, 13, 14, 15, 8, 9, 6, 7, 4, 5, 0, 1, 2, 3};

        private byte[] m_bytes; // the SqlGuid is null if m_value is null

        public SqlGuid(byte[] guidBytes)
        {
            if (guidBytes == null || guidBytes.Length != NUM_BYTES_IN_GUID)
                throw new System.ArgumentException("Invalid array size");
            m_bytes = new byte[NUM_BYTES_IN_GUID];
            guidBytes.CopyTo(m_bytes, 0);
        }

        public SqlGuid(System.Guid g)
        {
            m_bytes = g.ToByteArray();
        }

        public byte[] ToByteArray()
        {
            byte[] ret = new byte[NUM_BYTES_IN_GUID];
            m_bytes.CopyTo(ret, 0);
            return ret;
        }

        int CompareTo(object obj)
        {
            if (obj == null)
                return 1; // https://msdn.microsoft.com/en-us/library/system.icomparable.compareto(v=vs.110).aspx
            System.Type t = obj.GetType();
            if (object.ReferenceEquals(t, typeof(System.DBNull)))
                return 1;
            if (object.ReferenceEquals(t, typeof(SqlGuid)))
            {
                SqlGuid ui = (SqlGuid)obj;
                return this.Compare(this, ui);
            } // End if (object.ReferenceEquals(t, typeof(UInt128)))
            return 1;
        } // End Function CompareTo(object obj)

        int System.IComparable.CompareTo(object obj)
        {
            return this.CompareTo(obj);
        }

        int CompareTo(SqlGuid other)
        {
            return this.Compare(this, other);
        }

        int System.IComparable<SqlGuid>.CompareTo(SqlGuid other)
        {
            return this.Compare(this, other);
        }

        enum EComparison : int
        {
            LT = -1, // itemA precedes itemB in the sort order.
            EQ = 0, // itemA occurs in the same position as itemB in the sort order.
            GT = 1 // itemA follows itemB in the sort order.
        }

        public int Compare(SqlGuid x, SqlGuid y)
        {
            byte byte1, byte2;
            //Swap to the correct order to be compared
            for (int i = 0; i < NUM_BYTES_IN_GUID; i++)
            {
                byte1 = x.m_bytes[m_byteOrder[i]];
                byte2 = y.m_bytes[m_byteOrder[i]];
                if (byte1 != byte2)
                    return (byte1 < byte2) ? (int)EComparison.LT : (int)EComparison.GT;
            } // Next i
            return (int)EComparison.EQ;
        }

        int System.Collections.Generic.IComparer<SqlGuid>.Compare(SqlGuid x, SqlGuid y)
        {
            return this.Compare(x, y);
        }

        public bool Equals(SqlGuid other)
        {
            return Compare(this, other) == 0;
        }

        bool System.IEquatable<SqlGuid>.Equals(SqlGuid other)
        {
            return this.Equals(other);
        }
    }
}
Here's a different approach. The GUID is simply shuffled around, ready for a normal string comparison as it occurs in SQL Server. This is JavaScript, but it is very easy to convert to any language.
function guidForComparison(guid) {
  /*
  character positions:
            11111111112222222222333333
  012345678901234567890123456789012345
  00000000-0000-0000-0000-000000000000
  byte positions:
                          111111111111
  00112233 4455 6677 8899 001122334455
  */
  return guid.substr(24, 12) +
         guid.substr(19, 4) +
         guid.substr(16, 2) +
         guid.substr(14, 2) +
         guid.substr(11, 2) +
         guid.substr(9, 2) +
         guid.substr(6, 2) +
         guid.substr(4, 2) +
         guid.substr(2, 2) +
         guid.substr(0, 2);
};
