What's the max length a cursorMark can have? - solr

As the title says I'd like to know the maximum length the cursorMark can have that I receive from Solr.
It would also be nice to get some info about chars that can be in it. But just the max length would already be nice. Does it even have one or can it theoretically grow without a limit?

Regarding the Set of Characters:
Looking at the Solr CursorMark source code, we can see that the representation of the cursor mark is a Base64 encoded String.
The specific implementation of Base64 used here is in Solr's Base64 utility class. Here we can see their character set is:
private static final char intToBase64[] = {
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
There may also be = symbols if strings are padded. But I don't recall seeing those.
Regarding the Length:
The size will vary depending on the specific data being encoded (sufficient to identify a sort spec/position).
So, based on that, I only have anecdotal observation, which is that the order of magnitude is bytes, not kilobytes.
Final note: This is all behind-the-scenes stuff - and, as such, may be subject to change without warning.

Related

libcaca - changing ascii glyphs to Katakana

I am creating a video effect that is supposed to look as in "Matrix" movie, but a bit different ("Matrix"-like video output will be mixed with an altered alpha channel with real video, so it will look half real, half with digits). I am simply using mplayer with the caca driver (mplayer -vo caca video.mp4) together with screen recording and then mixing videos in other software. For this I needed to change the "static uint32_t ascii_glyphs[]" array in file dither.c (from the code of the caca library as it is published here: https://github.com/cacalabs/libcaca/blob/master/caca/dither.c) from: ' ', '.', ':', ';', 't', '%', 'S', 'X', '#', '8', '?' to contain all Katakana symbols. But the problem is that it looks like they are not printable. So the terminal output of the video contains only shadow blocks. I should say that the bash code:
str123="ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶヷヸヹヺヽヾヿㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ㌀㌁㌂㌃㌄㌅㌆㌇㌈㌉㌊㌋㌌㌍㌎㌏㌐㌑㌒㌓㌔㌕㌖㌗㌘㌙㌚㌛㌜㌝㌞㌟㌠㌡㌢㌣㌤㌥㌦㌧㌨㌩㌪㌫㌭㌮㌯㌰㌱㌲㌳㌴㌵㌶㌷㌸㌹㌺㌻㌼㌽㌾㌿㍀㍁㍂㍃㍄㍅㍆㍇㍈㍉㍊㍋㍌㍍㍎㍏㍐㍑㍒㍓㍔㍕㍖㍗ヲァィゥェォャュョッアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン"
for i in $(seq 0 ${#str123}); do echo -n "'${str123:i:1}',"; done
works correctly in my terminal (checked with a couple of terminal programs, all printing correctly); the locales are also set:
$ locale
LANG=en_US.UTF-8
LANGUAGE=en_US
LC_CTYPE="en_US.UTF-8"
LC_NUMERIC=en_US.UTF-8
LC_TIME=en_US.UTF-8
LC_COLLATE="en_US.UTF-8"
LC_MONETARY=en_US.UTF-8
LC_MESSAGES="en_US.UTF-8"
LC_PAPER=en_US.UTF-8
LC_NAME=en_US.UTF-8
LC_ADDRESS=en_US.UTF-8
LC_TELEPHONE=en_US.UTF-8
LC_MEASUREMENT=en_US.UTF-8
LC_IDENTIFICATION=en_US.UTF-8
LC_ALL=
And the result for the new array:
/* List of glyphs */
static uint32_t ascii_glyphs[] =
{
/*
' ', '.', ':', ';', 't', '%', 'S', 'X', '#', '8', '?'
*/
/*
' ', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',',
'-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
':', ';', '<', '=', '>', '?', '#', 'A', 'B', 'C', 'D', 'E', 'F',
'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`',
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
'{', '|', '}', '~'
*/
' ', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',',
'-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
':', ';', '<', '=', '>', '?','#',
'ァ','ア','ィ','イ','ゥ','ウ','ェ','エ','ォ','オ','カ','ガ','キ','ギ',
'ク','グ','ケ','ゲ','コ','ゴ','サ','ザ','シ','ジ','ス','ズ','セ','ゼ',
'ソ','ゾ','タ','ダ','チ','ヂ','ッ','ツ','ヅ','テ','デ','ト','ド','ナ',
'ニ','ヌ','ネ','ノ','ハ','バ','パ','ヒ','ビ','ピ','フ','ブ','プ','ヘ',
'ベ','ペ','ホ','ボ','ポ','マ','ミ','ム','メ','モ','ャ','ヤ','ュ','ユ',
'ョ','ヨ','ラ','リ','ル','レ','ロ','ヮ','ワ','ヰ','ヱ','ヲ','ン','ヴ',
'ヵ','ヶ','ヷ','ヸ','ヹ','ヺ','ヽ','ヾ','ヿ','ㇰ','ㇱ','ㇲ','ㇳ','ㇴ',
'ㇵ','ㇶ','ㇷ','ㇸ','ㇹ','ㇺ','ㇻ','ㇼ','ㇽ','ㇾ','ㇿ','㌀','㌁','㌂',
'㌃','㌄','㌅','㌆','㌇','㌈','㌉','㌊','㌋','㌌','㌍','㌎','㌏','㌐',
'㌑','㌒','㌓','㌔','㌕','㌖','㌗','㌘','㌙','㌚','㌛','㌜','㌝','㌞',
'㌟','㌠','㌡','㌢','㌣','㌤','㌥','㌦','㌧','㌨','㌩','㌪','㌫','㌭',
'㌮','㌯','㌰','㌱','㌲','㌳','㌴','㌵','㌶','㌷','㌸','㌹','㌺','㌻',
'㌼','㌽','㌾','㌿','㍀','㍁','㍂','㍃','㍄','㍅','㍆','㍇','㍈','㍉',
'㍊','㍋','㍌','㍍','㍎','㍏','㍐','㍑','㍒','㍓','㍔','㍕','㍖','㍗',
'[', '\\', ']', '^', '_', '`',
'ヲ','ァ','ィ','ゥ','ェ','ォ','ャ','ュ','ョ','ッ','ア','イ','ウ','エ','オ','カ','キ','ク',
'ケ','コ','サ','シ','ス','セ','ソ','タ','チ','ツ','テ','ト','ナ','ニ','ヌ','ネ','ノ','ハ',
'ヒ','フ','ヘ','ホ','マ','ミ','ム','メ','モ','ヤ','ユ','ヨ','ラ','リ','ル','レ','ロ','ワ',
'ン',
'{', '|', '}', '~'
};
is this:
For example, if I change this "static uint32_t ascii_glyphs[]" array to contain full ascii set, then the result is:
Update: I tried to change "static uint32_t ascii_glyphs[]" array to contain Katakana glyphs in Hexadecimal representation, still no result, but (!) if I add these multibyte characters to set:
static uint32_t ascii_glyphs[] =
{
/* CP437 and box drawing */
0x2591, 0x2592, 0x2593, 0x2588, 0x2584, 0x2580, /* ░ ▒ ▓ █ ▄ ▀ */
0x2500, 0x2501, 0x2503, 0x2503, 0x253c, 0x254b, /* ─ ━ │ ┃ ┼ ╋ */
0x252c, 0x2534, 0x2533, 0x253b, 0x2566, 0x2569, /* ┬ ┴ ┳ ┻ ╦ ╩ */
0x2550, 0x2551, 0x256c, /* ═ ║ ╬ */
0x2575, 0x2577, 0x2579, 0x257b
};
so those characters are printed correctly. Result:
But if I add Katakana in Hexadecimal:
static uint32_t ascii_glyphs[] =
{
/* CP437 and box drawing */
0x2591, 0x2592, 0x2593, 0x2588, 0x2584, 0x2580, /* ░ ▒ ▓ █ ▄ ▀ */
0x2500, 0x2501, 0x2503, 0x2503, 0x253c, 0x254b, /* ─ ━ │ ┃ ┼ ╋ */
0x252c, 0x2534, 0x2533, 0x253b, 0x2566, 0x2569, /* ┬ ┴ ┳ ┻ ╦ ╩ */
0x2550, 0x2551, 0x256c, /* ═ ║ ╬ */
0x2575, 0x2577, 0x2579, 0x257b,
/* Katakana (part) */
0x30a1,0x30a2,0x30a3,0x30a4,0x30a5,0x30a6,0x30a7,0x30a8,0x30a9,0x30aa,
0x30ab,0x30ac,0x30ad,0x30ae,0x30af,0x30b0,0x30b1,0x30b2,0x30b3,0x30b4
};
so many blanks (just background and shades chars, without glyphs) are added:
So why is this still not working? It looks like something along the way (the terminal? gcc?) just does not like Katakana symbols :)
Thank you for your guidance!
The problem is that hiragana and katakana are fullwidth characters. When Caca tries to write a character to the screen using caca_put_char(), it checks if there is already a fullwidth character on the screen, and if so, it will replace part of it with a space. Since all possible character positions on the screen are written to, it ends up overwriting any fullwidth character with a space, and thus in the end no katakana will be visible.
I think you would have to modify Caca to handle fullwidth characters in the dither character set. If all characters are fullwidth, it should just write only to even columns on the screen. If you have a mix, it will be more complex, but you could for example make it so that if there is already a fullwidth character on a given position, it will just not try to overwrite it.

Concatenate rows of two dimensional list elements in a list

I want to reorganize two-dimensional list elements in a list (here two elements):
[[['A','B','C'],
['G','H','I']],
[['D','E','F'],
['J','K','L']]]
to become:
[['A','B','C','D','E','F'],
['G','H','I','J','K','L']]
Is there a better way to write this, than the one expressed by the following function?
def joinTableColumns(tableColumns):
    """
    Concatenate the rows of several table columns side by side.

    fun([[['A','B','C'],
          ['G','H','I'] ],
         [['D','E','F'],
          ['J', 'K', 'L']]]) --> [['A', 'B', 'C', 'D', 'E', 'F'],
                                  ['G', 'H', 'I', 'J', 'K', 'L']]

    tableColumns is a list of two-dimensional lists that are expected to
    have the same number of rows. The result is a new list of new row
    lists; the input is left unmodified.
    """
    tableData = []
    for i, tcol in enumerate(tableColumns):
        for j, line in enumerate(tcol):
            if i == 0:
                # Copy the row: appending the caller's own list and then
                # extending it in place would silently mutate the input.
                tableData.append(list(line))
            else:
                tableData[j].extend(line)
    return tableData
Considering, that the number of rows to join is equal:
tdim_test = [(len(x), [len(y) for y in x][0] )for x in tableData]
len(list(set([x[0] for x in tdim_test])))==1
How can I increase robustness of that function? Or, is there something from a standard library that I should use instead?
Yes, you can use zip() function and itertools.chain() within a list comprehension:
In [17]: lst = [[['A','B','C'],
['G','H','I']],
[['D','E','F'],
['J','K','L']]]
In [18]: from itertools import chain
In [19]: [list(chain.from_iterable(i)) for i in zip(*lst)]
Out[19]: [['A', 'B', 'C', 'D', 'E', 'F'], ['G', 'H', 'I', 'J', 'K', 'L']]
Or as a pure functional approach you can use itertools.starmap() and operator.add():
In [22]: from itertools import starmap
In [23]: from operator import add
In [24]: list(starmap(add, zip(*lst)))
Out[24]: [['A', 'B', 'C', 'D', 'E', 'F'], ['G', 'H', 'I', 'J', 'K', 'L']]
import functools
[ functools.reduce(lambda x,y: x + y, i,[]) for i in zip(*matrix)]
This will give you what you want
You could just use the zip function, unpacking the table inside it and add the pairs:
table = [[['A','B','C'], ['G','H','I']],
[['D','E','F'], ['J','K','L']]]
res = [t1 + t2 for t1, t2 in zip(*table)]
which yields your wanted result:
[['A', 'B', 'C', 'D', 'E', 'F'], ['G', 'H', 'I', 'J', 'K', 'L']]

What's an efficient way to transform a two dimensional array of labels to a map from label to coordinates?

The following code works but does multiple passes over the entire array, which I would like to avoid. Another alternative would have been to sort the named_coords array by name and then gather the pieces while iterating through the sorted array, but I didn't find a clean way to make that work. Ideally the answer would use standard adapters and such to transform the collection as a whole.
use std::collections::HashMap;
fn main() {
    let p = [
        ['I', 'P', 'P', 'Y', 'Y', 'Y', 'Y', 'V', 'V', 'V'],
        ['I', 'P', 'P', 'X', 'Y', 'L', 'L', 'L', 'L', 'V'],
        ['I', 'P', 'X', 'X', 'X', 'F', 'Z', 'Z', 'L', 'V'],
        ['I', 'T', 'W', 'X', 'F', 'F', 'F', 'Z', 'U', 'U'],
        ['I', 'T', 'W', 'W', 'N', 'N', 'F', 'Z', 'Z', 'U'],
        ['T', 'T', 'T', 'W', 'W', 'N', 'N', 'N', 'U', 'U'],
    ];

    // Flatten the grid, row by row, into ((row, column), label) pairs.
    let named_coords: Vec<((usize, usize), char)> = p
        .iter()
        .enumerate()
        .flat_map(|(row, labels)| {
            labels
                .iter()
                .enumerate()
                .map(move |(col, &label)| ((row, col), label))
        })
        .collect();

    // Reduce the labels to a sorted list of distinct names.
    let mut names: Vec<char> = named_coords.iter().map(|entry| entry.1).collect();
    names.sort();
    names.dedup();

    // For every name, scan all named coordinates and keep the matching ones.
    // Inefficient - walks the whole named_coords vector once per name.
    let mut pieces = HashMap::new();
    for name in names {
        let coords: Vec<(usize, usize)> = named_coords
            .iter()
            .filter(|entry| entry.1 == name)
            .map(|entry| entry.0)
            .collect();
        pieces.insert(name, coords);
    }

    // Print one "label row column" line per coordinate.
    for (name, coords) in &pieces {
        for (row, col) in coords {
            println!("{} {} {}", name, row, col);
        }
    }
}
Use the entry API:
use std::collections::HashMap;
fn main() {
    let p = [
        ['I', 'P', 'P', 'Y', 'Y', 'Y', 'Y', 'V', 'V', 'V'],
        ['I', 'P', 'P', 'X', 'Y', 'L', 'L', 'L', 'L', 'V'],
        ['I', 'P', 'X', 'X', 'X', 'F', 'Z', 'Z', 'L', 'V'],
        ['I', 'T', 'W', 'X', 'F', 'F', 'F', 'Z', 'U', 'U'],
        ['I', 'T', 'W', 'W', 'N', 'N', 'F', 'Z', 'Z', 'U'],
        ['T', 'T', 'T', 'W', 'W', 'N', 'N', 'N', 'U', 'U'],
    ];

    // Map each label to the (row, column) positions where it occurs,
    // filled in a single pass over the grid via the entry API.
    let mut pieces: HashMap<&char, Vec<(usize, usize)>> = HashMap::new();
    for (row, labels) in p.iter().enumerate() {
        for (col, label) in labels.iter().enumerate() {
            // The first sighting of a label creates its empty Vec.
            pieces.entry(label).or_default().push((row, col));
        }
    }

    println!("{:?}", pieces);
}
Efficient: A single pass through the data and a single hash lookup per item.
Simple: beauty is in the eye of the beholder.

zipped array returns as <zip object at 0x02B6F198>

I have two arrays: X = [1,2,3,4,5,3,8] and Y = ['S', 'S', 'S', 'S', 'S', 'C', 'C']. When I print the result of zipping them, I get <zip object at 0x02B6F198>. The reason these two arrays are zipped is so that I can sort Y according to sorted(X) in the line
sortedY = [y for x,y in sorted(zip(X,Y))]
This line of code doesn't sort Y how I would want (sortedY = ['S','S','C','S','S','S','C']) but SortedX stays in the same arrangement as X.
I have a second program in which I use this code and it works fine but this program is significantly smaller in size than the original program.
If you are trying to print the zipped lists directly then that won't work. zip returns an iterator object, so when you try to print it you just get the object's default representation. If you want to see its contents as a list, apply an operation that returns a list.
X = [1,2,3,4,5,3,8]
Y = ['S', 'S', 'S', 'S', 'S', 'C', 'C']
# Some Simple Methods Include
print(list(zip(X, Y)))
print([i for i in zip(X, Y)])
# Output
[(1, 'S'), (2, 'S'), (3, 'S'), (4, 'S'), (5, 'S'), (3, 'C'), (8, 'C')]
Now I'm not sure what the issue was though, as what you provided should be working
sortedY = [y for x,y in sorted(zip(X,Y))]
print(sortedY)
# Output
['S', 'S', 'C', 'S', 'S', 'S', 'C']
As you can see it sorts Y corresponding to sorted X
print(sorted(zip(X,Y)))
#Output (X, Y)
[(1, 'S'), (2, 'S'), (3, 'C'), (3, 'S'), (4, 'S'), (5, 'S'), (8, 'C')]

How to get the sorted list from aws s3 bucket in an array

I’m trying to get a list of objects from an AWS S3 bucket using boto. This list is made up of the elements common to two different lists. I want this list to be sorted by the objects' "last_modified" date in ascending order, meaning that I want the oldest object (based on the date) to be first in my list. So, I am trying to prepare a list of 5 elements like this. I want to take this list, process only the files that belong to it, eventually delete those files, and then pick up the next list of 5 elements the same way.
Here is the bucket hierarchy:-
//ship-my-data/outputs/444556677788.tar.gz
//ship-my-data/outputs/444556677788.tar.gz
//ship-my-data/outputs/345345345353.tar.gz
//ship-my-data/outputs1/ctrlFiles/ 444556677788.ctrl.tar.gz
//ship-my-data/outputs1/ctrlFiles/ 123222333444.ctrl.tar.gz
//ship-my-data/outputs1/ctrlFiles/ 769797977979.ctrl.tar.gz
I want to make a list of common elements from both the folder above i.e. from outputs1 & ctrlFiles folder.
Here is my code:
bucket = LogShip._aws_connection.get_bucket(aws_bucket_to_download) #Connecting to AWS s3 bucket
bucket_list_ctrl = bucket.list(prefix='outputs/ctrlFiles/', delimiter='/') #get the bucket list for control files.
ctrl_list = sorted(bucket_list_ctrl, key=lambda item1: item1.last_modified) # sort the list by last_modified date.
bucket_list_tar = bucket.list(prefix='outputs/', delimiter='/') #get the list for tar files.
tar_list = sorted(bucket_list_tar, key=lambda item2: item2.last_modified) #suppose to get the bucket list, but throwing an error #AttributeError: 'Prefix' object has no attribute 'last_modified'""
for item_c in ctrl_list:
ctrlName = str(item_c.name).split("/")[2].replace(".ctrl.tar.gz","") # cotrol file name: 1444447203130120001
for item_t in bucket_list_tar:
tarName = str(item_t.name).split("/")[1].replace(".tar.gz","") #tar file name: 1444447203130120001
#now from above two lists I want to prepare a master list of an common elements which is pick up only 5 elements to proceed further.
j = 5
while j <= 5:
for elem in ctrlName:
for elem in tarName:
master_list.append(elem)
j=j+1
print master_list
Output:
['c', 't', 'r', 'l', 'F', 'i', 'l', 'e', 's', 'c', 't', 'r', 'l', 'F', 'i', 'l', 'e', 's', 'c', 't', 'r', 'l', 'F', 'i', 'l', 'e', 's', 'c', 't', 'r', 'l', 'F', 'i', 'l', 'e', 's', 'c', 't', 'r', 'l', 'F', 'i', 'l', 'e', 's', 'c', 't', 'r', 'l', 'F']
Expected output:
[444556677788, 123222333444]
Can anyone please help me understand where I'm making a mistake?
I'm not sure why you want to do things in groups of five, so this code matches all files at once:
import boto
import re
# Connect and open the bucket ('REGION' and 'BUCKETNAME' are placeholders).
conn = boto.connect_s3('REGION')
bucket = conn.get_bucket('BUCKETNAME')
# Get two lists of files
bucket_list_ctrl = bucket.list(prefix='outputs/ctrlFiles/', delimiter='/')
bucket_list_tar = bucket.list(prefix='outputs/', delimiter='/')
# Extract filenames and modified date
# Raw string so that \d is a regex escape, not a (deprecated) string escape.
pattern = re.compile(r'.*?(\d+).*?')
# With a delimiter the listing can also yield Prefix entries, which have no
# last_modified attribute, so only names ending in 'gz' are considered in
# both listings; names without any digits are skipped instead of crashing.
ctrl_files = []
for obj in bucket_list_ctrl:
    found = pattern.match(obj.name)
    if found and obj.name.endswith('gz'):
        ctrl_files.append((found.group(1), obj.last_modified))
# A set gives constant-time membership tests in the matching step below.
list_files = set()
for obj in bucket_list_tar:
    found = pattern.match(obj.name)
    if found and obj.name.endswith('gz'):
        list_files.add(found.group(1))
# Find filenames that match both
both = [obj for obj in ctrl_files if obj[0] in list_files]
# Give sorted result (oldest first, by last_modified)
result = [f[0] for f in sorted(both, key=lambda obj: obj[1])]

Resources