How to fix the 'The columns in the computed data do not match the columns in the provided metadata' error? - pivot-table

I have a table of price updates in the format (timestamp, price, amount).
The timestamp is a datetime, price categorical and amount float64. The timestamp column is set as an index.
My goal is to get the amount available at each price level at each point in time.
First, I use the pivot to spread the prices into columns, and then forward fill.
pivot = price_table.pivot_table(index = 'timestamp',
columns = 'price', values = 'amount')
pivot_ffill = pivot.fillna(method = 'ffill')
I can compute or apply head to pivot_ffill and it works fine.
Clearly, there are still NAs at the beginning of the table where there have been no updates yet.
When I apply
pivot_nullfill = pivot_ffill.fillna(0)
pivot_nullfill.head()
I do get an error
The columns in the computed data do not match the columns in the provided metadata. I tried replacing the zero with 0.0 or float(0), but to no avail. As the previous steps work, I strongly suspect it has something to do with the fillna, but due to the delayed calculations that does not have to be true.
Does someone know what causes this? Thank you!
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-180-f8ab344c7939> in <module>
----> 1 pivot_ffill.fillna(0).head()
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\dataframe\core.py in head(self, n, npartitions, compute)
896
897 if compute:
--> 898 result = result.compute()
899 return result
900
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\base.py in compute(*args, **kwargs)
396 keys = [x.__dask_keys__() for x in collections]
397 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 398 results = schedule(dsk, keys, **kwargs)
399 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
400
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\threaded.py in get(dsk, result, cache, num_workers, pool, **kwargs)
74 results = get_async(pool.apply_async, len(pool._pool), dsk, result,
75 cache=cache, get_id=_thread_get_id,
---> 76 pack_exception=pack_exception, **kwargs)
77
78 # Cleanup pools associated to dead threads
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
460 _execute_task(task, data) # Re-execute locally
461 else:
--> 462 raise_exception(exc, tb)
463 res, worker_id = loads(res_info)
464 state['cache'][key] = res
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\compatibility.py in reraise(exc, tb)
110 if exc.__traceback__ is not tb:
111 raise exc.with_traceback(tb)
--> 112 raise exc
113
114 import pickle as cPickle
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
228 try:
229 task, data = loads(task_info)
--> 230 result = _execute_task(task, data)
231 id = get_id()
232 result = dumps((result, id))
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\core.py in _execute_task(arg, cache, dsk)
116 elif istask(arg):
117 func, args = arg[0], arg[1:]
--> 118 args2 = [_execute_task(a, cache) for a in args]
119 return func(*args2)
120 elif not ishashable(arg):
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\core.py in <listcomp>(.0)
116 elif istask(arg):
117 func, args = arg[0], arg[1:]
--> 118 args2 = [_execute_task(a, cache) for a in args]
119 return func(*args2)
120 elif not ishashable(arg):
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\core.py in _execute_task(arg, cache, dsk)
117 func, args = arg[0], arg[1:]
118 args2 = [_execute_task(a, cache) for a in args]
--> 119 return func(*args2)
120 elif not ishashable(arg):
121 return arg
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\optimization.py in __call__(self, *args)
940 % (len(self.inkeys), len(args)))
941 return core.get(self.dsk, self.outkey,
--> 942 dict(zip(self.inkeys, args)))
943
944 def __reduce__(self):
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\core.py in get(dsk, out, cache)
147 for key in toposort(dsk):
148 task = dsk[key]
--> 149 result = _execute_task(task, cache)
150 cache[key] = result
151 result = _execute_task(out, cache)
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\core.py in _execute_task(arg, cache, dsk)
117 func, args = arg[0], arg[1:]
118 args2 = [_execute_task(a, cache) for a in args]
--> 119 return func(*args2)
120 elif not ishashable(arg):
121 return arg
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\compatibility.py in apply(func, args, kwargs)
91 def apply(func, args, kwargs=None):
92 if kwargs:
---> 93 return func(*args, **kwargs)
94 else:
95 return func(*args)
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\dataframe\core.py in apply_and_enforce(*args, **kwargs)
3800 if not np.array_equal(np.nan_to_num(meta.columns),
3801 np.nan_to_num(df.columns)):
-> 3802 raise ValueError("The columns in the computed data do not match"
3803 " the columns in the provided metadata")
3804 else:
ValueError: The columns in the computed data do not match the columns in the provided metadata

The error message should have given you a suggestion of how to fix the situation. We assume you are loading from CSV (the question doesn't say), so you would probably end up with a line like
df = dd.read_csv(..., dtype={...})
which instructs the pandas reader on the dtypes you want to enforce, since you know more information than pandas does. That ensures that all partitions have the same types for all columns - see the notes part of the docs.

Related

Python Impute using BayesianRidge() sklearn impute.IterativeImputer regression impute analysis value error

PROBLEM
Use IterativeImputer from sklearn.impute to fit a BayesianRidge() regression model for imputing missing data in the variable 'Frontage'.
The call interative_imputer_fit = interative_imputer.fit(data) runs, but when the function imputer_bay_ridge(data) is invoked, the imputer's transform() call, i.e. interative_imputer_fit.transform(X), fails with a ValueError. Two variables, Frontage and Area, were passed in, but only Frontage was inside the numpy array.
Python CODE using sklearn
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge
def imputer_bay_ridge(data):
data_array = data.to_numpy()
data_array.reshape(1, -1)
interative_imputer = IterativeImputer(BayesianRidge())
interative_imputer_fit = interative_imputer.fit(data_array)
X = data['LotFrontage']
data_imputed = interative_imputer_fit.transform(X)
train_data[['Frontage', 'Area']]
INVOKE FUNCTION
fit_tranformed_imputed = imputer_bay_ridge(train_data[['Frontage', 'Area']])
DATA EXAMPLE
train_data[['Frontage', 'Area']]
Frontage Area
0 65.0 8450
1 80.0 9600
2 68.0 11250
3 60.0 9550
4 84.0 14260
... ... ...
1455 62.0 7917
1456 85.0 13175
1457 66.0 9042
1458 68.0 9717
1459 75.0 9937
1460 rows × 2 columns
ERROR
ValueError Traceback (most recent call last)
Cell In[243], line 1
----> 1 fit_tranformed_imputed = imputer_bay_ridge(train_data[['LotFrontage', 'LotArea']])
Cell In[242], line 12, in imputer_bay_ridge(data)
10 interative_imputer_fit = interative_imputer.fit(data_array)
11 X = data['LotFrontage']
---> 12 data_imputed = interative_imputer_fit.transform(X)
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/impute/_iterative.py:724, in IterativeImputer.transform(self, X)
707 """Impute all missing values in `X`.
708
709 Note that this is stochastic, and that if `random_state` is not fixed,
(...)
720 The imputed input data.
721 """
722 check_is_fitted(self)
--> 724 X, Xt, mask_missing_values, complete_mask = self._initial_imputation(X)
726 X_indicator = super()._transform_indicator(complete_mask)
728 if self.n_iter_ == 0 or np.all(mask_missing_values):
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/impute/_iterative.py:514, in IterativeImputer._initial_imputation(self, X, in_fit)
511 else:
512 force_all_finite = True
--> 514 X = self._validate_data(
515 X,
516 dtype=FLOAT_DTYPES,
517 order="F",
518 reset=in_fit,
519 force_all_finite=force_all_finite,
520 )
521 _check_inputs_dtype(X, self.missing_values)
523 X_missing_mask = _get_mask(X, self.missing_values)
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py:566, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
564 raise ValueError("Validation should be done on X, y or both.")
565 elif not no_val_X and no_val_y:
--> 566 X = check_array(X, **check_params)
567 out = X
568 elif no_val_X and not no_val_y:
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py:769, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
767 # If input is 1D raise error
768 if array.ndim == 1:
--> 769 raise ValueError(
770 "Expected 2D array, got 1D array instead:\narray={}.\n"
771 "Reshape your data either using array.reshape(-1, 1) if "
772 "your data has a single feature or array.reshape(1, -1) "
773 "if it contains a single sample.".format(array)
774 )
776 # make sure we actually converted to numeric:
777 if dtype_numeric and array.dtype.kind in "OUSV":
ValueError: Expected 2D array, got 1D array instead:
array=[65. 80. 68. ... 66. 68. 75.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

Cython: ctypedef char array throws obscure error

Why does this work:
In [17]: %%cython -f
...: from libc.string cimport memcpy
...:
...: DEF KLEN = 5
...: DEF TRP_KLEN = KLEN * 3
...:
...: cdef:
...: unsigned char k[KLEN]
...: unsigned char kk[TRP_KLEN]
...: kk = bytearray(b'12345abcde!##$%')
...: memcpy(&k, &kk[5], KLEN)
...: print(k)
b'abcde'
While this:
In [16]: %%cython -f
...: from libc.string cimport memcpy
...:
...: DEF KLEN = 5
...: DEF TRP_KLEN = KLEN * 3
...: ctypedef unsigned char SingleKey[KLEN]
...: ctypedef unsigned char TripleKey[TRP_KLEN]
...:
...: cdef:
...: SingleKey k
...: TripleKey kk
...: kk = bytearray(b'12345abcde!##$%')
...: memcpy(&k, &kk[5], KLEN)
...: print(k)
throws an obscure error, which doesn't directly mention my code:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-16-a4bb608248b0> in <module>()
----> 1 get_ipython().run_cell_magic('cython', '-f', "from libc.string cimport memcpy\n\nDEF KLEN = 5\nDEF TRP_KLEN = KLEN * 3\nctypedef unsigned char SingleKey[KLEN]\nctypedef unsigned char DoubleKey[TRP_KLEN]\n\ncdef:\n SingleKey k[KLEN]\n DoubleKey kk[TRP_KLEN]\nkk = bytearray(b'12345abcde!##$%')\nmemcpy(&k, &kk[5], KLEN)\nprint(k)")
/usr/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2165 magic_arg_s = self.var_expand(line, stack_depth)
2166 with self.builtin_trap:
-> 2167 result = fn(magic_arg_s, cell)
2168 return result
2169
<decorator-gen-118> in cython(self, line, cell)
/usr/lib/python3.6/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
185 # but it's overkill for just that one bit of state.
186 def magic_deco(arg):
--> 187 call = lambda f, *a, **k: f(*a, **k)
188
189 if callable(arg):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Build/IpythonMagic.py in cython(self, line, cell)
318 extension = None
319 if need_cythonize:
--> 320 extensions = self._cythonize(module_name, code, lib_dir, args, quiet=args.quiet)
321 assert len(extensions) == 1
322 extension = extensions[0]
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Build/IpythonMagic.py in _cythonize(self, module_name, code, lib_dir, args, quiet)
426 elif sys.version_info[0] >= 3:
427 opts['language_level'] = 3
--> 428 return cythonize([extension], **opts)
429 except CompileError:
430 return None
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Build/Dependencies.py in cythonize(module_list, exclude, nthreads, aliases, quiet, force, language, exclude_failures, **options)
1024 if not nthreads:
1025 for args in to_compile:
-> 1026 cythonize_one(*args)
1027
1028 if exclude_failures:
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Build/Dependencies.py in cythonize_one(pyx_file, c_file, fingerprint, quiet, options, raise_on_failure, embedded_metadata, full_module_name, progress)
1127 any_failures = 0
1128 try:
-> 1129 result = compile_single(pyx_file, options, full_module_name=full_module_name)
1130 if result.num_errors > 0:
1131 any_failures = 1
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Main.py in compile_single(source, options, full_module_name)
647 recursion.
648 """
--> 649 return run_pipeline(source, options, full_module_name)
650
651
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Main.py in run_pipeline(source, options, full_module_name, context)
497
498 context.setup_errors(options, result)
--> 499 err, enddata = Pipeline.run_pipeline(pipeline, source)
500 context.teardown_errors(err, options, result)
501 return result
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Pipeline.py in run_pipeline(pipeline, source, printtree)
352 exec("def %s(phase, data): return phase(data)" % phase_name, exec_ns)
353 run = _pipeline_entry_points[phase_name] = exec_ns[phase_name]
--> 354 data = run(phase, data)
355 if DebugFlags.debug_verbose_pipeline:
356 print(" %.3f seconds" % (time() - t))
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Pipeline.py in run(phase, data)
332
333 def run(phase, data):
--> 334 return phase(data)
335
336 error = None
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Pipeline.py in generate_pyx_code_stage(module_node)
50 def generate_pyx_code_stage_factory(options, result):
51 def generate_pyx_code_stage(module_node):
---> 52 module_node.process_implementation(options, result)
53 result.compilation_source = module_node.compilation_source
54 return result
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/ModuleNode.py in process_implementation(self, options, result)
140 self.find_referenced_modules(env, self.referenced_modules, {})
141 self.sort_cdef_classes(env)
--> 142 self.generate_c_code(env, options, result)
143 self.generate_h_code(env, options, result)
144 self.generate_api_code(env, options, result)
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/ModuleNode.py in generate_c_code(self, env, options, result)
376 # generate normal variable and function definitions
377 self.generate_variable_definitions(env, code)
--> 378 self.body.generate_function_definitions(env, code)
379 code.mark_pos(None)
380 self.generate_typeobj_definitions(env, code)
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_function_definitions(self, env, code)
436 #print "StatListNode.generate_function_definitions" ###
437 for stat in self.stats:
--> 438 stat.generate_function_definitions(env, code)
439
440 def generate_execution_code(self, code):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_function_definitions(self, env, code)
9242 entry.cname = cname
9243
-> 9244 self.node.generate_function_definitions(env, code)
9245
9246 def generate_execution_code(self, code):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_function_definitions(self, env, code)
1974 # ----- Function body -----
1975 # -------------------------
-> 1976 self.generate_function_body(env, code)
1977
1978 code.mark_pos(self.pos, trace=False)
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_function_body(self, env, code)
1736
1737 def generate_function_body(self, env, code):
-> 1738 self.body.generate_execution_code(code)
1739
1740 def generate_function_definitions(self, env, code):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_execution_code(self, code)
442 for stat in self.stats:
443 code.mark_pos(stat.pos)
--> 444 stat.generate_execution_code(code)
445
446 def annotate(self, code):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_execution_code(self, code)
6185 for i, if_clause in enumerate(self.if_clauses):
6186 self._set_branch_hint(if_clause, if_clause.body)
-> 6187 if_clause.generate_execution_code(code, end_label, is_last=i == last)
6188 if self.else_clause:
6189 code.mark_pos(self.else_clause.pos)
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_execution_code(self, code, end_label, is_last)
6247 self.condition.generate_disposal_code(code)
6248 self.condition.free_temps(code)
-> 6249 self.body.generate_execution_code(code)
6250 code.mark_pos(self.pos, trace=False)
6251 if not (is_last or self.body.is_terminator):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_execution_code(self, code)
442 for stat in self.stats:
443 code.mark_pos(stat.pos)
--> 444 stat.generate_execution_code(code)
445
446 def annotate(self, code):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/UtilNodes.py in generate_execution_code(self, code)
324 def generate_execution_code(self, code):
325 self.setup_temp_expr(code)
--> 326 self.body.generate_execution_code(code)
327 self.teardown_temp_expr(code)
328
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_execution_code(self, code)
6605 self.item.generate_evaluation_code(code)
6606 self.target.generate_assignment_code(self.item, code)
-> 6607 self.body.generate_execution_code(code)
6608 code.mark_pos(self.pos)
6609 code.put_label(code.continue_label)
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_execution_code(self, code)
442 for stat in self.stats:
443 code.mark_pos(stat.pos)
--> 444 stat.generate_execution_code(code)
445
446 def annotate(self, code):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_execution_code(self, code)
5100 def generate_execution_code(self, code):
5101 code.mark_pos(self.pos)
-> 5102 self.generate_rhs_evaluation_code(code)
5103 self.generate_assignment_code(code)
5104
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_rhs_evaluation_code(self, code)
5387
5388 def generate_rhs_evaluation_code(self, code):
-> 5389 self.rhs.generate_evaluation_code(code)
5390
5391 def generate_assignment_code(self, code, overloaded_assignment=False):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/ExprNodes.py in generate_evaluation_code(self, code)
718 self.allocate_temp_result(code)
719
--> 720 self.generate_result_code(code)
721 if self.is_temp and not (self.type.is_string or self.type.is_pyunicode_ptr):
722 # If we are temp we do not need to wait until this node is disposed
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/ExprNodes.py in generate_result_code(self, code)
13135
13136 code.putln(self.type.from_py_call_code(
> 13137 self.arg.py_result(), self.result(), self.pos, code, from_py_function=from_py_function))
13138 if self.type.is_pyobject:
13139 code.put_gotref(self.py_result())
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/PyrexTypes.py in from_py_call_code(self, source_code, result_code, error_pos, code, from_py_function, error_condition)
512 source_code, result_code, error_pos, code,
513 from_py_function or self.from_py_function,
--> 514 error_condition or self.error_condition(result_code)
515 )
516
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/PyrexTypes.py in from_py_call_code(self, source_code, result_code, error_pos, code, from_py_function, error_condition)
2481 def from_py_call_code(self, source_code, result_code, error_pos, code,
2482 from_py_function=None, error_condition=None):
-> 2483 assert not error_condition, '%s: %s' % (error_pos, error_condition)
2484 call_code = "%s(%s, %s, %s)" % (
2485 from_py_function or self.from_py_function,
AssertionError: (<StringSourceDescriptor:carray.from_py>, 87, 19): (!__pyx_t_11) && PyErr_Occurred()
This is especially problematic in my 1000+-line program, where I had to find out what was causing the problem. Removing all ctypedefs for char arrays that were being assigned removed the error, but I'd like to know what is causing it.
Thanks.
UPDATE
Note: This is not a NUL-terminated string, but a byte sequence that may have NULs anywhere. That's why I am using memcpy rather than string functions or regular = assignments.

in R, selecting rows from mydataframe based upon array of values when array and mydataframe are unequal length

I have the following data called gg and yy.
> str(gg)
num [1:1992] 128 130 132 185 186 187 188 189 190 191 ...
> str(yy)
'data.frame': 2103 obs. of 2 variables:
$ grp : num 128 130 132 185 186 187 188 189 190 191 ...
$ predd: num -0.963 -1.518 1.712 -11.286 -8.195 ...
>
You'll notice that the first several values of gg match the first several from yy.
I would like to select rows from yy if the value yy$grp matches any value in gg. The issue is that gg and yy are of unequal length. Further, there are some values of gg that are not present in yy$grp and also some values of yy$grp not present in gg.
I can't seem to get this to work. It is basically an intersection of the two data sets based upon the index value I mentioned (gg, or yy$grp).
I've tried:
inters<-intersect(gg,yy$grp)
yyint<-yy[yy$grp==inters,]
but get the following
Warning message:
In yy$grp == inters :
longer object length is not a multiple of shorter object length
> str(yya)
'data.frame': 28 obs. of 2 variables:
$ grp : num 128 130 132 185 186 187 188 189 190 191 ...
$ predd: num -0.963 -1.518 1.712 -11.286 -8.195 ...
yya should be much longer, according to my plans at least.
Thanks.
As I mentioned, I think this is what you want:
yy[yy$grp %in% gg,]

CipherSaber bug

So I implemented ciphersaber-1. It almost works: I can decrypt cstest1.cs1, but I have trouble getting cstest2.cs1 to work.
The output is:
The Fourth Amendment to the Constitution of the Unite ▀Stat→s of America
"The right o☻ the people to be secure in their persons, houses, papers, and
effects, against unreasonab→e searches an╚A)┤Xx¹▼☻dcðþÈ_#­0Uc.?n~J¿|,lómsó£k░7╠▄
íuVRÊ ╣├xð"↕(Gû┤.>!{³♫╚Tƒ}Àõ+»~C;ÔÙ²÷g.qÏø←1ß█yÎßsÈ÷g┐ÅJÔÞ┘Îö║AÝf╔ìêâß╗È;okn│CÚê
õ&æÄ[5&Þ½╔s╦Nå1En♂☻♫ôzÓ9»Á╝ÐÅ├ðzÝÎòeØ%W¶]¤▲´Oá╗e_Ú)╣ó0↑ï^☻P>ù♂­¥¯▄‗♦£mUzMצվ~8å
ì½³░Ùã♠,H-tßJ!³*²RóÅ
So I must have a bug in initializing the state. The odd thing is that I can encrypt and decrypt long texts without problems, so the bug is symmetric.
I implemented the rc4 cipher as a reentrant single byte algorithm as you can see in rc4.c.
The state is stored in the rc4_state struct:
typedef unsigned char rc4_byte;
struct rc4_state_
{
rc4_byte i;
rc4_byte j;
rc4_byte state[256];
};
typedef struct rc4_state_ rc4_state;
The state is initialized with rc4_init:
void rc4_init(rc4_state* state, rc4_byte* key, size_t keylen)
{
rc4_byte i, j, n;
i = 0;
do
{
state->state[i] = i;
i++;
}
while (i != 255);
j = 0;
i = 0;
do
{
n = i % keylen;
j += state->state[i] + key[n];
swap(&state->state[i], &state->state[j]);
i++;
}
while (i != 255);
state->i = 0;
state->j = 0;
}
The actual encryption / decryption is done in rc4:
rc4_byte rc4(rc4_state* state, rc4_byte in)
{
rc4_byte n;
state->i++;
state->j += state->state[state->i];
swap(&state->state[state->i], &state->state[state->j]);
n = state->state[state->i] + state->state[state->j];
return in ^ state->state[n];
}
For completeness, swap:
void swap(rc4_byte* a, rc4_byte* b)
{
rc4_byte t = *a;
*a = *b;
*b = t;
}
I have been breaking my head on this for more than two days... The state, at least for the "asdfg" key is correct. Any help would be nice.
The whole thing can be found in my github repository: https://github.com/rioki/ciphersaber/
I stumbled across your question while searching online, but since you haven't updated your code at GitHub yet, I figured you might still like to know what the problem was.
It's in this bit of code:
i = 0;
do
{
state->state[i] = i;
i++;
}
while (i != 255);
After this loop has iterated 255 times, i will have a value of 255 and the loop will terminate. As a result, the last byte of your state buffer is being left uninitialised.
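This is easily fixed. Just change while (i != 255); to while (i);. Because i is an unsigned char, incrementing it past 255 wraps it back to 0, so the loop then runs for all 256 values before stopping. The key-mixing loop further down in rc4_init uses the same while (i != 255); condition and needs the same change.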
Sorry you haven't gotten feedback, I finally pulled this off in Python 3 today, but don't know enough about C to debug your code.
Some of the links on the main ciphersaber page are broken (pointing to ".com" instead of ".org"), so you might not have found the FAQ:
http://ciphersaber.gurus.org/faq.html
It includes the following debugging tips:
Make sure you are not reading or writing encrypted files as text files. You must use binary mode for file I/O.
If you are writing in the C language, be sure to store bytes as unsigned char.
Watch out for classic indexing problems. Do arrays in your chosen programming language start with 0 or 1?
Make sure you are writing out a random 10 byte IV when you encrypt and are reading the IV from the start of the file when you decrypt.
If your program still does not work, put in some statements to print out the S array after the key setup step. Then run your program to
decrypt the file cstest1.cs1 using asdfg as the key. Here is how the S
array should look:
file: cstest1.cs1
key: asdfg
176 32 49 160 15 112 58 8 186 19 50 161 60 17 82 153 37 141 131 127 59
2 165 103 98 53 9 57 41 150 174 64 36 62 191 154 44 136 149 158 226
113 230 227 247 155 221 34 125 20 163 95 128 219 1 181 201 146 88 204
213 80 143 164 145 234 134 248 100 77 188 235 76 217 194 35 75 99 126
92 243 177 52 180 83 140 198 42 151 18 91 33 16 192 101 48 97 220 114
110 124 72 139 218 142 118 81 84 31 29 195 68 209 172 200 214 93 240
61 22 206 123 152 7 203 10 119 171 79 250 109 137 199 167 11 104 211
129 208 216 178 207 242 162 30 120 65 115 87 170 47 69 244 212 45 85
73 222 225 185 63 0 179 210 108 245 202 46 96 148 51 173 24 182 89 116
3 67 205 94 231 23 21 13 169 215 190 241 228 132 252 4 233 56 105 26
12 135 223 166 238 229 246 138 239 54 5 130 159 236 66 175 189 147 193
237 43 40 117 157 86 249 74 27 156 14 133 251 196 187 197 102 106 39
232 255 121 122 253 111 90 38 55 70 184 78 224 25 6 107 168 254 144 28
183 71
I also found the "memorable test cases" helpful here:
http://www.cypherspace.org/adam/csvec/
Including:
key="Al"+ct="Al Dakota buys"(iv="Al Dakota "):
pt = "mead"
Even though the memorable test cases require cs2, upgrading to cs2 from cs1 is fairly trivial; you may be able to confidently convert your program to cs2 from cs1 even without fully debugging the rest of it.
Also note that the FAQ claims there used to be a file on the site that wouldn't decode, so make sure your target file doesn't begin with "0e e3 f9 b2 40 11 fc 3e ..."
(Though I think that was a smaller test file, not the certificate.)
Oh, and also know that the site's not really up to date on the latest research into RC4 and derivatives. Just reserve this as a toy program unless all else fails.
Python
Here's one I wrote in Python for a question that later got deleted. It processes the file as a stream so memory usage is modest.
Usage
python encrypt.py <key> <rounds> < <infile> > <outfile>
python decrypt.py <key> <rounds> < <infile> > <outfile>
rc4.py
#!/usr/bin/env python
# coding: utf-8
import psyco
from sys import stdin,stdout,argv
def rc4(K):
R=range(256)
S=R[:]
T=bytearray(K*256)[:256]
j=0
for i in R*int(argv[2]):
j=j+S[i]+T[i]&255
S[i],S[j]=S[j],S[i]
i=j=0
while True:
B=stdin.read(4096)
if not B: break
for c in B:
i+=1&255
j=j+S[i]&255
S[i],S[j]=S[j],S[i]
stdout.write(chr(ord(c)^S[S[i]+S[j]&255]))
psyco.bind(rc4)
encrypt.py
from rc4 import *
import os
V=os.urandom(10)
stdout.write(V)
rc4(argv[1]+V)
decrypt.py
from rc4 import *
V=stdin.read(10)
rc4(argv[1]+V)

FileNotOpenedError with Cloud storage on GAE

I am trying to write to a file in cloud storage from the remote api shell and am seeing the following:
s~appid> FILENAME = '/gs/test_bucket/test'
s~appid> writable_file = files.gs.create(FILENAME,
mime_type='application/octet-stream', acl='project-private')
s~appid> with files.open(writable_file, 'a') as f:
... f.write('[]')
...
---------------------------------------------------------------------------
FileNotOpenedError Traceback (most recent call last)
/Users/dhruvkaranmehta/Projects/getaround3/tools/g3/shell.pyc in <module>()
1 with files.open(writable_file, 'a') as f:
----> 2 f.write('[]')
3
/usr/local/google_appengine/google/appengine/api/files/file.pyc in
__exit__(self, atype, value, traceback)
288
289 def __exit__(self, atype, value, traceback):
--> 290 self.close()
291
292 def write(self, data, sequence_key=None):
/usr/local/google_appengine/google/appengine/api/files/file.pyc in
close(self, finalize)
282 request.set_filename(self._filename)
283 request.set_finalize(finalize)
--> 284 self._make_rpc_call_with_retry('Close', request, response)
285
286 def __enter__(self):
/usr/local/google_appengine/google/appengine/api/files/file.pyc in
_make_rpc_call_with_retry(self, method, request, response)
395 def _make_rpc_call_with_retry(self, method, request, response):
396 try:
--> 397 _make_call(method, request, response)
398 except (ApiTemporaryUnavailableError,
FileTemporaryUnavailableError):
399
/usr/local/google_appengine/google/appengine/api/files/file.pyc in
_make_call(method, request, response, deadline)
243 rpc.check_success()
244 except apiproxy_errors.ApplicationError, e:
--> 245 _raise_app_error(e)
246
247
/usr/local/google_appengine/google/appengine/api/files/file.pyc in
_raise_app_error(e)
186 elif (e.application_error ==
187 file_service_pb.FileServiceErrors.FILE_NOT_OPENED):
--> 188 raise FileNotOpenedError()
189 elif (e.application_error ==
190 file_service_pb.FileServiceErrors.READ_ONLY):
FileNotOpenedError:
This seems weird since the file was just opened. I have also seen another scenario where opening a file in 'a' mode leads to a FinalizationError.
Any additional information will be greatly helpful.
Thanks!
For the first part, there's a feature request to support the files api from the remote api shell. Could you try the same using the interactive console (see "Is there an interactive console for public/uploaded app engine apps?")?
Regarding your second error the documentation states that:
You cannot open and write to a file that has already been finalized.

Resources