Cython: ctypedef char array throws obscure error - arrays

Why does this work:
In [17]: %%cython -f
...: from libc.string cimport memcpy
...:
...: DEF KLEN = 5
...: DEF TRP_KLEN = KLEN * 3
...:
...: cdef:
...: unsigned char k[KLEN]
...: unsigned char kk[TRP_KLEN]
...: kk = bytearray(b'12345abcde!##$%')
...: memcpy(&k, &kk[5], KLEN)
...: print(k)
b'abcde'
While this:
In [16]: %%cython -f
...: from libc.string cimport memcpy
...:
...: DEF KLEN = 5
...: DEF TRP_KLEN = KLEN * 3
...: ctypedef unsigned char SingleKey[KLEN]
...: ctypedef unsigned char TripleKey[TRP_KLEN]
...:
...: cdef:
...: SingleKey k
...: TripleKey kk
...: kk = bytearray(b'12345abcde!##$%')
...: memcpy(&k, &kk[5], KLEN)
...: print(k)
throws an obscure error, which doesn't directly mention my code:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-16-a4bb608248b0> in <module>()
----> 1 get_ipython().run_cell_magic('cython', '-f', "from libc.string cimport memcpy\n\nDEF KLEN = 5\nDEF TRP_KLEN = KLEN * 3\nctypedef unsigned char SingleKey[KLEN]\nctypedef unsigned char DoubleKey[TRP_KLEN]\n\ncdef:\n SingleKey k[KLEN]\n DoubleKey kk[TRP_KLEN]\nkk = bytearray(b'12345abcde!##$%')\nmemcpy(&k, &kk[5], KLEN)\nprint(k)")
/usr/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2165 magic_arg_s = self.var_expand(line, stack_depth)
2166 with self.builtin_trap:
-> 2167 result = fn(magic_arg_s, cell)
2168 return result
2169
<decorator-gen-118> in cython(self, line, cell)
/usr/lib/python3.6/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
185 # but it's overkill for just that one bit of state.
186 def magic_deco(arg):
--> 187 call = lambda f, *a, **k: f(*a, **k)
188
189 if callable(arg):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Build/IpythonMagic.py in cython(self, line, cell)
318 extension = None
319 if need_cythonize:
--> 320 extensions = self._cythonize(module_name, code, lib_dir, args, quiet=args.quiet)
321 assert len(extensions) == 1
322 extension = extensions[0]
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Build/IpythonMagic.py in _cythonize(self, module_name, code, lib_dir, args, quiet)
426 elif sys.version_info[0] >= 3:
427 opts['language_level'] = 3
--> 428 return cythonize([extension], **opts)
429 except CompileError:
430 return None
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Build/Dependencies.py in cythonize(module_list, exclude, nthreads, aliases, quiet, force, language, exclude_failures, **options)
1024 if not nthreads:
1025 for args in to_compile:
-> 1026 cythonize_one(*args)
1027
1028 if exclude_failures:
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Build/Dependencies.py in cythonize_one(pyx_file, c_file, fingerprint, quiet, options, raise_on_failure, embedded_metadata, full_module_name, progress)
1127 any_failures = 0
1128 try:
-> 1129 result = compile_single(pyx_file, options, full_module_name=full_module_name)
1130 if result.num_errors > 0:
1131 any_failures = 1
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Main.py in compile_single(source, options, full_module_name)
647 recursion.
648 """
--> 649 return run_pipeline(source, options, full_module_name)
650
651
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Main.py in run_pipeline(source, options, full_module_name, context)
497
498 context.setup_errors(options, result)
--> 499 err, enddata = Pipeline.run_pipeline(pipeline, source)
500 context.teardown_errors(err, options, result)
501 return result
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Pipeline.py in run_pipeline(pipeline, source, printtree)
352 exec("def %s(phase, data): return phase(data)" % phase_name, exec_ns)
353 run = _pipeline_entry_points[phase_name] = exec_ns[phase_name]
--> 354 data = run(phase, data)
355 if DebugFlags.debug_verbose_pipeline:
356 print(" %.3f seconds" % (time() - t))
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Pipeline.py in run(phase, data)
332
333 def run(phase, data):
--> 334 return phase(data)
335
336 error = None
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Pipeline.py in generate_pyx_code_stage(module_node)
50 def generate_pyx_code_stage_factory(options, result):
51 def generate_pyx_code_stage(module_node):
---> 52 module_node.process_implementation(options, result)
53 result.compilation_source = module_node.compilation_source
54 return result
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/ModuleNode.py in process_implementation(self, options, result)
140 self.find_referenced_modules(env, self.referenced_modules, {})
141 self.sort_cdef_classes(env)
--> 142 self.generate_c_code(env, options, result)
143 self.generate_h_code(env, options, result)
144 self.generate_api_code(env, options, result)
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/ModuleNode.py in generate_c_code(self, env, options, result)
376 # generate normal variable and function definitions
377 self.generate_variable_definitions(env, code)
--> 378 self.body.generate_function_definitions(env, code)
379 code.mark_pos(None)
380 self.generate_typeobj_definitions(env, code)
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_function_definitions(self, env, code)
436 #print "StatListNode.generate_function_definitions" ###
437 for stat in self.stats:
--> 438 stat.generate_function_definitions(env, code)
439
440 def generate_execution_code(self, code):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_function_definitions(self, env, code)
9242 entry.cname = cname
9243
-> 9244 self.node.generate_function_definitions(env, code)
9245
9246 def generate_execution_code(self, code):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_function_definitions(self, env, code)
1974 # ----- Function body -----
1975 # -------------------------
-> 1976 self.generate_function_body(env, code)
1977
1978 code.mark_pos(self.pos, trace=False)
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_function_body(self, env, code)
1736
1737 def generate_function_body(self, env, code):
-> 1738 self.body.generate_execution_code(code)
1739
1740 def generate_function_definitions(self, env, code):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_execution_code(self, code)
442 for stat in self.stats:
443 code.mark_pos(stat.pos)
--> 444 stat.generate_execution_code(code)
445
446 def annotate(self, code):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_execution_code(self, code)
6185 for i, if_clause in enumerate(self.if_clauses):
6186 self._set_branch_hint(if_clause, if_clause.body)
-> 6187 if_clause.generate_execution_code(code, end_label, is_last=i == last)
6188 if self.else_clause:
6189 code.mark_pos(self.else_clause.pos)
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_execution_code(self, code, end_label, is_last)
6247 self.condition.generate_disposal_code(code)
6248 self.condition.free_temps(code)
-> 6249 self.body.generate_execution_code(code)
6250 code.mark_pos(self.pos, trace=False)
6251 if not (is_last or self.body.is_terminator):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_execution_code(self, code)
442 for stat in self.stats:
443 code.mark_pos(stat.pos)
--> 444 stat.generate_execution_code(code)
445
446 def annotate(self, code):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/UtilNodes.py in generate_execution_code(self, code)
324 def generate_execution_code(self, code):
325 self.setup_temp_expr(code)
--> 326 self.body.generate_execution_code(code)
327 self.teardown_temp_expr(code)
328
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_execution_code(self, code)
6605 self.item.generate_evaluation_code(code)
6606 self.target.generate_assignment_code(self.item, code)
-> 6607 self.body.generate_execution_code(code)
6608 code.mark_pos(self.pos)
6609 code.put_label(code.continue_label)
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_execution_code(self, code)
442 for stat in self.stats:
443 code.mark_pos(stat.pos)
--> 444 stat.generate_execution_code(code)
445
446 def annotate(self, code):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_execution_code(self, code)
5100 def generate_execution_code(self, code):
5101 code.mark_pos(self.pos)
-> 5102 self.generate_rhs_evaluation_code(code)
5103 self.generate_assignment_code(code)
5104
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/Nodes.py in generate_rhs_evaluation_code(self, code)
5387
5388 def generate_rhs_evaluation_code(self, code):
-> 5389 self.rhs.generate_evaluation_code(code)
5390
5391 def generate_assignment_code(self, code, overloaded_assignment=False):
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/ExprNodes.py in generate_evaluation_code(self, code)
718 self.allocate_temp_result(code)
719
--> 720 self.generate_result_code(code)
721 if self.is_temp and not (self.type.is_string or self.type.is_pyunicode_ptr):
722 # If we are temp we do not need to wait until this node is disposed
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/ExprNodes.py in generate_result_code(self, code)
13135
13136 code.putln(self.type.from_py_call_code(
> 13137 self.arg.py_result(), self.result(), self.pos, code, from_py_function=from_py_function))
13138 if self.type.is_pyobject:
13139 code.put_gotref(self.py_result())
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/PyrexTypes.py in from_py_call_code(self, source_code, result_code, error_pos, code, from_py_function, error_condition)
512 source_code, result_code, error_pos, code,
513 from_py_function or self.from_py_function,
--> 514 error_condition or self.error_condition(result_code)
515 )
516
~/code/lsup/virtualenv/lib/python3.6/site-packages/Cython/Compiler/PyrexTypes.py in from_py_call_code(self, source_code, result_code, error_pos, code, from_py_function, error_condition)
2481 def from_py_call_code(self, source_code, result_code, error_pos, code,
2482 from_py_function=None, error_condition=None):
-> 2483 assert not error_condition, '%s: %s' % (error_pos, error_condition)
2484 call_code = "%s(%s, %s, %s)" % (
2485 from_py_function or self.from_py_function,
AssertionError: (<StringSourceDescriptor:carray.from_py>, 87, 19): (!__pyx_t_11) && PyErr_Occurred()
This is especially problematic in my 1000+-line program, where I had to find out what was causing the problem. Removing all ctypedefs for char arrays that were being assigned removed the error, but I'd like to know what is causing it.
Thanks.
UPDATE
Note: This is not a NUL-terminated string, but a byte sequence that may have NULs anywhere. That's why I am using memcpy rather than string functions or regular = assignments.

Related

Error While loading Data set using seaborn library

import seaborn as sns
titanic=sns.load_dataset('titanic')
TimeoutError Traceback (most recent call last)
~/opt/anaconda3/lib/python3.9/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
1345 try:
-> 1346 h.request(req.get_method(), req.selector, req.data, headers,
1347 encode_chunked=req.has_header('Transfer-encoding'))
~/opt/anaconda3/lib/python3.9/http/client.py in request(self, method, url, body, headers, encode_chunked)
1284 """Send a complete request to the server."""
-> 1285 self._send_request(method, url, body, headers, encode_chunked)
1286
~/opt/anaconda3/lib/python3.9/http/client.py in _send_request(self, method, url, body,
headers, encode_chunked)
1330 body = _encode(body, 'body')
1331 self.endheaders(body, encode_chunked=encode_chunked)
1332
~/opt/anaconda3/lib/python3.9/http/client.py in endheaders(self, message_body, encode_chunked)
1279 raise CannotSendHeader()
1280 self._send_output(message_body, encode_chunked=encode_chunked)
1281
~/opt/anaconda3/lib/python3.9/http/client.py in _send_output(self, message_body, encode_chunked)
1039 del self._buffer[:]
1040 self.send(msg)
1041
~/opt/anaconda3/lib/python3.9/http/client.py in send(self, data)
979 if self.auto_open:
980 self.connect()
981 else:
~/opt/anaconda3/lib/python3.9/http/client.py in connect(self)
1446
1447 super().connect()
1448
~/opt/anaconda3/lib/python3.9/http/client.py in connect(self)
945 """Connect to the host and port specified in init."""
946 self.sock = self._create_connection(
947 (self.host,self.port), self.timeout, self.source_address)
~/opt/anaconda3/lib/python3.9/socket.py in create_connection(address, timeout, source_address)
843 try:
844 raise err
845 finally:
~/opt/anaconda3/lib/python3.9/socket.py in create_connection(address, timeout, source_address)
831 sock.bind(source_address)
832 sock.connect(sa)
833 # Break explicitly a reference cycle
TimeoutError: [Errno 60] Operation timed out
During handling of the above exception, another exception occurred:
URLError Traceback (most recent call last)
/var/folders/7v/5_bfsn0x0xbdx1w5zzv68lwr0000gn/T/ipykernel_6934/3828376077.py in <module>
1 titanic=sns.load_dataset('titanic')
~/opt/anaconda3/lib/python3.9/site-packages/seaborn/utils.py in load_dataset(name, cache, data_home, **kws)
594 if name not in get_dataset_names():
595 raise ValueError(f"'{name}' is not one of the example datasets.")
596 urlretrieve(url, cache_path)
597 full_path = cache_path
598 else:
~/opt/anaconda3/lib/python3.9/urllib/request.py in urlretrieve(url, filename, reporthook, data)
237 url_type, path = _splittype(url)
238
239 with contextlib.closing(urlopen(url, data)) as fp:
240 headers = fp.info()
241
~/opt/anaconda3/lib/python3.9/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
212 else:
213 opener = _opener
214 return opener.open(url, data, timeout)
215
216 def install_opener(opener):
~/opt/anaconda3/lib/python3.9/urllib/request.py in open(self, fullurl, data, timeout)
515
516 sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
517 response = self._open(req, data)
518
519 # post-process response
~/opt/anaconda3/lib/python3.9/urllib/request.py in _open(self, req, data)
532
533 protocol = req.type
534 result = self._call_chain(self.handle_open, protocol, protocol +
535 '_open', req)
536 if result:
~/opt/anaconda3/lib/python3.9/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
492 for handler in handlers:
493 func = getattr(handler, meth_name)
494 result = func(*args)
495 if result is not None:
496 return result
~/opt/anaconda3/lib/python3.9/urllib/request.py in https_open(self, req)
1387
1388 def https_open(self, req):
1389 return self.do_open(http.client.HTTPSConnection, req,
1390 context=self._context, check_hostname=self._check_hostname)
1391
~/opt/anaconda3/lib/python3.9/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
1347 encode_chunked=req.has_header('Transfer-encoding'))
1348 except OSError as err: # timeout error
1349 raise URLError(err)
1350 r = h.getresponse()
1351 except:
URLError: <urlopen error [Errno 60] Operation timed out>
How do I fix this?
I have never run into this issue before. This is a new laptop, a Mac to be precise; I previously used only Windows.

Python Impute using BayesianRidge() sklearn impute.IterativeImputer regression impute analysis value error

PROBLEM
I am using IterativeImputer from sklearn.impute with a BayesianRidge() estimator to fit a regression model that imputes the missing data in the variable 'Frontage'.
The fit step, interative_imputer_fit = interative_imputer.fit(data), runs, but when I invoke my function imputer_bay_ridge(data), the transform() call on the fitted imputer, i.e. interative_imputer_fit.transform(X), raises a ValueError. I passed in two variables, Frontage and Area, but only Frontage was inside the numpy.array.
Python CODE using sklearn
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge
# Fits an IterativeImputer that uses BayesianRidge as its estimator on the
# array form of `data`, then calls transform() on a single column of `data`.
# NOTE(review): indentation was lost in this listing; the lines below are
# assumed to form the function body.
def imputer_bay_ridge(data):
data_array = data.to_numpy()
# NOTE(review): the result of reshape() is not assigned, so (assuming
# data_array is a NumPy array) this line leaves data_array unchanged.
data_array.reshape(1, -1)
interative_imputer = IterativeImputer(BayesianRidge())
interative_imputer_fit = interative_imputer.fit(data_array)
# NOTE(review): the imputer is fitted on the whole of data_array, but X is a
# single column selected by label. The traceback in this post reports
# "Expected 2D array, got 1D array instead" for the transform(X) call.
# NOTE(review): the column label here is 'LotFrontage', while the sample
# invocation in this post passes columns named 'Frontage' and 'Area' --
# confirm which names the real frame uses.
X = data['LotFrontage']
data_imputed = interative_imputer_fit.transform(X)
# NOTE(review): data_imputed is not returned in the lines shown, so the caller
# presumably receives None -- confirm whether a return statement was omitted.
train_data[['Frontage', 'Area']]
INVOKE FUNCTION
fit_tranformed_imputed = imputer_bay_ridge(train_data[['Frontage', 'Area']])
DATA EXAMPLE
train_data[['Frontage', 'Area']]
Frontage Area
0 65.0 8450
1 80.0 9600
2 68.0 11250
3 60.0 9550
4 84.0 14260
... ... ...
1455 62.0 7917
1456 85.0 13175
1457 66.0 9042
1458 68.0 9717
1459 75.0 9937
1460 rows × 2 columns
ERROR
ValueError Traceback (most recent call last)
Cell In[243], line 1
----> 1 fit_tranformed_imputed = imputer_bay_ridge(train_data[['LotFrontage', 'LotArea']])
Cell In[242], line 12, in imputer_bay_ridge(data)
10 interative_imputer_fit = interative_imputer.fit(data_array)
11 X = data['LotFrontage']
---> 12 data_imputed = interative_imputer_fit.transform(X)
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/impute/_iterative.py:724, in IterativeImputer.transform(self, X)
707 """Impute all missing values in `X`.
708
709 Note that this is stochastic, and that if `random_state` is not fixed,
(...)
720 The imputed input data.
721 """
722 check_is_fitted(self)
--> 724 X, Xt, mask_missing_values, complete_mask = self._initial_imputation(X)
726 X_indicator = super()._transform_indicator(complete_mask)
728 if self.n_iter_ == 0 or np.all(mask_missing_values):
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/impute/_iterative.py:514, in IterativeImputer._initial_imputation(self, X, in_fit)
511 else:
512 force_all_finite = True
--> 514 X = self._validate_data(
515 X,
516 dtype=FLOAT_DTYPES,
517 order="F",
518 reset=in_fit,
519 force_all_finite=force_all_finite,
520 )
521 _check_inputs_dtype(X, self.missing_values)
523 X_missing_mask = _get_mask(X, self.missing_values)
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py:566, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
564 raise ValueError("Validation should be done on X, y or both.")
565 elif not no_val_X and no_val_y:
--> 566 X = check_array(X, **check_params)
567 out = X
568 elif no_val_X and not no_val_y:
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py:769, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
767 # If input is 1D raise error
768 if array.ndim == 1:
--> 769 raise ValueError(
770 "Expected 2D array, got 1D array instead:\narray={}.\n"
771 "Reshape your data either using array.reshape(-1, 1) if "
772 "your data has a single feature or array.reshape(1, -1) "
773 "if it contains a single sample.".format(array)
774 )
776 # make sure we actually converted to numeric:
777 if dtype_numeric and array.dtype.kind in "OUSV":
ValueError: Expected 2D array, got 1D array instead:
array=[65. 80. 68. ... 66. 68. 75.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

How to fix the 'The columns in the computed data do not match the columns in the provided metadata' error?

I have a table of price updates in the format (timestamp, price, amount).
The timestamp is a datetime, price categorical and amount float64. The timestamp column is set as an index.
My goal is to get the amount available at each price level at each point in time.
First, I use the pivot to spread the prices into columns, and then forward fill.
pivot = price_table.pivot_table(index = 'timestamp',
columns = 'price', values = 'amount')
pivot_ffill = pivot.fillna(method = 'ffill')
I can compute or apply head to pivot_ffill and it works fine.
Clearly, there are still NAs at the beginning of the table where there have been no updates yet.
When I apply
pivot_nullfill = pivot_ffill.fillna(0)
pivot_nullfill.head()
I do get an error
The columns in the computed data do not match the columns in the provided metadata. I tried replacing the zero with 0.0 or float(0), but to no avail. As the previous steps work, I strongly suspect it has something to do with the fillna, but due to the delayed calculations that does not have to be true.
Does someone know what causes this? Thank you!
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-180-f8ab344c7939> in <module>
----> 1 pivot_ffill.fillna(0).head()
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\dataframe\core.py in head(self, n, npartitions, compute)
896
897 if compute:
--> 898 result = result.compute()
899 return result
900
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\base.py in compute(*args, **kwargs)
396 keys = [x.__dask_keys__() for x in collections]
397 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 398 results = schedule(dsk, keys, **kwargs)
399 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
400
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\threaded.py in get(dsk, result, cache, num_workers, pool, **kwargs)
74 results = get_async(pool.apply_async, len(pool._pool), dsk, result,
75 cache=cache, get_id=_thread_get_id,
---> 76 pack_exception=pack_exception, **kwargs)
77
78 # Cleanup pools associated to dead threads
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
460 _execute_task(task, data) # Re-execute locally
461 else:
--> 462 raise_exception(exc, tb)
463 res, worker_id = loads(res_info)
464 state['cache'][key] = res
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\compatibility.py in reraise(exc, tb)
110 if exc.__traceback__ is not tb:
111 raise exc.with_traceback(tb)
--> 112 raise exc
113
114 import pickle as cPickle
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
228 try:
229 task, data = loads(task_info)
--> 230 result = _execute_task(task, data)
231 id = get_id()
232 result = dumps((result, id))
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\core.py in _execute_task(arg, cache, dsk)
116 elif istask(arg):
117 func, args = arg[0], arg[1:]
--> 118 args2 = [_execute_task(a, cache) for a in args]
119 return func(*args2)
120 elif not ishashable(arg):
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\core.py in <listcomp>(.0)
116 elif istask(arg):
117 func, args = arg[0], arg[1:]
--> 118 args2 = [_execute_task(a, cache) for a in args]
119 return func(*args2)
120 elif not ishashable(arg):
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\core.py in _execute_task(arg, cache, dsk)
117 func, args = arg[0], arg[1:]
118 args2 = [_execute_task(a, cache) for a in args]
--> 119 return func(*args2)
120 elif not ishashable(arg):
121 return arg
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\optimization.py in __call__(self, *args)
940 % (len(self.inkeys), len(args)))
941 return core.get(self.dsk, self.outkey,
--> 942 dict(zip(self.inkeys, args)))
943
944 def __reduce__(self):
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\core.py in get(dsk, out, cache)
147 for key in toposort(dsk):
148 task = dsk[key]
--> 149 result = _execute_task(task, cache)
150 cache[key] = result
151 result = _execute_task(out, cache)
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\core.py in _execute_task(arg, cache, dsk)
117 func, args = arg[0], arg[1:]
118 args2 = [_execute_task(a, cache) for a in args]
--> 119 return func(*args2)
120 elif not ishashable(arg):
121 return arg
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\compatibility.py in apply(func, args, kwargs)
91 def apply(func, args, kwargs=None):
92 if kwargs:
---> 93 return func(*args, **kwargs)
94 else:
95 return func(*args)
C:\ProgramData\Anaconda3\envs\python36\lib\site-packages\dask\dataframe\core.py in apply_and_enforce(*args, **kwargs)
3800 if not np.array_equal(np.nan_to_num(meta.columns),
3801 np.nan_to_num(df.columns)):
-> 3802 raise ValueError("The columns in the computed data do not match"
3803 " the columns in the provided metadata")
3804 else:
ValueError: The columns in the computed data do not match the columns in the provided metadata
The error message should have given you a suggestion of how to fix the situation. We assume you are loading from CSV (the question doesn't say), so you would probably end up with a line like
df = dd.read_csv(..., dtype={...})
which instructs the pandas reader on the dtypes you want to enforce, since you know more information than pandas does. That ensures that all partitions have the same types for all columns - see the notes part of the docs.

Implementation of merge sort using threads and fork

Problem: I'm trying to implement merge sort in the following way: I have a parent and two children. The first child runs merge sort on its own. The second child works as follows: it creates 2 threads, the first of which sorts the first half of the array while the second sorts the rest. Then, after calling merge sort, it again creates 2 threads for the first half and 2 threads for the rest, and so on, until we reach the base case and finish. In the end, I want to check how much faster the second child's merge sort is than the first child's.
My question: I've created 2 children. The first child implements the merge sort and everything is fine. In the second child, I was able to create only 2 threads instead of many more (2 for each half, and so on), and in the end it prints neither the array nor the date at which it finished.
This is the code for the second child:
if((id2 = fork()) == 0 && id1 != 0)
{
printf("Child2: \n");
ans1 = pthread_create ( &thread1 , NULL , mergeSort ,(arr3, (arr_size / 2) - 1 ,arr_size - 1 )) ;
ans2 = pthread_create ( &thread2 , NULL , mergeSort ,(arr3, 0, (arr_size / 2)- 1 )) ;
ans3 = pthread_create ( &thread3 , NULL , printArray ,(arr3, arr_size) ) ;
execl("/bin/date", "date",0);
if ( ans1 != 0 || ans2 != 0 || ans3 != 0) {
printf ( " \n can't create threads " ) ;
exit(0) ;
}
pthread_join ( thread1 , NULL ) ;
pthread_join ( thread2 , NULL ) ;
pthread_join ( thread3 , NULL ) ;
}
I'm using UNIX, and for compiling:
gcc -lpthread prog.c
for executing:
./a.out
This is the whole code:
/* C program for Merge Sort */
#include<stdlib.h>
#include<stdio.h>
#include <pthread.h>
#define N 100
/*
 * Merge two adjacent, already-sorted runs of arr[] in place.
 *
 *   arr[l..m]    first (left) run
 *   arr[m+1..r]  second (right) run
 *
 * Afterwards arr[l..r] is sorted. When two elements compare equal, the one
 * from the left run is written first.
 */
void merge(int arr[], int l, int m, int r)
{
    const int left_len = m - l + 1;
    const int right_len = r - m;
    /* Scratch copies of both runs (variable-length arrays on the stack). */
    int left[left_len], right[right_len];
    int li, ri, out;

    for (li = 0; li < left_len; ++li)
        left[li] = arr[l + li];
    for (ri = 0; ri < right_len; ++ri)
        right[ri] = arr[m + 1 + ri];

    /* Write the smaller head element back until one run is exhausted. */
    li = 0;
    ri = 0;
    for (out = l; li < left_len && ri < right_len; ++out)
        arr[out] = (left[li] <= right[ri]) ? left[li++] : right[ri++];

    /* At most one of the two runs still has elements; copy them over. */
    for (; li < left_len; ++li, ++out)
        arr[out] = left[li];
    for (; ri < right_len; ++ri, ++out)
        arr[out] = right[ri];
}
/*
 * Recursively merge-sort arr[l..r] in place.
 * l is the left index and r the right index (both inclusive) of the
 * sub-array to sort; a range with fewer than two elements is left untouched.
 */
void mergeSort(int arr[], int l, int r)
{
    if (l >= r)
        return;

    /* Same as (l+r)/2, but avoids overflow for large l and r. */
    const int mid = l + (r - l) / 2;

    mergeSort(arr, l, mid);       /* sort the first half  */
    mergeSort(arr, mid + 1, r);   /* sort the second half */
    merge(arr, l, mid, r);        /* combine both halves  */
}
/* UTILITY FUNCTIONS */
/*
 * Print the first `size` elements of A on one line of standard output,
 * each followed by a single space, then a newline.
 */
void printArray(int A[], int size)
{
    for (int idx = 0; idx < size; ++idx)
        printf("%d ", A[idx]);
    printf("\n");
}
/* Driver program to test above functions */
/*
 * Fills three arrays with the same pseudo-random values, then forks two
 * children: child 1 sorts arr2 with mergeSort(), child 2 tries to sort arr3
 * using threads, and the parent sorts arr. Each process ends by exec'ing
 * /bin/date to print the time.
 *
 * The NOTE(review) comments below mark constructs that do not do what the
 * surrounding text of this post expects; the code itself is unchanged.
 */
int main()
{
int min = -1000, max = 1000;
int arr[10], arr2[10], arr3[10];
int i,r;
int arr_size = sizeof(arr)/sizeof(arr[0]);
int id1,id2;
//Threads init
pthread_t thread1 , thread2, thread3;
int ans1, ans2, ans3;
/* NOTE(review): rand() is never seeded in this listing, and `min` is not
   added to the result, so the values fall in 0..(max - min). */
for( i = 0; i < arr_size; i++){
r = rand() % (max - min + 1);
arr[i] = r;
arr2[i] = r;
arr3[i] = r;
}
//printf("Before: \n");
/* Child 1: plain recursive merge sort on its own copy of the data. */
if((id1 = fork()) == 0)
{
printf("Child1: \n");
mergeSort(arr2, 0, arr_size - 1);
printArray(arr2, arr_size);
/* On success execl() replaces this process with /bin/date and does not
   return. NOTE(review): the argument list is terminated with a bare 0;
   POSIX documents the terminator as (char *)0. */
execl("/bin/date", "date",0);
}
/* Child 2: the threaded variant. The `id1 != 0` test keeps a child-1 process
   (one whose execl() failed) from entering this branch. */
if((id2 = fork()) == 0 && id1 != 0)
{
printf("Child2: \n");
/* NOTE(review): the 4th argument of each call below is a parenthesised comma
   expression, so only its LAST operand is passed (e.g. arr_size - 1); the
   array and the other index are evaluated and discarded. pthread_create()
   expects a start routine of type void *(*)(void *) and a single void *
   argument; mergeSort and printArray have different signatures. */
/* NOTE(review): the two intended ranges overlap at index (arr_size / 2) - 1,
   and nothing merges the two halves afterwards. */
ans1 = pthread_create ( &thread1 , NULL , mergeSort ,(arr3, (arr_size / 2) - 1 ,arr_size - 1 )) ;
ans2 = pthread_create ( &thread2 , NULL , mergeSort ,(arr3, 0, (arr_size / 2)- 1 )) ;
/* NOTE(review): the printing thread is started without waiting for the
   sorting threads to finish. */
ans3 = pthread_create ( &thread3 , NULL , printArray ,(arr3, arr_size) ) ;
/* NOTE(review): execl() is called before the threads are joined. When it
   succeeds the process image is replaced, so the error check and the
   pthread_join() calls below are never reached. */
execl("/bin/date", "date",0);
if ( ans1 != 0 || ans2 != 0 || ans3 != 0) {
printf ( " \n can't create threads " ) ;
exit(0) ;
}
pthread_join ( thread1 , NULL ) ;
pthread_join ( thread2 , NULL ) ;
pthread_join ( thread3 , NULL ) ;
}
/* NOTE(review): wait() is called with no argument and no header declaring it
   is included in this listing; POSIX declares it as pid_t wait(int *) in
   <sys/wait.h>. It is called once although two children are forked. */
wait();
/* Parent only: both fork() calls returned a non-zero value here. */
if(id1 != 0 && id2 != 0){
printf("Given array is \n");
printArray(arr, arr_size);
printf("Father:\n");
mergeSort(arr, 0, arr_size - 1);
printArray(arr, arr_size);
/* NOTE(review): when this execl() succeeds, the printf() after it is never
   executed. */
execl("/bin/date", "date",0);
printf("\nSorted array is \n");
//printf("After: \n");
}
return 0;
}
EDITED CODE:
/* C program for Merge Sort */
#include<stdlib.h>
#include<stdio.h>
#include <pthread.h>
#include <time.h>
#define N 100
/*
 * Merges two neighbouring sorted sub-arrays of arr[] into one sorted range.
 * The first sub-array is arr[l..m], the second is arr[m+1..r]; the result
 * occupies arr[l..r]. Equal elements are taken from the first sub-array
 * before the second.
 */
void merge(int arr[], int l, int m, int r)
{
    int n_lo = m - l + 1;
    int n_hi = r - m;
    /* Temporary copies of the two sub-arrays. */
    int lo[n_lo], hi[n_hi];
    int a = 0;      /* next unread element of lo[] */
    int b = 0;      /* next unread element of hi[] */
    int dst = l;    /* next slot to fill in arr[]  */
    int idx;

    for (idx = 0; idx < n_lo; idx++)
        lo[idx] = arr[l + idx];
    for (idx = 0; idx < n_hi; idx++)
        hi[idx] = arr[m + 1 + idx];

    /* Single pass: take from lo[] when hi[] is exhausted or when lo[] still
       has an element that is not larger than the head of hi[]. */
    while (a < n_lo || b < n_hi)
    {
        if (b >= n_hi || (a < n_lo && lo[a] <= hi[b]))
            arr[dst++] = lo[a++];
        else
            arr[dst++] = hi[b++];
    }
}
/*
 * Sort arr[l..r] (inclusive bounds) in place with top-down merge sort.
 * Ranges holding zero or one element are already sorted and return at once.
 */
void mergeSort(int arr[], int l, int r)
{
    int half;

    if (!(l < r))
        return;

    /* Midpoint computed as l + (r - l) / 2 rather than (l + r) / 2 so the
       sum cannot overflow for large indices. */
    half = l + (r - l) / 2;

    mergeSort(arr, l, half);
    mergeSort(arr, half + 1, r);
    merge(arr, l, half, r);
}
/*
 * Thread start routine: merge-sort one range of an array, handing each half
 * of the range to a newly created thread and merging once both are done.
 *
 * args points to three ints laid out as { array, l, r }, where l and r are
 * the inclusive bounds of the range to sort.
 * NOTE(review): the array is carried in an int slot and passed on to merge()
 * as-is, exactly as in the original listing. Where a pointer is wider than
 * int this loses information; changing that requires changing the caller,
 * which is outside this function.
 *
 * Always returns NULL.
 */
void* mergeSort2(void* args)
{
    int* newArgs = (int*)args;
    int l = newArgs[1];
    int r = newArgs[2];

    if (l < r)
    {
        pthread_t thread1, thread2;
        /* Same as (l+r)/2 for valid indices, but avoids overflow for large
           l and r; matches the midpoint used by mergeSort(). */
        int m = l + (r - l) / 2;
        /* The child threads read these arrays from this thread's stack; that
           is safe because both threads are joined before we return. */
        int newArgs1[3] = {newArgs[0], l, m};
        int newArgs2[3] = {newArgs[0], m + 1, r};
        int ans1 = pthread_create(&thread1, NULL, mergeSort2, (void*)newArgs1);
        int ans2 = pthread_create(&thread2, NULL, mergeSort2, (void*)newArgs2);

        /* Join only threads that were really created. If creation failed
           (for example, the thread limit was hit), sort that half in the
           current thread so the result is still complete. */
        if (ans1 == 0)
            pthread_join(thread1, NULL);
        else
            mergeSort2((void*)newArgs1);

        if (ans2 == 0)
            pthread_join(thread2, NULL);
        else
            mergeSort2((void*)newArgs2);

        merge(newArgs[0], l, m, r);
    }

    /* A void * function must return a value; nothing is reported back. */
    return NULL;
}
/* UTILITY FUNCTIONS */
/*
 * Write A[0] .. A[size-1] to standard output on a single line. Every value
 * is followed by one space, and the line is closed with a newline.
 */
void printArray(int A[], int size)
{
    int pos = 0;

    while (pos < size)
    {
        printf("%d ", A[pos]);
        pos++;
    }
    printf("\n");
}
/* Print the current UTC date and time to stdout as "YYYY-MM-DD HH:MM:SS",
   followed by a newline. */
static void print_timestamp(void)
{
    char stamp[32];
    time_t seconds = time(NULL);

    /* gmtime() converts the epoch seconds to broken-down UTC time. */
    strftime(stamp, sizeof stamp, "%Y-%m-%d %H:%M:%S", gmtime(&seconds));
    puts(stamp);
}
/* Microseconds elapsed between two gettimeofday() samples. */
static long elapsed_us(const struct timeval *before, const struct timeval *after)
{
    return (long)(((after->tv_sec - before->tv_sec) * 1000000L
                   + after->tv_usec) - before->tv_usec);
}

/*
 * Driver program: fills three identical arrays with random values, then
 * sorts one copy in each of three processes and reports the elapsed time.
 *   child 1 - recursive mergeSort() on arr2
 *   child 2 - threaded mergeSort2() on arr3
 *   parent  - recursive mergeSort() on arr, after both children have exited
 */
int main(void)
{
    int min = -1000, max = 1000;
    int arr[10], arr2[10], arr3[10];
    int i, r;
    int arr_size = sizeof(arr) / sizeof(arr[0]);
    /*
     * NOTE(review): this stores the address of arr3 in an int, which only
     * works where a pointer fits in an int. It is kept because mergeSort2()
     * decodes exactly this int[3] layout; both must change together.
     */
    int args[3] ={arr3, 0, arr_size - 1};
    struct timeval tvalBefore, tvalAfter;
    pthread_t thread1;
    int ans1;

    srand(time(NULL));
    for (i = 0; i < arr_size; i++)
    {
        /* NOTE(review): min only widens the modulus here, so values fall in
           0..(max - min), not min..max. Confirm the intended range. */
        r = rand() % (max - min + 1);
        arr[i] = r;
        arr2[i] = r;
        arr3[i] = r;
    }

    if (fork() == 0)
    {
        /* Child 1: single-threaded sort. */
        gettimeofday(&tvalBefore, NULL);
        printf("Child1: \n");
        mergeSort(arr2, 0, arr_size - 1);
        printArray(arr2, arr_size);
        print_timestamp();
        gettimeofday(&tvalAfter, NULL);
        printf("Time in microseconds for sorting CHILD 1: %ld microseconds\n",
               elapsed_us(&tvalBefore, &tvalAfter));
    }
    else if (fork() == 0)
    {
        /* Child 2: threaded sort. */
        printf("Child2: \n");
        gettimeofday(&tvalBefore, NULL);
        ans1 = pthread_create(&thread1, NULL, mergeSort2, (void*)args);
        /* Only join a thread that was actually created. */
        if (ans1 == 0)
            pthread_join(thread1, NULL);
        else
            fprintf(stderr, "pthread_create failed: %d\n", ans1);
        print_timestamp();
        gettimeofday(&tvalAfter, NULL);
        printf("Time in microseconds for sorting CHILD 2: %ld microseconds\n",
               elapsed_us(&tvalBefore, &tvalAfter));
    }
    else
    {
        /* Parent: reap both children first. wait() requires a status
           pointer argument; NULL means the exit status is not wanted. */
        wait(NULL);
        wait(NULL);
        gettimeofday(&tvalBefore, NULL);
        printf("Given array is \n");
        printArray(arr, arr_size);
        printf("Father:\n");
        mergeSort(arr, 0, arr_size - 1);
        printArray(arr, arr_size);
        print_timestamp();
        gettimeofday(&tvalAfter, NULL);
        printf("Time in microseconds for sorting Father: %ld microseconds\n",
               elapsed_us(&tvalBefore, &tvalAfter));
    }
    return 0;
}
You have several problems:
as noted in comments and Jonathan's answer, you call exec and replace your whole process image before your threads complete (and possibly before they actually start, since they may not have been given their first timeslice yet)
if you move that, you still have the problem that your printArray function was run in parallel to your sort threads, instead of afterwards
if you fix that, you still have the problem that your printArray thread was started improperly (with a likely invalid input pointer), for the same reason as for the sorting threads, described in more detail below
if you fix the printing, your sorting thread invocation is completely wrong (much detail follows below)
if you fix the thread invocation, your code still doesn't do what you claim you wanted: to keep starting new child threads for smaller and smaller sub-ranges of your input array
Let's start with the prototype of pthread_create, the declaration of your thread function, and the thread creation call:
int pthread_create(pthread_t *thread, const pthread_attr_t *attr,
void *(*start_routine) (void *), void *arg);
this requires a function of shape void* start_routine(void *) as its third argument. However, you have
void mergeSort(int arr[], int l, int r) { ... }
which will nevertheless be called with only the first argument having a defined value. I'm amazed your compiler didn't warn about this.
Now, consider the fourth argument to pthread_create in the following call:
ans1 = pthread_create(&thread1, NULL,
mergeSort,
(arr3, (arr_size / 2) - 1 ,arr_size - 1 )) ;
it takes the expression (arr3, (arr_size / 2) - 1 ,arr_size - 1 ). However, C doesn't have tuple types, and even if it did they wouldn't be convertible to void*. Instead this uses the comma operator , to discard the results of the first two expressions, and so you're actually using the integer value of arr_size - 1 as a pointer argument.
I'd expect it to crash when it tries to start the child thread - you didn't say how your program failed, but a SEGV would be common. You can catch these in a debugger, but it'll be somewhere inside the pthread library code, so it might not help much.
A sane solution for your problem would look something like this un-tested and never-compiled sample code:
/* use this for the fourth argument to pthread_create */
struct Range {
int *array;
int left;
int right;
pthread_t thread;
};
void mergeSortRange(Range *r) {
const int width = (right - left);
const int mid = left + (width/2);
if (width > THRESHOLD) {
/* wide enough to be worth a child thread */
Range left = { r->array, r->left, mid };
Range right = { r->array, mid+1, r->right };
pthread_create(&left.thread, NULL,
mergeSortRangeThreadFunction,
&left);
mergeSortRange(&right);
pthread_join(left.thread);
mergeSortedHalved(r->array, r->left, mid, r->right);
} else {
regularSingleThreadedMergeSort(r->array, r->left, r->right);
}
}
/* this is what you pass to pthread_create */
void* mergeSortRangeThreadFunction(void *data) {
Range *r = (Range *)data;
mergeSortRange(r);
return data;
}
although, even with THRESHOLD set to something good, it's better to use a thread pool than to start & stop threads repeatedly.
Finally, of course, you don't need to use recursion to start these threads and populate these Range structures - you could just create an array of size/THRESHOLD + 1 range descriptors, create one thread per core, and then figure out some logic for deciding when you're allowed to merge two consecutive ranges.
Program stops because of calls to execl()
You have:
…
ans3 = pthread_create ( &thread3 , NULL , printArray ,(arr3, arr_size) ) ;
execl("/bin/date", "date",0);
if ( ans1 != 0 || ans2 != 0 || ans3 != 0) {
…
The execl() replaces your process and all its threads with date, which produces its output and exits. You can't time-stamp your work like that!
You probably need to call time() or a higher-resolution timing mechanism, and then localtime() or gmtime() to create a broken-down time, and then strftime() to format it as you want, and finally printf() or similar to print the result. That all belongs in a function, of course, not in your code.
#include <stdio.h>
#include <time.h>
/* Print the current UTC time to stdout in the form YYYY-MM-DDTHH:MM:SS,
   followed by a newline. */
static void print_timestamp(void)
{
    char text[32];
    time_t current = time(NULL);
    struct tm *broken_down = gmtime(&current);

    strftime(text, sizeof text, "%Y-%m-%dT%H:%M:%S", broken_down);
    puts(text);
}
Where you have execl(), call print_timestamp() instead.
Or, more simply, use system() instead of execl():
system("/bin/date");
This is a grotesquely heavyweight way of reporting the time, but it has the merit of simplicity.
Sub-second resolution times
I need to determine the time in milliseconds.
It depends on your platform, but on POSIX-ish systems you can use clock_gettime() or gettimeofday() to get sub-second timing.
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
/* UTC to seconds: prints YYYY-MM-DDTHH:MM:SS and a newline to stdout. */
static void print_timestamp(void)
{
    char line[32];
    time_t epoch_seconds;

    /* time() also stores its result through a non-null pointer argument. */
    time(&epoch_seconds);
    strftime(line, sizeof line, "%Y-%m-%dT%H:%M:%S", gmtime(&epoch_seconds));
    puts(line);
}
/* UTC to milliseconds: prints YYYY-MM-DDTHH:MM:SS.mmm and a newline to
   stdout, using gettimeofday() for the sub-second part. */
static void print_utc_ms(void)
{
    struct timeval tv;
    gettimeofday(&tv, 0);

    /* Hand gmtime() a real time_t rather than the address of tv.tv_sec. */
    time_t seconds = tv.tv_sec;
    struct tm *utc = gmtime(&seconds);
    char iso8601[32];
    strftime(iso8601, sizeof(iso8601), "%Y-%m-%dT%H:%M:%S", utc);

    /* tv_usec is a suseconds_t, whose width differs between platforms
       (long on glibc). The quotient is 0..999, so convert it to the int
       that %d expects instead of passing a mismatched type to printf. */
    printf("%s.%.3d\n", iso8601, (int)(tv.tv_usec / 1000));
}
/* Local time to microseconds: prints YYYY-MM-DDTHH:MM:SS.uuuuuu and a
   newline to stdout, using clock_gettime() for the sub-second part. */
static void print_local_us(void)
{
    char text[32];
    struct timespec now;

    clock_gettime(CLOCK_REALTIME, &now);    // CLOCK_MONOTONIC has merits too

    /* The clock reports nanoseconds; keep only whole microseconds. */
    const long micros = now.tv_nsec / 1000L;

    strftime(text, sizeof text, "%Y-%m-%dT%H:%M:%S", localtime(&now.tv_sec));
    printf("%s.%.6ld\n", text, micros);
}
int main(void)
{
print_timestamp();
print_utc_ms();
print_local_us();
return 0;
}
Example output:
2017-05-05T16:04:14
2017-05-05T16:04:14.268
2017-05-05T09:04:14.268975
NB: Once you've fixed your code so it isn't using execl(), there may still be other problems to resolve — there probably are other problems to fix. But fixing this is a key step to getting your threads to run to completion.
Creating working code
Taking the revised code from the question, applying basic 'cleanliness' to it (making sure it compiles cleanly under stringent warning options), the program seems to work. The 'array of int' approach to passing a pointer and two int values doesn't work on a 64-bit system, so I created a struct Sort to contain the information. I also moved the 'start clock' and 'stop clock' calls to gettimeofday() closer to the code being measured (no printing in the calling code in the way). I added headers needed on macOS Sierra 10.12.4 (GCC 7.1.0). The code also prints the input data before it sorts any of it. The cleanup work was basically 'around' the sort code; the core sorting algorithms were not changed at all.
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include <time.h>
#include <sys/time.h> // gettimeofday()
#include <unistd.h> // fork()
#include <sys/wait.h> // wait()
#define N 100
/* Argument bundle handed to mergeSort2() through pthread_create's void*
   parameter: the array plus the inclusive index range to sort. */
struct Sort
{
int *data;  /* array being sorted */
int lo;     /* first index of the range (inclusive) */
int hi;     /* last index of the range (inclusive) */
};
// Merges two adjacent sorted subarrays of arr[] into one sorted run.
// First subarray is arr[l..m], second subarray is arr[m+1..r]
// (all indices inclusive). On ties the element from the first subarray
// is placed first.
static
void merge(int arr[], int l, int m, int r)
{
    int lenL = m - l + 1;
    int lenR = r - m;

    /* Temporary copies, since arr[l..r] is overwritten while merging. */
    int L[lenL], R[lenR];
    for (int p = 0; p < lenL; p++)
        L[p] = arr[l + p];
    for (int q = 0; q < lenR; q++)
        R[q] = arr[m + 1 + q];

    int i = 0;  /* next unread element of L */
    int j = 0;  /* next unread element of R */

    /* Exactly r - l + 1 slots get filled, one per iteration. */
    for (int k = l; k <= r; k++)
    {
        /* Take from L when R is used up, or when L still has elements and
           its head is not larger than R's head; otherwise take from R. */
        if (j >= lenR || (i < lenL && L[i] <= R[j]))
            arr[k] = L[i++];
        else
            arr[k] = R[j++];
    }
}
/* Recursive merge sort of arr[l..r]; l is the left index and r the right
   index (both inclusive) of the sub-array to be sorted. */
static
void mergeSort(int arr[], int l, int r)
{
    if (r <= l)
        return;     /* fewer than two elements */

    /* Offset from l to the midpoint; equals (l + r) / 2 without forming
       the sum l + r, which could overflow for large indices. */
    int half = (r - l) / 2;
    int m = l + half;

    mergeSort(arr, l, m);
    mergeSort(arr, m + 1, r);
    merge(arr, l, m, r);
}
static
void *mergeSort2(void *args)
{
struct Sort *newargs = args;
int *data = newargs->data;
int l = newargs->lo;
int r = newargs->hi;
pthread_t thread1, thread2;
int ans1, ans2;
if (l < r)
{
int m = (r + l) / 2;
struct Sort newArgs1 = {data, l, m};
struct Sort newArgs2 = {data, m + 1, r};
ans1 = pthread_create(&thread1, NULL, mergeSort2, &newArgs1);
ans2 = pthread_create(&thread2, NULL, mergeSort2, &newArgs2);
if (ans1 != 0 || ans2 != 0)
exit(1);
pthread_join(thread1, NULL);
pthread_join(thread2, NULL);
merge(data, l, m, r);
}
return 0;
}
/* UTILITY FUNCTIONS */
/* Write the first `size` elements of A to stdout on a single line; every
   value is followed by one space and the line ends with a newline. */
static
void printArray(int A[], int size)
{
    int pos = 0;

    while (pos < size)
        printf("%d ", A[pos++]);
    putchar('\n');
}
/* Write the current UTC date and time to stdout as "YYYY-MM-DD HH:MM:SS"
   plus a trailing newline. */
static void print_timestamp(void)
{
    char formatted[32];
    time_t right_now;

    /* time() also stores its result through a non-null pointer argument. */
    time(&right_now);
    strftime(formatted, sizeof formatted, "%Y-%m-%d %H:%M:%S",
             gmtime(&right_now));
    puts(formatted);
}
/* Driver program to test above functions.
 *
 * Fills three identical arrays with random values, prints them once, then
 * sorts one copy in each of three processes and reports the time spent in
 * the sort itself:
 *   child 1 - single-threaded mergeSort() on arr2
 *   child 2 - threaded mergeSort2() on arr3
 *   parent  - single-threaded mergeSort() on arr, after both children exit
 */
int main(void)
{
int min = -1000, max = 1000;
int arr[10], arr2[10], arr3[10];
int i, r;
int arr_size = sizeof(arr) / sizeof(arr[0]);
int id1, id2;
/* Whole-array range (inclusive indices) for the threaded sort of arr3. */
struct Sort args = { arr3, 0, arr_size - 1};
struct timeval tvalBefore, tvalAfter;
struct timeval tvalBefore1, tvalAfter1;
// Threads init
pthread_t thread1;
int ans1;
srand(time(NULL));
for (i = 0; i < arr_size; i++)
{
/* NOTE(review): min only widens the modulus, so values fall in
   0..(max - min) rather than min..max - confirm the intended range. */
r = rand() % (max - min + 1);
arr[i] = r;
arr2[i] = r;
arr3[i] = r;
}
printf("Given array is \n");
printArray(arr, arr_size);
/* Flush before forking so the pending output is not also sitting in each
   child's copy of the stdio buffer. */
fflush(stdout);
if ((id1 = fork()) == 0)
{
/* Child 1: only the sort sits between the two clock samples. */
printf("Child1: \n");
gettimeofday(&tvalBefore, NULL);
mergeSort(arr2, 0, arr_size - 1);
gettimeofday(&tvalAfter, NULL);
printArray(arr2, arr_size);
print_timestamp();
printf("Time in microseconds for sorting CHILD 1: %ld microseconds\n",
((tvalAfter.tv_sec - tvalBefore.tv_sec) * 1000000L
+ tvalAfter.tv_usec) - tvalBefore.tv_usec);
}
else if ((id2 = fork()) == 0)
{
/* Child 2: the measured interval covers creating the top-level sort
   thread and waiting for it to finish. */
printf("Child2: \n");
gettimeofday(&tvalBefore1, NULL);
ans1 = pthread_create(&thread1, NULL, mergeSort2, &args);
/* Join only when the thread was actually created. */
if (ans1 == 0)
pthread_join( thread1, NULL );
gettimeofday(&tvalAfter1, NULL);
print_timestamp();
printArray(arr3, arr_size);
printf("Time in microseconds for sorting CHILD 2: %ld microseconds\n",
((tvalAfter1.tv_sec - tvalBefore1.tv_sec) * 1000000L
+ tvalAfter1.tv_usec) - tvalBefore1.tv_usec);
}
else
{
/* Parent: wait for both children before sorting its own copy. */
wait(0);
wait(0);
printf("Parent:\n");
gettimeofday(&tvalBefore, NULL);
mergeSort(arr, 0, arr_size - 1);
gettimeofday(&tvalAfter, NULL);
printArray(arr, arr_size);
print_timestamp();
printf("Time in microseconds for sorting Parent: %ld microseconds\n",
((tvalAfter.tv_sec - tvalBefore.tv_sec) * 1000000L
+ tvalAfter.tv_usec) - tvalBefore.tv_usec);
}
/* All three processes reach this return. */
return 0;
}
Compilation (source in ms83.c):
$ gcc -O3 -g -std=c11 -Wall -Wextra -Werror -Wmissing-prototypes \
> -Wstrict-prototypes -Wold-style-definition ms83.c -o ms83
$
Example run 1:
Given array is
574 494 441 870 1121 800 1864 1819 889 242
Child1:
242 441 494 574 800 870 889 1121 1819 1864
2017-05-05 21:31:23
Time in microseconds for sorting CHILD 1: 10 microseconds
Child2:
2017-05-05 21:31:23
242 441 494 574 800 870 889 1121 1819 1864
Time in microseconds for sorting CHILD 2: 3260 microseconds
Parent:
242 441 494 574 800 870 889 1121 1819 1864
2017-05-05 21:31:23
Time in microseconds for sorting Parent: 7 microseconds
Example run 2:
Given array is
150 562 748 1685 889 1859 1807 1904 863 1675
Child1:
150 562 748 863 889 1675 1685 1807 1859 1904
2017-05-05 21:31:40
Time in microseconds for sorting CHILD 1: 11 microseconds
Child2:
2017-05-05 21:31:40
150 562 748 863 889 1675 1685 1807 1859 1904
Time in microseconds for sorting CHILD 2: 4745 microseconds
Parent:
150 562 748 863 889 1675 1685 1807 1859 1904
2017-05-05 21:31:40
Time in microseconds for sorting Parent: 7 microseconds
Note that the threading solution is three orders of magnitude slower than the non-threading code.
When I tried increasing the array size from 10 to 10,000, the threaded child did not complete. That means thread creation failed somewhere. The error reporting is defective (I was being lazy). Switching to 500 entries yielded:
Given array is
1984 1436 713 1349 855 1296 559 1647 567 1153 1156 1395 865 1380 840 1253 714 1396 333 404 538 1468 1381 489 1274 34 697 1484 1742 756 1221 1717 331 532 746 842 1235 1179 1185 1547 1372 1305 138 404 76 762 605 61 1242 1075 1896 203 1173 844 1582 1356 1044 1760 1635 1833 1595 1651 1892 1842 1508 727 357 221 878 967 1665 1783 1927 1655 1110 220 711 371 1785 401 188 1132 1947 1214 5 1414 1065 730 826 807 1155 654 1745 1993 1215 741 1721 1509 604 16 139 804 1773 690 1673 861 1657 566 969 1891 1718 1801 200 1817 235 711 372 319 507 483 1332 968 1138 246 1082 1074 1569 1774 488 358 1713 350 583 381 418 300 1011 416 563 748 1858 837 1678 1336 1516 1177 1449 1664 1991 1465 1159 1653 1724 311 1360 902 1182 1768 1471 1606 1813 1925 825 122 1647 1790 1575 323 153 33 1825 1343 1183 1707 1724 1839 1190 1936 442 1370 206 1530 1142 561 952 478 25 1666 382 1092 418 720 1864 652 313 1878 1268 993 1446 1881 893 1416 319 577 1147 688 1155 726 1336 1354 1419 217 1236 213 1715 101 946 1450 135 297 1962 1405 455 924 26 569 755 64 1459 1636 395 1417 138 924 1360 893 1216 1231 1546 1104 252 697 1602 1794 1565 1945 1738 941 1813 1829 714 280 369 1861 1466 1195 1284 1936 78 1988 145 1541 1927 833 135 913 1214 405 23 1107 390 242 309 964 1311 724 284 342 1550 1394 759 1860 28 1369 1417 362 747 1732 26 1791 646 1817 1392 666 762 1297 945 507 58 928 1972 811 170 1660 1811 1969 573 242 1297 74 581 1513 1258 1311 547 627 942 1965 945 343 1633 197 843 249 77 320 611 1674 303 1346 148 533 1800 259 916 1498 1058 365 973 451 1143 1121 1033 126 595 726 1232 894 1584 878 1076 1796 257 531 144 740 1033 630 471 919 773 1276 1523 1195 475 667 40 91 1336 350 1650 970 1712 542 1927 168 1107 917 1271 649 1006 1428 20 1341 1283 774 1781 1427 1342 316 1317 1162 1333 991 1288 1853 1917 210 1589 1744 1942 962 557 1444 396 1330 378 625 1776 179 434 290 870 961 1365 226 605 1842 1629 1421 1883 108 102 1068 671 1086 692 1053 45 660 1746 1351 399 1308 833 42 1219 491 248 503 499 3 1965 1043 1452 604 1736 1974 675 
14 1491 1757 1116 1520 1540 983 108 15 1030 742 1535 423 1802 1622 1401 1801 167 824 230 404 1722 814 1222 1626 1177 1772 1645 27 1061 1848 1031 1659 1725 1862 959 362 728 1644 957 934 1160 1862 915 995 1201 119 1191 259 963 1889
Child1:
3 5 14 15 16 20 23 25 26 26 27 28 33 34 40 42 45 58 61 64 74 76 77 78 91 101 102 108 108 119 122 126 135 135 138 138 139 144 145 148 153 167 168 170 179 188 197 200 203 206 210 213 217 220 221 226 230 235 242 242 246 248 249 252 257 259 259 280 284 290 297 300 303 309 311 313 316 319 319 320 323 331 333 342 343 350 350 357 358 362 362 365 369 371 372 378 381 382 390 395 396 399 401 404 404 404 405 416 418 418 423 434 442 451 455 471 475 478 483 488 489 491 499 503 507 507 531 532 533 538 542 547 557 559 561 563 566 567 569 573 577 581 583 595 604 604 605 605 611 625 627 630 646 649 652 654 660 666 667 671 675 688 690 692 697 697 711 711 713 714 714 720 724 726 726 727 728 730 740 741 742 746 747 748 755 756 759 762 762 773 774 804 807 811 814 824 825 826 833 833 837 840 842 843 844 855 861 865 870 878 878 893 893 894 902 913 915 916 917 919 924 924 928 934 941 942 945 945 946 952 957 959 961 962 963 964 967 968 969 970 973 983 991 993 995 1006 1011 1030 1031 1033 1033 1043 1044 1053 1058 1061 1065 1068 1074 1075 1076 1082 1086 1092 1104 1107 1107 1110 1116 1121 1132 1138 1142 1143 1147 1153 1155 1155 1156 1159 1160 1162 1173 1177 1177 1179 1182 1183 1185 1190 1191 1195 1195 1201 1214 1214 1215 1216 1219 1221 1222 1231 1232 1235 1236 1242 1253 1258 1268 1271 1274 1276 1283 1284 1288 1296 1297 1297 1305 1308 1311 1311 1317 1330 1332 1333 1336 1336 1336 1341 1342 1343 1346 1349 1351 1354 1356 1360 1360 1365 1369 1370 1372 1380 1381 1392 1394 1395 1396 1401 1405 1414 1416 1417 1417 1419 1421 1427 1428 1436 1444 1446 1449 1450 1452 1459 1465 1466 1468 1471 1484 1491 1498 1508 1509 1513 1516 1520 1523 1530 1535 1540 1541 1546 1547 1550 1565 1569 1575 1582 1584 1589 1595 1602 1606 1622 1626 1629 1633 1635 1636 1644 1645 1647 1647 1650 1651 1653 1655 1657 1659 1660 1664 1665 1666 1673 1674 1678 1707 1712 1713 1715 1717 1718 1721 1722 1724 1724 1725 1732 1736 1738 1742 1744 1745 1746 1757 1760 1768 1772 1773 1774 1776 1781 1783 1785 1790 1791 1794 1796 1800 1801 1801 1802 
1811 1813 1813 1817 1817 1825 1829 1833 1839 1842 1842 1848 1853 1858 1860 1861 1862 1862 1864 1878 1881 1883 1889 1891 1892 1896 1917 1925 1927 1927 1927 1936 1936 1942 1945 1947 1962 1965 1965 1969 1972 1974 1984 1988 1991 1993
2017-05-05 21:43:11
Time in microseconds for sorting CHILD 1: 62 microseconds
Child2:
2017-05-05 21:43:11
3 5 14 15 16 20 23 25 26 26 27 28 33 34 40 42 45 58 61 64 74 76 77 78 91 101 102 108 108 119 122 126 135 135 138 138 139 144 145 148 153 167 168 170 179 188 197 200 203 206 210 213 217 220 221 226 230 235 242 242 246 248 249 252 257 259 259 280 284 290 297 300 303 309 311 313 316 319 319 320 323 331 333 342 343 350 350 357 358 362 362 365 369 371 372 378 381 382 390 395 396 399 401 404 404 404 405 416 418 418 423 434 442 451 455 471 475 478 483 488 489 491 499 503 507 507 531 532 533 538 542 547 557 559 561 563 566 567 569 573 577 581 583 595 604 604 605 605 611 625 627 630 646 649 652 654 660 666 667 671 675 688 690 692 697 697 711 711 713 714 714 720 724 726 726 727 728 730 740 741 742 746 747 748 755 756 759 762 762 773 774 804 807 811 814 824 825 826 833 833 837 840 842 843 844 855 861 865 870 878 878 893 893 894 902 913 915 916 917 919 924 924 928 934 941 942 945 945 946 952 957 959 961 962 963 964 967 968 969 970 973 983 991 993 995 1006 1011 1030 1031 1033 1033 1043 1044 1053 1058 1061 1065 1068 1074 1075 1076 1082 1086 1092 1104 1107 1107 1110 1116 1121 1132 1138 1142 1143 1147 1153 1155 1155 1156 1159 1160 1162 1173 1177 1177 1179 1182 1183 1185 1190 1191 1195 1195 1201 1214 1214 1215 1216 1219 1221 1222 1231 1232 1235 1236 1242 1253 1258 1268 1271 1274 1276 1283 1284 1288 1296 1297 1297 1305 1308 1311 1311 1317 1330 1332 1333 1336 1336 1336 1341 1342 1343 1346 1349 1351 1354 1356 1360 1360 1365 1369 1370 1372 1380 1381 1392 1394 1395 1396 1401 1405 1414 1416 1417 1417 1419 1421 1427 1428 1436 1444 1446 1449 1450 1452 1459 1465 1466 1468 1471 1484 1491 1498 1508 1509 1513 1516 1520 1523 1530 1535 1540 1541 1546 1547 1550 1565 1569 1575 1582 1584 1589 1595 1602 1606 1622 1626 1629 1633 1635 1636 1644 1645 1647 1647 1650 1651 1653 1655 1657 1659 1660 1664 1665 1666 1673 1674 1678 1707 1712 1713 1715 1717 1718 1721 1722 1724 1724 1725 1732 1736 1738 1742 1744 1745 1746 1757 1760 1768 1772 1773 1774 1776 1781 1783 1785 1790 1791 1794 1796 1800 1801 1801 1802 
1811 1813 1813 1817 1817 1825 1829 1833 1839 1842 1842 1848 1853 1858 1860 1861 1862 1862 1864 1878 1881 1883 1889 1891 1892 1896 1917 1925 1927 1927 1927 1936 1936 1942 1945 1947 1962 1965 1965 1969 1972 1974 1984 1988 1991 1993
Time in microseconds for sorting CHILD 2: 83377 microseconds
Parent:
3 5 14 15 16 20 23 25 26 26 27 28 33 34 40 42 45 58 61 64 74 76 77 78 91 101 102 108 108 119 122 126 135 135 138 138 139 144 145 148 153 167 168 170 179 188 197 200 203 206 210 213 217 220 221 226 230 235 242 242 246 248 249 252 257 259 259 280 284 290 297 300 303 309 311 313 316 319 319 320 323 331 333 342 343 350 350 357 358 362 362 365 369 371 372 378 381 382 390 395 396 399 401 404 404 404 405 416 418 418 423 434 442 451 455 471 475 478 483 488 489 491 499 503 507 507 531 532 533 538 542 547 557 559 561 563 566 567 569 573 577 581 583 595 604 604 605 605 611 625 627 630 646 649 652 654 660 666 667 671 675 688 690 692 697 697 711 711 713 714 714 720 724 726 726 727 728 730 740 741 742 746 747 748 755 756 759 762 762 773 774 804 807 811 814 824 825 826 833 833 837 840 842 843 844 855 861 865 870 878 878 893 893 894 902 913 915 916 917 919 924 924 928 934 941 942 945 945 946 952 957 959 961 962 963 964 967 968 969 970 973 983 991 993 995 1006 1011 1030 1031 1033 1033 1043 1044 1053 1058 1061 1065 1068 1074 1075 1076 1082 1086 1092 1104 1107 1107 1110 1116 1121 1132 1138 1142 1143 1147 1153 1155 1155 1156 1159 1160 1162 1173 1177 1177 1179 1182 1183 1185 1190 1191 1195 1195 1201 1214 1214 1215 1216 1219 1221 1222 1231 1232 1235 1236 1242 1253 1258 1268 1271 1274 1276 1283 1284 1288 1296 1297 1297 1305 1308 1311 1311 1317 1330 1332 1333 1336 1336 1336 1341 1342 1343 1346 1349 1351 1354 1356 1360 1360 1365 1369 1370 1372 1380 1381 1392 1394 1395 1396 1401 1405 1414 1416 1417 1417 1419 1421 1427 1428 1436 1444 1446 1449 1450 1452 1459 1465 1466 1468 1471 1484 1491 1498 1508 1509 1513 1516 1520 1523 1530 1535 1540 1541 1546 1547 1550 1565 1569 1575 1582 1584 1589 1595 1602 1606 1622 1626 1629 1633 1635 1636 1644 1645 1647 1647 1650 1651 1653 1655 1657 1659 1660 1664 1665 1666 1673 1674 1678 1707 1712 1713 1715 1717 1718 1721 1722 1724 1724 1725 1732 1736 1738 1742 1744 1745 1746 1757 1760 1768 1772 1773 1774 1776 1781 1783 1785 1790 1791 1794 1796 1800 1801 1801 1802 
1811 1813 1813 1817 1817 1825 1829 1833 1839 1842 1842 1848 1853 1858 1860 1861 1862 1862 1864 1878 1881 1883 1889 1891 1892 1896 1917 1925 1927 1927 1927 1936 1936 1942 1945 1947 1962 1965 1965 1969 1972 1974 1984 1988 1991 1993
2017-05-05 21:43:11
Time in microseconds for sorting Parent: 51 microseconds
Different runs showed dramatic variations in the processing time for child 2. I observed the values: 83,377; 73,929; 78,977; 83,977; 94,159; 81,526 microseconds.
You might get some benefit from threading with large data sets sorted by a small number of threads (say 10,000 rows of data, but only 8 threads, each sorting 1250 rows of data), but probably not even then. As you increase the number of threads beyond the number of cores on the system, you get less and less benefit from the multiple threads.

Read file with strings and integers in matlab

I have to read a file in Matlab that looks like this:
D:\Classified\positive-videos\vid.avi 163 3 14 32 54 79 105 130 155 202 216 224 238 250 262 288 288 322 357 369 381 438 457 478 499 525 551
D:\Classified\positive-videos\vid2.avi 163 3 14 32 54 79 105 130 155 202 216 224 238 250 262 288 288 322 357 369 381 438 457 478 499 525 551
There are many such lines, separated by newlines. For each line I need to discard the video path and the first integer (e.g. 163 in the first line) and read all the remaining numbers into an array, up to the end of the line. How can this be done?
You could do the following:
% Display, for every line of test1.txt, all space-separated fields after the
% first two (the path and the first integer) as a cell array of strings.
fid = fopen('test1.txt','r');
if fid == -1
    error('Could not open test1.txt');
end
my_line = fgetl(fid);
% fgetl returns the number -1 once the end of the file is reached, so keep
% looping only while a character line came back. Comparing the line to -1
% with ~= is element-wise and stops the loop early on an empty line.
while ischar(my_line)
    my_array = regexp(my_line,' ','split');
    disp(my_array(3:end));
    my_line = fgetl(fid);
end
fclose(fid);
This would give you:
ans =
Columns 1 through 11
'3' '14' '32' '54' '79' '105' '130' '155' '202' '216' '224'
Columns 12 through 22
'238' '250' '262' '288' '288' '322' '357' '369' '381' '438' '457'
Columns 23 through 26
'478' '499' '525' '551'
ans =
Columns 1 through 11
'3' '14' '32' '54' '79' '105' '130' '155' '202' '216' '224'
Columns 12 through 22
'238' '250' '262' '288' '288' '322' '357' '369' '381' '438' '457'
Columns 23 through 26
'478' '499' '525' '551'
EDIT
For a numeric matrix result you can change it as:
% Read test1.txt into a numeric matrix: one row per line, holding every
% space-separated field after the first two (the path and the first integer).
clear;
close;
clc;
fid = fopen('test1.txt','r');
if fid == -1
    error('Could not open test1.txt');
end
my_matrix = [];
ii = 1;
my_line = fgetl(fid);
% fgetl returns the number -1 at end of file; ischar also copes with an
% empty file and with empty lines, which the ~= -1 comparison does not.
while ischar(my_line)
    my_array = regexp(my_line,' ','split');
    % str2double converts the whole cell array in one call and, unlike
    % str2num, does not evaluate the text as MATLAB code. Fields that are
    % not numbers become NaN.
    my_row = str2double(my_array(3:end));
    % Append a zero row first so that shorter lines stay zero-padded.
    my_matrix = [my_matrix;zeros(1,size(my_matrix,2))]; %#ok<AGROW>
    my_matrix(ii,1:numel(my_row)) = my_row;
    ii = ii + 1;
    my_line = fgetl(fid);
end
fclose(fid);
This would yield:
my_matrix =
3 14 32 54 79 105 130 155 202 216 224 238 250 262 288 288 322 357 369 381 438 457 478 499 525 551
3 14 32 54 79 105 130 155 202 216 224 238 250 262 288 288 322 357 369 381 438 457 478 499 525 551
A much easier method is the following:
% Load the whole file in one call. Despite its name, fid holds the value
% returned by importdata, not a file identifier from fopen.
% NOTE(review): filename must already be defined, and the line has no
% trailing semicolon, so the returned value is echoed to the console.
fid = importdata(filename)
% Read the .data field of the result (presumably the numeric columns -
% confirm against the importdata documentation for mixed text/number files).
results = fid.data;
Ad maiora.
EDIT
Since you want to discard the first value after the string, you will have to call
res = fid.data(:,2:end);
instead of results.

Resources