I am currently trying to crop a retangular xarray file to the shape of a country using a mask grid. Below you can find my current solution (with simpler and smaller arrays). The code works and I get the desired mask based on 1s and 0s. The problem lies on the fact that the code when run on a real country shape (larger and more complex) takes over 30 minutes to run. Since I am using very basic operations here like nested for loops, I also tried different alternatives like a list approach. However, when timing the process, it did not improve on the code below. I wonder if there is a faster way to obtain this mask (vectorization?) or if I should approach the problem in a different way (tried exploring xarray's properties, but have not found anything that tackles this issue yet).
Code below:
import geopandas as gpd
from shapely.geometry import Polygon, Point
import pandas as pd
import numpy as np
import xarray as xr
df = pd.read_csv('Brazil_borders.csv',index_col=0)
lats = np.array([-20, -5, -5, -20,])
lons = np.array([-60, -60, -30, -30])
lats2 = np.array([-10.25, -10.75, -11.25, -11.75, -12.25, -12.75, -13.25, -13.75,
-14.25, -14.75, -15.25, -15.75, -16.25, -16.75, -17.25, -17.75,
-18.25, -18.75, -19.25, -19.75, -20.25, -20.75, -21.25, -21.75,
-22.25, -22.75, -23.25, -23.75, -24.25, -24.75, -25.25, -25.75,
-26.25, -26.75, -27.25, -27.75, -28.25, -28.75, -29.25, -29.75,
-30.25, -30.75, -31.25, -31.75, -32.25, -32.75])
lons2 = np.array([-61.75, -61.25, -60.75, -60.25, -59.75, -59.25, -58.75, -58.25,
-57.75, -57.25, -56.75, -56.25, -55.75, -55.25, -54.75, -54.25,
-53.75, -53.25, -52.75, -52.25, -51.75, -51.25, -50.75, -50.25,
-49.75, -49.25, -48.75, -48.25, -47.75, -47.25, -46.75, -46.25,
-45.75, -45.25, -44.75, -44.25])
points = []
for i in range(len(lats)):
_= [lats[i],lons[i]]
points.append(_)
poly_proj = Polygon(points)
mask = np.zeros((len(lats2),len(lons2))) # Mask with the dataset's shape and size.
for i in range(len(lats2)): # Iteration to verify if a given coordinate is within the polygon's area
for j in range(len(lons2)):
grid_point = Point(lats2[i], lons2[j])
if grid_point.within(poly_proj):
mask[i][j] = 1
bool_final = mask
bool_final
The alternative based on list approach, but with even worse processing time (according to timeit):
lats = np.array([-20, -5, -5, -20,])
lons = np.array([-60, -60, -30, -30])
lats2 = np.array([-10.25, -10.75, -11.25, -11.75, -12.25, -12.75, -13.25, -13.75,
-14.25, -14.75, -15.25, -15.75, -16.25, -16.75, -17.25, -17.75,
-18.25, -18.75, -19.25, -19.75, -20.25, -20.75, -21.25, -21.75,
-22.25, -22.75, -23.25, -23.75, -24.25, -24.75, -25.25, -25.75,
-26.25, -26.75, -27.25, -27.75, -28.25, -28.75, -29.25, -29.75,
-30.25, -30.75, -31.25, -31.75, -32.25, -32.75])
lons2 = np.array([-61.75, -61.25, -60.75, -60.25, -59.75, -59.25, -58.75, -58.25,
-57.75, -57.25, -56.75, -56.25, -55.75, -55.25, -54.75, -54.25,
-53.75, -53.25, -52.75, -52.25, -51.75, -51.25, -50.75, -50.25,
-49.75, -49.25, -48.75, -48.25, -47.75, -47.25, -46.75, -46.25,
-45.75, -45.25, -44.75, -44.25])
points = []
for i in range(len(lats)):
_= [lats[i],lons[i]]
points.append(_)
poly_proj = Polygon(points)
grid_point = [Point(lats2[i],lons2[j]) for i in range(len(lats2)) for j in range(len(lons2))]
mask = [1 if grid_point[i].within(poly_proj) else 0 for i in range(len(grid_point))]
bool_final2 = np.reshape(mask,(((len(lats2)),(len(lons2)))))
Thank you in advance!
Based on this answer from snowman2, I created this simple function that provides a much faster solution by using geopandas and rioxarray. Instead of using a list of latitudes and longitudes, one has to use a shapefile with the desired shape to be masked (Instructions for GeoDataFrame creation from list of coordinates).
import xarray as xr
import geopandas as gpd
import rioxarray
from shapely.geometry import mapping
def mask_shape_border (DS,shape_shp): #Inputs are the dataset to be cropped and the address of the mask file (.shp )
if 'lat' in DS: #Some datasets use lat/lon, others latitude/longitude
DS.rio.set_spatial_dims(x_dim="lon", y_dim="lat", inplace=True)
elif 'latitude' in DS:
DS.rio.set_spatial_dims(x_dim="longitude", y_dim="latitude", inplace=True)
else:
print("Error: check latitude and longitude variable names.")
DS.rio.write_crs("epsg:4326", inplace=True)
mask = gpd.read_file(shape_shp, crs="epsg:4326")
DS_clipped = DS.rio.clip(mask.geometry.apply(mapping), mask.crs, drop=False)
return(DS_clipped)
I need to recognize the text in the bottom left corner on Magic the Gathering paper cards (last design). Here an example:
If the text is like this
I want to retrieve the following text:
198/280 U
M20 EN
(I don't need the card author name - Lake Hurwitz in this example)
What OCR library can I use? I've tried with Tesseract without any tuning but the results are not correct. Any advice or link to a project that already does this stuff?
You can make it with tesseract (3.04.01) by sanitizing your image a bit
like in below code
import numpy as np
import cv2
def prepro(zone, prefix):
filename = 'stackmagic.png'
oriimg = cv2.imread(filename)
#keep the interesting part
(a,b,c,d) = zone
text_zone = oriimg[a:b, c:d]
height, width, depth = text_zone.shape
#resize it to be bigger (so less pixelized)
H = 50
imgScale = H/height
newX,newY = text_zone.shape[1]*imgScale, text_zone.shape[0]*imgScale
newimg = cv2.resize(text_zone,(int(newX),int(newY)))
#binarize it
gray = cv2.cvtColor(newimg, cv2.COLOR_BGR2GRAY)
th, img = cv2.threshold(gray, 130, 255, cv2.THRESH_BINARY);
#erode it
kernel = np.ones((1,1),np.uint8)
erosion = cv2.erode(img,kernel,iterations = 1)
cv2.imwrite(prefix+'_ero.png', erosion)
cv2.imshow("Show by CV2",erosion)
cv2.waitKey(0)
prepro((16,27, 6,130), 'upzone')
prepro((27,36, 6,130), 'downzone')
from your cropped image
you get
the upper part:
and the lower part:
and tesseract does seem to be able to extract
xx$ tesseract upzone_ero.png stdout
198/ 280 U
xx$ tesseract downzone_ero.png stdout
M20 ~ EN Duluu Hun-nu
Notice that we fail to extract Luke, but hopefully you were not interested in him/it :)
There are other tools but that'd be advertising stuff and be subjective..
I'm trying to embed python in my C application. I download the package in python official website and manage to do a simple Hello World.
Now I want to go deeper and use some libraries of python like numpy, keras, tensorflow...
I'm working with Python 3.5.4, I installed all the needed package on my PC with pip3 :
pip3 install keras
pip3 install tensorflow
...
then I created my script and launch it in python environment, it works fine :
Python:
# Importing the libraries
#
import numpy as np
import pandas as pd
dataset2 = pd.read_csv('I:\RNA\dataset19.csv')
X_test = dataset2.iloc[:, 0:228].values
y_test = dataset2.iloc[:, 228].values
# 2.
import pickle
sc = pickle.load(open('I:\RNA\isVerb_sc', 'rb'))
X_test = sc.transform(X_test)
# 3.
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
classifier = Sequential()
classifier.add(Dense(units = 114, kernel_initializer = 'uniform', activation = 'relu', input_dim = 228))
classifier.add(Dropout(p = 0.3))
classifier.add(Dense(units = 114, kernel_initializer = 'uniform', activation = 'relu'))
classifier.add(Dropout(p = 0.3))
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
classifier.load_weights('I:\RNA\isVerb_weights.h5')
y_pred = classifier.predict(X_test)
y_pred1 = (y_pred > 0.5)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred1)
But when I execute the same script in a C environment with embed python it didn't work :
At first, I execute my script directly with PyRun_SimpleFile with no luck, so I sliced it in multiple instructions with PyRun_SimpleString to detect the problem :
C:
result = PyRun_SimpleString("import numpy as np"); // result = 0 (ok)
result = PyRun_SimpleString("import pandas as pd"); // result = 0 (ok)
...
result = PyRun_SimpleString("import pickle"); // result = 0 (ok)
... (all insctruction above works)
result = PyRun_SimpleString("import keras"); // result = -1 !!
... (all under this failed)
but there is not a single stack trace about this error, I tried this but I just got :
"Here's the output: (null)"
My initialization of Python in C seems correct since others libraries import fine :
// Python
wchar_t *stdProgramName = L"I:\\LIBs\\cpython354";
Py_SetProgramName(stdProgramName);
wchar_t *stdPythonHome = L"I:\\LIBs\\cpython354";
Py_SetPythonHome(stdPythonHome);
wchar_t *stdlib = L"I:\\LIBs\\cpython354;I:\\LIBs\\cpython354\\Lib\\python35.zip;I:\\LIBs\\cpython354\\Lib;I:\\LIBs\\cpython354\\DLLs;I:\\LIBs\\cpython354\\Lib\\site-packages";
Py_SetPath(stdlib);
// Initialize Python
Py_Initialize();
When inside a Python cmd, the line import keras take some time (3sec) but works (a warning but I found no harm around it) :
>>> import keras
I:\LIBs\cpython354\lib\site-packages\h5py\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
from ._conv import register_converters as _register_converters
Using TensorFlow backend.
>>>
I'm at loss now, I don't know where to look at since there is no stack trace.
it seems like when you import keras, it executes this line :
sys.stderr.write('Using TensorFlow backend.\n')
or sys.stderr was not defined in python embedded on windows
A simple correction is to define sys.stderr, for example :
import sys
class CatchOutErr:
def __init__(self):
self.value = ''
def write(self, txt):
self.value += txt
catchOutErr = CatchOutErr()
sys.stderr = catchOutErr
Is there a way to have the equivalent of the Python try statement in Cython for the cimport?
Something like that:
try:
cimport something
except ImportError:
pass
I would need this to write a Cython extension that can be compiled with or without mpi4py. This is very standard in compiled languages where the mpi commands can be put between #ifdef and #endif preprocessor directives. How can we obtain the same result in Cython?
I tried this but it does not work:
try:
from mpi4py import MPI
from mpi4py cimport MPI
from mpi4py.mpi_c cimport *
except ImportError:
rank = 0
nb_proc = 1
# solve a incompatibility between openmpi and mpi4py versions
cdef extern from 'mpi-compat.h': pass
does_it_work = 'Not yet'
Actually it works well if mpi4py is correctly installed but if
import mpi4py raises an ImportError, the Cython file does not
compile and I get the error:
Error compiling Cython file:
------------------------------------------------------------
...
try:
from mpi4py import MPI
from mpi4py cimport MPI
^
------------------------------------------------------------
mod.pyx:4:4: 'mpi4py.pxd' not found
The file setup.py:
from setuptools import setup, Extension
from Cython.Distutils import build_ext
import os
here = os.path.abspath(os.path.dirname(__file__))
include_dirs = [here]
try:
import mpi4py
except ImportError:
pass
else:
INCLUDE_MPI = '/usr/lib/openmpi/include'
include_dirs.extend([
INCLUDE_MPI,
mpi4py.get_include()])
name = 'mod'
ext = Extension(
name,
include_dirs=include_dirs,
sources=['mod.pyx'])
setup(name=name,
cmdclass={"build_ext": build_ext},
ext_modules=[ext])
Using a try-catch block in this way is something you won't be able to do.
The extension module you are making must be statically compiled and linked against the things it uses cimport to load at the C-level. A try-catch block is something that will be executed when the module is imported, not when it is compiled.
On the other hand, in theory, you should be able to get the effect you're looking for using Cython's support for conditional compilation.
In your setup.py file you can check to see if the needed modules are defined and then define environment variables to be passed to the Cython compiler that, in turn, depend on whether or not the needed modules are present.
There's an example of how to do this in one of Cython's tests.
There they pass a dictionary containing the desired environment variables to the constructor for Cython's Extension class as the keyword argument pyrex_compile_time_env, which has been renamed to cython_compile_time_env, and for Cython.Build.Dependencies.cythonize is called compile_time_env).
Thank you for your very useful answer #IanH. I include an example to show what it gives.
The file setup.py:
from setuptools import setup
from Cython.Distutils.extension import Extension
from Cython.Distutils import build_ext
import os
here = os.path.abspath(os.path.dirname(__file__))
import numpy as np
include_dirs = [here, np.get_include()]
try:
import mpi4py
except ImportError:
MPI4PY = False
else:
MPI4PY = True
INCLUDE_MPI = '/usr/lib/openmpi/include'
include_dirs.extend([
INCLUDE_MPI,
mpi4py.get_include()])
name = 'mod'
ext = Extension(
name,
include_dirs=include_dirs,
cython_compile_time_env={'MPI4PY': MPI4PY},
sources=['mod.pyx'])
setup(name=name,
cmdclass={"build_ext": build_ext},
ext_modules=[ext])
if not MPI4PY:
print('Warning: since importing mpi4py raises an ImportError,\n'
' the extensions are compiled without mpi and \n'
' will work only in sequencial.')
And the file mod.pyx, with a little bit of real mpi commands:
import numpy as np
cimport numpy as np
try:
from mpi4py import MPI
except ImportError:
nb_proc = 1
rank = 0
else:
comm = MPI.COMM_WORLD
nb_proc = comm.size
rank = comm.Get_rank()
IF MPI4PY:
from mpi4py cimport MPI
from mpi4py.mpi_c cimport *
# solve an incompatibility between openmpi and mpi4py versions
cdef extern from 'mpi-compat.h': pass
print('mpi4py ok')
ELSE:
print('no mpi4py')
n = 8
if n % nb_proc != 0:
raise ValueError('The number of processes is incorrect.')
if rank == 0:
data_seq = np.ones([n], dtype=np.int32)
s_seq = data_seq.sum()
else:
data_seq = np.zeros([n], dtype=np.int32)
if nb_proc > 1:
data_local = np.zeros([n/nb_proc], dtype=np.int32)
comm.Scatter(data_seq, data_local, root=0)
else:
data_local = data_seq
s = data_local.sum()
if nb_proc > 1:
s = comm.allreduce(s, op=MPI.SUM)
if rank == 0:
print('s: {}; s_seq: {}'.format(s, s_seq))
assert s == s_seq
Build with python setup.py build_ext --inplace and test with python -c "import mod" and mpirun -np 4 python -c "import mod". If mpi4py is not installed, one can still build the module and use it in sequential.