IndexError: too many indices for array: array is 2-dimensional, but 3 were indexed - arrays

I am trying to identify Global Feature Relationships with SHAP values. The SHAP library returns three matrices and I am trying to select the SHAP matrix however, I am getting this error: "IndexError: too many indices for array: array is 2-dimensional, but 3 were indexed".
The code I have is below:
df_score = spark.sql("select * from sandbox.yt_trng_churn_device")
#XGBoost Model
import pickle
from xgboost import XGBClassifier
from mlflow.tracking import MlflowClient
client = MlflowClient()
local_dir = "/dbfs/FileStore/"
local_path = client.download_artifacts
model_path = '/dbfs/FileStore/'
model = XGBClassifier()
model = pickle.load(open(model_path, 'rb'))
HorizonDate = datetime.datetime(2022, 9, 5)
df = df_score
score_data = df.toPandas()
results = model.predict_proba(score_data)
results_l = model.predict(score_data)
score_data["p"]=pd.Series( (v[1] for v in results) )
score_data["l"]=pd.Series( (v for v in results_l) )
spark.createDataFrame(score_data).createOrReplaceTempView("yt_vw_tmp_dev__scores")
spark.sql("create or replace table sandbox.yt_vw_tmp_dev__scores as select * from yt_vw_tmp_dev__scores")
#SHAP Analysis on XGBoost
from shap import KernelExplainer, summary_plot
sql = """
select d_a.*
from
hive_metastore.sandbox.yt_trng_device d_a
right join
(select decile, msisdn, MSISDN_L2L
from(
select ntile(10) over (order by p desc) as decile, msisdn, MSISDN_L2L
from sandbox.yt_vw_tmp_dev__scores
) inc
order by decile) d_b
on d_a.MSISDN_L2L = d_b.MSISDN_L2L and d_a.msisdn = d_b.msisdn
"""
df = spark.sql(sql).drop('msisdn', 'imei', 'imsi', 'event_date', 'MSISDN_L2L', 'account_id')
score_df = df.toPandas()
mode = score_df.mode().iloc[0]
sample = score_df.sample(n=min(100, score_df.shape[0]), random_state=508502835).fillna(mode)
predict = lambda x: model.predict(pd.DataFrame(x, columns=score_df.columns))
explainer = KernelExplainer(predict, sample, link="identity")
shap_values = explainer.shap_values(sample, l1_reg=False)
# The return of the explainer has three matrices, we will get the shap values one
shap_values = shap_values[ :, :, 0]
I am fairly new to coding but it would be great if someone could give some direction on this

Related

Python : Replace a column in a dataframe by datetime values

I'm trying to replace a column an array of 4 columns by datetime values that I treated. The problem is that it's difficult to keep the same form between the different formats of dataframe, array,....
dataw = ds.variables["pr"][:]
dataw = np.array(dataw[:,0,0])
lat = ds.variables["lat"][:]
long = ds.variables["lon"][:]
time = ds.variables["time"][:]
time = pd.to_datetime(ds.variables["time"][:],origin=pd.Timestamp('1850-01-01'),unit='D')
#np.datetime64(ds.variables["time"][:],'D')
x2 = pd.DataFrame(np.zeros((len(dataw),4), float))
x = np.zeros((len(dataw),4), float)
x[:,0] = time
x[:,1] = long
x[:,2] = lat[:]
x[:,3] = dataw[:]*86400
x=pd.DataFrame(x)
x[:,0] = pd.to_datetime(time,origin=pd.Timestamp('1850-01-01'),unit='D')
If I put directly the dates transformed in the array, the result is like: 1.32542e+18
I tried
time = ds.variables["time"][:]
and include it in the array, and then use
x[:,0]=pd.to_datetime(x[:,0],origin=pd.Timestamp('1850-01-01'),unit='D')
I get the error:
TypeError: unhashable type: 'slice'
I tried also directly put:
time=pd.to_datetime(time,origin=pd.Timestamp('1850-01-01'),unit='D')
x[:,0] = time[:]
TypeError: unhashable type: 'slice'
try this instead
import numpy as np
import pandas as pd
dataw = ds.variables["pr"][:]
dataw = np.array(dataw[:, 0, 0])
lat = ds.variables["lat"][:]
long = ds.variables["lon"][:]
time = ds.variables["time"][:]
time = np.datetime64(time, 'D')
x = np.zeros((len(dataw), 4), dtype='datetime64[D]')
x[:, 0] = time
x[:, 1] = long
x[:, 2] = lat
x[:, 3] = dataw * 86400
df = pd.DataFrame(x, columns=["Time", "Longitude", "Latitude", "Data"])
Xarray makes the netcdf->pandas workflow quite straightforward:
import xarray as xr
ds = xr.open_dataset('file.nc', engine='netcdf4')
df = ds.to_pandas()
Presuming your time variable is using cf-conventions, Xarray will automatically decode it into datetime objects.

How to let the child process of one function finish and then run the second function?

Here I am simply calling a 3rd party API to get the prices of stocks through multiprocessing. I am using this function multiple times as I want the timeframe of stocks as (5 min, 10 min, 30 min). But when I run it, it does not wait for the previous functions to finish and instead move on to the last function to complete it. How to run each and every function in order ?
import pickle
import pandas as pd
import datetime
import multiprocessing
import time
import subprocess,os
def historical_data(timeframe):
global prices
def split_dict_equally(input_dict, chunks=2):
"Splits dict by keys. Returns a list of dictionaries."
# prep with empty dicts
return_list = [dict() for idx in range(chunks)]
idx = 0
for k,v in input_dict.items():
return_list[idx][k] = v
if idx < chunks-1: # indexes start at 0
idx += 1
else:
idx = 0
return return_list
with open('zerodha_login.pkl', 'rb') as file:
# Call load method to deserialze
login_credentials = pickle.load(file)
with open('zerodha_instruments.pkl', 'rb') as file:
# Call load method to deserialze
inst_dict = pickle.load(file)
csv = pd.read_csv('D:\\Business\\Website\\Trendlines\\FO Stocks.csv')
csv['Stocks'] = csv['Stocks'].str.replace(' ','')
fo_stocks = csv['Stocks'].to_list()
inst = pd.DataFrame(inst_dict)
filtered_inst = inst.copy()
filtered_inst = inst[(inst['segment'] == 'NSE') & (inst['name'] != '') & (inst['tick_size'] == 0.05) ]
filtered_inst = filtered_inst[filtered_inst['tradingsymbol'].isin(fo_stocks)]
tickers_dict = dict(zip(filtered_inst['instrument_token'],filtered_inst['tradingsymbol']))
tickers_dict = dict(zip(filtered_inst['instrument_token'],filtered_inst['tradingsymbol']))
number_process = 16
tickers_dict_list = split_dict_equally(tickers_dict,number_process)
def prices(stock):
print('inside_function',os.getpid())
for x,y in stock.items():
print('inside_stock_loop')
while True:
try:
print('Timeframe::',timeframe,y)
data = login_credentials['kite'].historical_data(instrument_token=x, from_date=today_date - datetime.timedelta(days=1000),interval=str(timeframe),to_date=today_date )
df = pd.DataFrame(data)
g = [e for e in df.columns if 'Un' not in e]
df = df[g]
df['date'] = df['date'].astype(str)
df['date'] = df['date'].str.split('+')
df['Date'] = df['date'].str[0]
df = df[['Date','open','high','low','close','volume']]
df['Date'] = pd.to_datetime(df['Date'],format='%Y-%m-%d %H:%M:%S')
df['Time'] = df['Date'].dt.time
df['Date'] = df['Date'].dt.date
df.rename(columns={'open':'Open','high':'High','low':'Low','close':'Close','volume':'Volume'},inplace=True)
df.to_csv('D:\\Business\\Website\\Trendlines\\4th Cut\\Historical data\\'+str(timeframe)+'\\'+str(y)+'.csv')
break
except:
print('Issue ::',y)
pass
new_list = []
if __name__ == '__main__':
for process in tickers_dict_list:
p = multiprocessing.Process(target=prices, args=(process,))
p.start()
new_list.append(p)
for p in new_list:
print('joining_',p)
p.join()
historical_data('5minute')
historical_data('10minute')

Incrementing over a URL variable

import urllib2
import pandas as pd
from bs4 import BeautifulSoup
x = 0
i = 1
data = []
while (i < 13):
soup = BeautifulSoup(urllib2.urlopen(
'http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%d&seasonId=2018&startIndex=' % i, +str(x)).read(), 'html')
tableStats = soup.find("table", ("class", "playerTableTable tableBody"))
for row in tableStats.findAll('tr')[2:]:
col = row.findAll('td')
try:
name = col[0].a.string.strip()
opp = col[1].a.string.strip()
rec = col[10].string.strip()
yds = col[11].string.strip()
dt = col[12].string.strip()
pts = col[13].string.strip()
data.append([name, opp, rec, yds, dt, pts])
except Exception as e:
pass
df = pd.DataFrame(data=data, columns=[
'PLAYER', 'OPP', 'REC', 'YDS', 'TD', 'PTS'])
df
i += 1
I have been working with a fantasy football program and I am trying to increment data over all weeks so I can create a dataframe for the top 40 players for each week.
I have been able to get it for any week of my choice by manually entering the week number in the PeriodId part of the url, but I am trying to programmatically increment it over each week to make it easier. I have tried using PeriodId='+ I +' and PeriodId=%d but I keep getting various errors about str and int concatenate and bad operands. Any suggestions or tips?
Try removing the comma between %i and str(x) to concatenate the strings and see if that helps.
soup = BeautifulSoup(urllib2.urlopen('http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%d&seasonId=2018&startIndex='%i, +str(x)).read(), 'html')
should be:
soup = BeautifulSoup(urllib2.urlopen('http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%d&seasonId=2018&startIndex='%i +str(x)).read(), 'html')
if you have problem concatenating or formatting URL please create variable instead write it one line with BeautifulSoup and urllib2.urlopen.
Use parenthesis to format with multiple value like "before %s is %s" % (1, 0)
url = 'http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%s&seasonId=2018&startIndex=%s' % (i, x)
# or
#url = 'http://games.espn.com/ffl/tools/projections?&slotCategoryId=4&scoringPeriodId=%s&seasonId=2018&startIndex=0' % i
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
Make the code sorter will not effect the performance.

R: how to properly create rx_forest_model object?

I'm trying to do a churn analysis with R and SQL Server 2016.
I have uploaded my dataset on my database in a local SQL Server and I did all the preliminary work on this dataset.
Well, now I have this function trainModel() which I would use to estimate my random model forest:
trainModel = function(sqlSettings, trainTable) {
sqlConnString = sqlSettings$connString
trainDataSQL <- RxSqlServerData(connectionString = sqlConnString,
table = trainTable,
colInfo = cdrColInfo)
## Create training formula
labelVar = "churn"
trainVars <- rxGetVarNames(trainDataSQL)
trainVars <- trainVars[!trainVars %in% c(labelVar)]
temp <- paste(c(labelVar, paste(trainVars, collapse = "+")), collapse = "~")
formula <- as.formula(temp)
## Train gradient tree boosting with mxFastTree on SQL data source
library(RevoScaleR)
rx_forest_model <- rxDForest(formula = formula,
data = trainDataSQL,
nTree = 8,
maxDepth = 16,
mTry = 2,
minBucket = 1,
replace = TRUE,
importance = TRUE,
seed = 8,
parms = list(loss = c(0, 4, 1, 0)))
return(rx_forest_model)
}
But when I run the function I get this wrong output:
> system.time({
+ trainModel(sqlSettings, trainTable)
+ })
user system elapsed
0.29 0.07 58.18
Warning message:
In tempGetNumObs(numObs) :
Number of observations not available for this data source. 'numObs' set to 1e6.
And for this warning message, the function trainModel() does not create the object rx_forest_model
Does anyone have any suggestions on how to solve this problem?
After several attempts, I found the reason why the function trainModel() did not function properly.
Is not a connection string problem and is not even a data source type issue.
The problem is in the syntax of function trainModel().
It is enough to eliminate from the body of the function the statement:
return(rx_forest_model)
In this way, the function returns the same warning message, but creates the object rx_forest_model in the correct way.
So, the correct function is:
trainModel = function(sqlSettings, trainTable) {
sqlConnString = sqlSettings$connString
trainDataSQL <- RxSqlServerData(connectionString = sqlConnString,
table = trainTable,
colInfo = cdrColInfo)
## Create training formula
labelVar = "churn"
trainVars <- rxGetVarNames(trainDataSQL)
trainVars <- trainVars[!trainVars %in% c(labelVar)]
temp <- paste(c(labelVar, paste(trainVars, collapse = "+")), collapse = "~")
formula <- as.formula(temp)
## Train gradient tree boosting with mxFastTree on SQL data source
library(RevoScaleR)
rx_forest_model <- rxDForest(formula = formula,
data = trainDataSQL,
nTree = 8,
maxDepth = 16,
mTry = 2,
minBucket = 1,
replace = TRUE,
importance = TRUE,
seed = 8,
parms = list(loss = c(0, 4, 1, 0)))
}

numpy slicing using user defined input

I have (in a larger project) data contained in numpy.array.
Based on user input I need to move a selected axis (dimAxisNr) to the first dimension of the array and slice one or more (including the first) dimension based on user input (such as Select2 and Select0 in the example).
Using this input I generate a DataSelect which contains the information needed to slice. But the output size of the sliced array is different from the one using inline indexing. So basically I need a way to generate the '37:40:2' and '0:2' from an input list.
import numpy as np
dimAxisNr = 1
Select2 = [37,39]
Select0 = [0,1]
plotData = np.random.random((102,72,145,2))
DataSetSize = np.shape(plotData)
DataSelect = [slice(0,item) for item in DataSetSize]
DataSelect[2] = np.array(Select2)
DataSelect[0] = np.array(Select0)
def shift(seq, n):
n = n % len(seq)
return seq[n:] + seq[:n]
#Sort and Slice the data
print(np.shape(plotData))
print(DataSelect)
plotData = np.transpose(plotData, np.roll(range(plotData.ndim),-dimAxisNr))
DataSelect = shift(DataSelect,dimAxisNr)
print(DataSelect)
print(np.shape(plotData))
plotData = plotData[DataSelect]
print(np.shape(plotData))
plotDataDirect = plotData[slice(0, 72, None), 37:40:2, slice(0, 2, None), 0:2]
print(np.shape(plotDataDirect))
I'm not sure I've understood your question at all...
But if the question is "How do I generate a slice based on a list of indices like [37,39,40,23] ?"
then I would answer : you don't have to, just use the list as is to select the right indices, like so :
a = np.random.rand(4,5)
print(a)
indices = [2,3,1]
print(a[0:2,indices])
Note that the sorting of the list matters: [2,3,1] yields a different result from [1,2,3]
Output :
>>> a
array([[ 0.47814802, 0.42069094, 0.96244966, 0.23886243, 0.86159478],
[ 0.09248812, 0.85569145, 0.63619014, 0.65814667, 0.45387509],
[ 0.25933109, 0.84525826, 0.31608609, 0.99326598, 0.40698516],
[ 0.20685221, 0.1415642 , 0.21723372, 0.62213483, 0.28025124]])
>>> a[0:2,[2,3,1]]
array([[ 0.96244966, 0.23886243, 0.42069094],
[ 0.63619014, 0.65814667, 0.85569145]])
I have found the answer to my question. I need to use numpy.ix_.
Here is the working code:
import numpy as np
dimAxisNr = 1
Select2 = [37,39]
Select0 = [0,1]
plotData = np.random.random((102,72,145,2))
DataSetSize = np.shape(plotData)
DataSelect = [np.arange(0,item) for item in DataSetSize]
DataSelect[2] = Select2
DataSelect[0] = Select0
#print(list(37:40:2))
def shift(seq, n):
n = n % len(seq)
return seq[n:] + seq[:n]
#Sort and Slice the data
print(np.shape(plotData))
print(DataSelect)
plotData = np.transpose(plotData, np.roll(range(plotData.ndim),-dimAxisNr))
DataSelect = shift(DataSelect,dimAxisNr)
plotDataSlice = plotData[np.ix_(*DataSelect)]
print(np.shape(plotDataSlice))
plotDataDirect = plotData[slice(0, 72, None), 37:40:2, slice(0, 2, None), 0:1]
print(np.shape(plotDataDirect))

Resources