Saving mixed data cell array to ascii file in MATLAB - arrays

I get some data from an instrument that is formatted in a specific way. I need to load the data into MATLAB, manipulate some values, then save it back with the same format to load back into the instrument software for further analysis...
The issue I am having is the data is of mixed value types and they are kind of all over the place.
The file is tab delimited, I have added arrows eg --> to show the location of the tabs (like notepad++ does)
Scan-42/01
Temperature [K] :--> 295.00
Time [s] :--> 60
"Linspace"
0.01--> 0.96
0.02--> 0.95
0.03--> 0.95
"Logspace"
0.01--> 0.96
0.02--> 0.95
0.04--> 0.94
The data keeps going down but I have cut it off after 3 rows.
The data I need to manipulate will be the Temperature, and some of the values under Linspace and Logspace.
I am currently importing the data like this:
filename = 'test.asc';
delimiter = '\t';
formatSpec = '%s%s%[^\n\r]';
fileID = fopen(filename,'r');
dataArray = textscan(fileID, formatSpec, 'Delimiter', delimiter, 'ReturnOnError', false);
Data in MATLAB looks like this:
Even if I could set up some kind of template in MATLAB where I could get the values necessary, and then save them in exactly this format, that would work fine. The file must be saved as .asc, or the instrument will reject it.
Help is greatly appreciated.
Thanks

Hope this would work for you.
Code
%%// Note: file1 is your input .asc filename and file2 is the output .asc.
%%// Please specify their names before running this.
%%// Statements without a trailing semicolon echo their result to the console.
%%// **** Read in file data ****
%%// importdata is given the filename directly, so no separate fopen/fclose
%%// of file1 is needed. A is used below as a cell array of line strings.
A = importdata(file1,'\n')
%%// Delimiters (mind these assumptions)
linlog_delim1 = '--> ';
temperature_delim1 = 'Temperature [K] :--> ';
%%// Empty rows mark the end of a data section; a 1 is appended so that the
%%// index just past the last row also acts as a terminator
sep1 = cellfun(@(x) isequal(x,''),A)
sep1 = [sep1 ;1]
sep_ind = find(sep1)
%%// Split every row at the delimiter
full_data = regexp(A,linlog_delim1,'split')
%%// Temperature value (second piece of the row that contains the label)
temp_ind = find(~cellfun(@isempty,strfind(A,'Temperature [K] :-->')))
temp_val = str2num(cell2mat(full_data{temp_ind,:}(1,2)))
%%// Linspace values: rows from the one after the "Linspace" header up to
%%// the row before the next separator
sep_linspace = cellfun(@(x) isequal(x,'"Linspace"'),A)
lin_start_ind = find(sep_linspace)+1
lin_stop_ind = sep_ind(find(sep_ind>lin_start_ind,1,'first'))-1
linspace_data = vertcat(full_data{lin_start_ind:lin_stop_ind})
linspace_valid_ind = cellfun(@str2num,linspace_data(:,1))
linspace_valid_val = cellfun(@str2num,linspace_data(:,2))
%%// Logspace values: same scheme, using the "Logspace" header
sep_logspace = cellfun(@(x) isequal(x,'"Logspace"'),A)
log_start_ind = find(sep_logspace)+1
log_stop_ind = sep_ind(find(sep_ind>log_start_ind,1,'first'))-1
logspace_data = vertcat(full_data{log_start_ind:log_stop_ind})
logspace_valid_ind = cellfun(@str2num,logspace_data(:,1))
logspace_valid_val = cellfun(@str2num,logspace_data(:,2))
%%// **** Let us modify some data ****
%%// Demo offsets only; the 3-element vectors assume three rows per section
temp_val = temp_val + 10;
linspace_valid_val_mod1 = linspace_valid_val+[1 2 3]'; %%//'
logspace_valid_val_mod1 = logspace_valid_val+[1 20 300]'; %%//'
%%// **** Write back file data ****
%%// Write back temperature data
A(temp_ind) = {[temperature_delim1,num2str(temp_val)]}
%%// Write back linspace data: rebuild each row as index, delimiter, value
mod_lin_val = cellfun(@strtrim,cellstr(num2str(linspace_valid_val_mod1)),'uni',0)
mod_lin_ind = cellstr(num2str(linspace_valid_ind))
sep_lin = repmat({linlog_delim1},numel(mod_lin_val),1)
A(lin_start_ind:lin_stop_ind)=cellfun(@horzcat,mod_lin_ind,sep_lin,mod_lin_val,'uni',0)
%%// Write back logspace data
mod_log_val = cellfun(@strtrim,cellstr(num2str(logspace_valid_val_mod1)),'uni',0)
mod_log_ind = cellstr(num2str(logspace_valid_ind))
sep_log = repmat({linlog_delim1},numel(mod_log_val),1)
A(log_start_ind:log_stop_ind)=cellfun(@horzcat,mod_log_ind,sep_log,mod_log_val,'uni',0)
%%// Remove leading whitespaces
A = strtrim(A)
%%// Write the modified data, one row of A per output line
fid2 = fopen(file2,'w');
for row = 1:numel(A)
fprintf(fid2,'%s\n',A{row,:});
end
fclose(fid2);
Changes for the demo:
Temperature has 10 added.
"Linspace" has 1, 2 and 3 added to its elements respectively.
"Logspace" has 1, 20 and 300 added to its elements respectively.
Results
Before -
Scan-42/01
Temperature [K] :--> 295.00
Time [s] :--> 60
"Linspace"
0.01--> 0.96
0.02--> 0.95
0.103--> 0.95
"Logspace"
0.01--> 0.96
0.02--> 0.95
0.04--> 0.94
After -
Scan-42/01
Temperature [K] :--> 305
Time [s] :--> 60
"Linspace"
0.01--> 1.96
0.02--> 2.95
0.103--> 3.95
"Logspace"
0.01--> 1.96
0.02--> 20.95
0.04--> 300.94
Edit 1:
Code
%%// I-O filenames
input_filename = 'gistfile1.txt';
output_file = 'gistfile1_out.txt';
%%// Get data from input filename: two tab-delimited string columns per row
delimiter = '\t';
formatSpec = '%s%s%[^\n\r]';
fid = fopen(input_filename,'r');
dataArray = textscan(fid, formatSpec, 'Delimiter', delimiter, 'ReturnOnError', false);
%%// Get data into A (column 1 = label/index text, column 2 = value text)
A(:,1) = dataArray{1,1}
A(:,2) = dataArray{1,2}
%%// Find separator indices: rows whose second column is empty, plus one
%%// extra index just past the last row so the final section is terminated
ind1 = find([cellfun(@(x) isequal(x,''),A(:,2));1])
temperature_ind = find(~cellfun(@isempty,strfind(A,'Temperature')))
temperature_val = str2num(cell2mat(A(temperature_ind,2)))
%%// Linspace values: rows after the "Linspace" header up to the next separator
sep_linspace = cellfun(@(x) isequal(x,'"Linspace"'),A(:,1))
lin_start_ind = find(sep_linspace)+1
lin_stop_ind = ind1(find(ind1>lin_start_ind,1,'first'))-1
linspace_valid_ind = cellfun(@str2num,A(lin_start_ind:lin_stop_ind,1))
linspace_valid_val = cellfun(@str2num,A(lin_start_ind:lin_stop_ind,2))
%%// Logspace values: same scheme, using the "Logspace" header
sep_logspace = cellfun(@(x) isequal(x,'"Logspace"'),A(:,1))
log_start_ind = find(sep_logspace)+1
log_stop_ind = ind1(find(ind1>log_start_ind,1,'first'))-1
logspace_valid_ind = cellfun(@str2num,A(log_start_ind:log_stop_ind,1))
logspace_valid_val = cellfun(@str2num,A(log_start_ind:log_stop_ind,2))
%%// **** Let us modify some data ****
%%// Demo offsets: +10 on temperature, +1,+2,... on Linspace, +10,+20,... on Logspace
temp_val_mod1 = temperature_val + 10;
linspace_valid_val_mod1 = linspace_valid_val+[1:numel(linspace_valid_val)]';
logspace_valid_val_mod1 = logspace_valid_val+10.*[1:numel(logspace_valid_val)]';
%%// **** Write back file data into A ****
A(temperature_ind,2) = cellstr(num2str(temp_val_mod1))
A(lin_start_ind:lin_stop_ind,2) = cellstr(num2str(linspace_valid_val_mod1))
A(log_start_ind:log_stop_ind,2) = cellstr(num2str(logspace_valid_val_mod1))
%%// Write the modified data, re-joining the two columns with a tab
fid2 = fopen(output_file,'w');
for row = 1:size(A,1)
fprintf(fid2,'%s\t%s\n',A{row,1},A{row,2});
end
%%// Close files
fclose(fid);
fclose(fid2);
Results
Before -
Scan-42/01
Temperature [K] : 295.00
Time [s] : 60
"Linspace"
0.01 0.96
0.02 0.95
0.03 0.95
"Logspace"
0.01 0.96
0.02 0.95
0.04 0.94
After -
Scan-42/01
Temperature [K] : 305
Time [s] : 60
"Linspace"
0.01 1.96
0.02 2.95
0.03 3.95
"Logspace"
0.01 10.96
0.02 20.95
0.04 30.94
Please note that the only formatting difference between input and output files is that there is no whitespaced row between "Linspace" and the previous row in the output file, as was there in the input file. This is seen similarly for "Logspace".

I've solved a nearly identical problem once before. The solution goes something like this:
First, you're already splitting your data up into chunks, so that's good. Judging by your comment, it seems that the data is consistently formatted from file to file, but inconsistently formatted in each individual file. That's fine.
What you need to do is iterate through dataArray, find each unique label (such as "Linspace"), and track that label's index. What you'll end up with is a vector of indices that tell you exactly where in dataArray these labels appear. Once you have all of the labels' indices, you need to look at dataArray and see how the data between each label is formatted. Then you'll write some code to break dataArray into sub-arrays. You'll need to write a different sub-array parser for each format.
I know that's a little abstract, so let me try to give you an example.
% Locate the first occurrence of each label. strcmp is case-sensitive, so the
% label text must match the file exactly (including the double quotes).
timeIndex = find(strcmp(dataArray, 'Time'), 1);
linspaceIndex = find(strcmp(dataArray, '"Linspace"'), 1);
logspaceIndex = find(strcmp(dataArray, '"Logspace"'), 1);
% The +3 and -1 offsets are guesses and must be tuned to the actual layout.
linSpaceData = dataArray(linspaceIndex+3:logspaceIndex-1); % This is the "sub-array" I was referring to. It's a little piece of dataArray that contains only the linspace data values.
This is just an example, and will probably not plug-and-play; it's just meant to be a thought-provoker. Note the +3 and -1: those were just guessed. You'll have to empirically determine those for each range, as things like tabs, colons, and spaces can get in the way. That should be enough to get you started on your problem. Let me know if you need clarification, or if this isn't helpful. Good luck!
-Fletch

Related

How to do SUM on array from outside file?

I'm newbie college student for programming studies,
so recently i have task to calculate matrix from outside files for Gauss Jordan Numeric Method, in the txt file i provide has 10 (x) and (y) data, and declare with do functions to calculate the 10 data from the txt file each for x^2, x^3, x^4, xy, x^2y
my question is : how to SUM (calculate total) each x^2, x^3 ... that was calculated by program ? i try do sum file in below and still got errors (the first argument of sum must not a scalar.)
the Fortran apps i use was Plato cc from Silverfrost.
I apologize if my english bad and my pogram looks funny.
i have 10 data in my txt looks like these :
(x) (y)
12 10
5 6
28 8
9 11
20 17
6 24
32 9
2 7
1 30
26 22
in program below i open these files and want each x and y i provide read and calculate to get x^2, x^3, x^4, xy, x^2y
Program Gauss_Jordan
Real x(10),y(10),xj,yj,xj2,xj3,xj4,xjyj,xj2yj
Open (10, file='Data.txt')
Do j = 1,10
Read(10,*) x(j), y(j)
xj2 = x(j)**2
xj3 = x(j)**3
xj4 = x(j)**4
xjyj = x(j)*y(j)
xj2yj = (x(j)**2)*y(j)
Do k = 1,10
T(xj2) = SUM( xj2, dim=1)
T(xj3) = SUM (xj3, dim=1)
T(xj4) = SUM (xj4, dim=1)
T(xjyj) = SUM (xjyj, dim=1)
T(xj2yj) = SUM (xj2yj, dim=1)
End Do
End Do
Close(10)
End
for T(xj2) I want to get one result scalar result from SUM the all xj^2 that program has been calculated.
Like in excel was expected :
(A) is 1st xj^2 value that has been calculated
.
.
.
until (J) is 10th xj^2 value that has been calculated
sxj^2 = SUM(Xj^2)
SUM (A-J)
The 'sum' intrinsic needs an array argument, which we can compute from the input arrays without using a loop, so your program could be:
Program Gauss_Jordan
! Reads 10 (x, y) pairs from Data.txt, builds the element-wise arrays
! x**2, x**3, x**4, x*y and x**2*y, and reduces each to a scalar with SUM.
implicit none
! Every name is declared explicitly so a misspelt variable is a compile error
Integer j
Real x(10), y(10), x2(10), x3(10), x4(10), xy(10), x2y(10)
Real sx2, sx3, sx4, sxy, sx2y
Open(10, file='Data.txt')
Do j = 1, 10
Read (10, *) x(j), y(j)
End Do
Close(10)
! Whole-array expressions: each statement operates on all 10 elements at once
x2 = x**2
x3 = x**3
x4 = x**4
xy = x*y
x2y = (x**2)*y
! SUM takes an array argument and returns the total of its elements
sx2 = SUM(x2)
sx3 = SUM(x3)
sx4 = SUM(x4)
sxy = SUM(xy)
sx2y = SUM(x2y)
End
From what I see I think you are misunderstanding what the SUM intrinsic does. Since your example isn't storing xj2, xj3 etc. in arrays, SUM isn't going to be useful to you. Instead you could declare totals as scalars (as you described you wanted) and simply add the individual xj2 variables in a loop as in the example below.
Also, you should get in the habit of using the implicit none declaration. It will save you from unexpected errors due to spelling mistakes.
Program Gauss_Jordan
! Reads 10 (x, y) pairs from Data.txt and accumulates running totals of
! x**2, x**3, x**4, x*y and x**2*y in scalar variables.
implicit none
! Per-row values (the unused xj and yj declarations have been dropped)
Real x(10),y(10),xj2,xj3,xj4,xjyj,xj2yj
! Running totals, one per per-row value
real :: Txj2,Txj3,Txj4,Txjyj,Txj2yj
integer :: j
! Totals must start at zero before accumulation
Txj2 = 0
Txj3 = 0
Txj4 = 0
Txjyj= 0
Txj2yj= 0
Open (10, file='Data.txt')
Do j = 1,10
Read(10,*) x(j), y(j)
xj2 = x(j)**2
xj3 = x(j)**3
xj4 = x(j)**4
xjyj = x(j)*y(j)
xj2yj = (x(j)**2)*y(j)
! Add this row's contribution to each total
Txj2 = Txj2 + xj2
Txj3 = Txj3 + xj3
Txj4 = Txj4 + xj4
Txjyj = Txjyj + xjyj
Txj2yj = Txj2yj + xj2yj
End Do
! Only the x**2 total is printed; the other totals are computed but not shown
print *, 'Txj2 = ', Txj2
Close(10)
End
When I ran this I got the output below which is what I believe you intended:
3175

SAS array unable to process long list of variables

I am trying to log, square, cubic and log-odds transform my input data to provide an exhaustive overview of the best performing transformation in univariate regression
I have tried the following code on a dataset with 1,000 variables - It returns an error / runs out of memory or simply cannot execute. Are there any limitations with transforming variables en-masse in this way using arrays?
/*Create a table for reference*/
DATA input_data;
ARRAY var_[*] var_1-var_1000;
DO i = 1 to 1000;
DO i = 1 to 1000;
var_(i)= i*j;
output;
END;
END;
RUN;
/*Log, square, cubic, logit transform all variables*/
DATA input_transform;
SET input_data;
ARRAY var[*] var_1-var_1000;
ARRAY log[*] log_1-log_1000;
ARRAY logit[*] logit_1-logit_1000;
ARRAY sq[*] sq_1-sq_1000;
ARRAY cubic[*] cubic_1-cubic_1000;
DO i = 1 to 1000;
log(i) = log(var(i));
logit(i) = log((var(i))/(1-var(i)));
sq(i) = var(i)**2;
cubic(i) = var(i)**3;
END;
RUN;
A new dataset with 5000 variables each with the respective transformation
You are using I as the index variable for both of your two nested do loops. That is probably messing them up.
Also your first data step is writing 1,000,000 observations of 1,002 variables with only the lower left triangle of the "array" filled in. Do you really want the OUTPUT statement inside the loop?
Hypothetically there are no issues with this, as long as your code is correct. Here's an example and the log.
/*Example: build a &size.-column test table, then derive four transformed copies of every column*/
option notes;
/*size sets both the number of var_ columns and the loop bounds below*/
%let size=1000;
/*Create a table for reference*/
DATA input_data;
ARRAY var_[*] var_1-var_&size.;
/*Outer loop: one observation per value of i*/
DO i = 1 to &size.;
/*Inner loop uses its own index j and fills every var_ column of the current row*/
DO j = 1 to &size.;
var_(j)= i*j;
END;
/*OUTPUT sits after the inner loop, so a row is written once per i rather than once per cell*/
output;
END;
RUN;
/*Log, square, cubic, logit transform all variables*/
DATA input_transform;
SET input_data;
/*Array names carry a leading underscore - presumably so that _log does not clash with the LOG function; confirm*/
ARRAY _var[*] var_1-var_&size.;
ARRAY _log[*] log_1-log_&size.;
ARRAY _logit[*] logit_1-logit_&size.;
ARRAY _sq[*] sq_1-sq_&size.;
ARRAY _cubic[*] cubic_1-cubic_&size.;
DO i = 1 to &size.;
_log(i) = log(_var(i));
/*NOTE(review): despite the name, _logit is filled with the square root of the input here, not a logit transform*/
_logit(i) = sqrt(_var(i));
_sq(i) = _var(i)**2;
_cubic(i) = _var(i)**3;
END;
RUN;
and the log:
1576 option notes;
1577 %let size=1000;
1578
1579 /*Create a table for reference*/
1580 DATA input_data;
1581 ARRAY var_[*] var_1-var_&size.;
1582
1583 DO i = 1 to &size.;
1584 DO j = 1 to &size.;
1585 var_(j)= i*j;
1586 END;
1587 output;
1588 END;
1589 RUN;
NOTE: The data set WORK.INPUT_DATA has 1000 observations and 1002
variables.
NOTE: DATA statement used (Total process time):
real time 0.03 seconds
cpu time 0.03 seconds
1590
1591 /*Log, square, cubic, logit transform all variables*/
1592 DATA input_transform;
1593 SET input_data;
1594 ARRAY _var[*] var_1-var_&size.;
1595 ARRAY _log[*] log_1-log_&size.;
1596 ARRAY _logit[*] logit_1-logit_&size.;
1597 ARRAY _sq[*] sq_1-sq_&size.;
1598 ARRAY _cubic[*] cubic_1-cubic_&size.;
1599
1600 DO i = 1 to &size.;
1601 _log(i) = log(_var(i));
1602 _logit(i) = sqrt(_var(i));
1603 _sq(i) = _var(i)**2;
1604 _cubic(i) = _var(i)**3;
1605 END;
1606 RUN;
NOTE: There were 1000 observations read from the data set
WORK.INPUT_DATA.
NOTE: The data set WORK.INPUT_TRANSFORM has 1000 observations and 5002
variables.
NOTE: DATA statement used (Total process time):
real time 0.12 seconds
cpu time 0.10 seconds

matlab complex for-loop correlation calcul

This is the script that I have. It works till the ------ separation. Under I do not get any error from Matlab, but neither do I get a return of bestDx nor bestDy. Please help. (The first part is given just to put you in context)
%%
% Variables after running script Read_eA3_file.m
%date_time_UTC
%reflectivity
%clutter_mask
%Convert units
dBZ = reflectivity * 0.375 - 30;
dBZ_Mask = clutter_mask * 0.375 - 30;
%Replace clutter values with NaN
weather = NaN(size(dBZ)); %initialise to constant
weather(dBZ>=dBZ_Mask) = dBZ(dBZ>=dBZ_Mask); %copy values when A >= B
%Reduce to range -- those are 384x384 arrays
dBZ_range = dBZ(:,:,1:16); %16:18 to 16:23 included
weather_range = weather(:,:,1:16); %16:18 to 16:23 included
weather1618 = weather(:,:,1); %16:18 map only
weather1623 = weather(:,:,16); %16:23 map only
% Plot maps
image(imrotate(-weather1618,90)); %of 16:18
image(imrotate(-weather1623,90)); %of 16:23
%Find x,y of strongest dBZ
%Since the value are all negative. I look for their minimun
[M,I] = min(weather1618(:)); %for 16:18
[I_row, I_col] = ind2sub(size(weather1618),I); %values are 255 and 143
[M2,I2] = min(weather1623(:)); %for 16:23
[I2_row, I2_col] = ind2sub(size(weather1623),I2); %values are 223 and 7
%Calc displacement
%I get a value of 139.7140
max_displ=sqrt((I2_row-I_row)^2+(I2_col-I_col)^2); %between 1618 and 1623
%%
% -----Section below does not work; ONLY RUN the section ABOVE---------
%% Find Dx Dy for max_corr between two maps
maxCoeff=0;
weather1618Modified = zeros(384,384); %create weather array for time range
%weather1618Modified(:) = {NaN}; % Matlab cannot mix cell & double
%%
for x = 1:384
for y = 1:384
%30 pixel appx.
for Dx = -max_displ:30: max_displ
for Dy = -max_displ:30: max_displ
%Limit range of x+Dx and y+Dy to 1:384
if x+Dx<1 | y+Dy<1 | x+Dx>384 | y+Dy>384
continue
%weather1618Modified is the forecasted weather1823
weather1618Modified(x+Dx,y+Dy) = weather1618(x,y)
%Find the best correlation; Is corrcoef the right formula?
newCoeff=corrcoef(weather1623,weather1618Modified);
if newCoeff>maxCoeff
maxCoeff=newCoeff;
bestDx=Dx;
bestDy=Dy;
end
end
end
end
end
end
%% Calc displacement
bestDispl = sqrt(bestDx^2+bestDy^2); %bestDispl for a 5 min frame
%Calc speed
speed = bestDispl/time;
You have to delete the continue statement after the first if (or place it somewhere else).
The continue statement makes the program skip the remaining part of the for-loop and go directly to the next iteration. Therefore bestDx and bestDy will never be set.
Documentation: https://se.mathworks.com/help/matlab/ref/continue.html

Saving output in the form of an array

I want to save the output out in the form of a column vector which is currently coming out as
ans(:,:,1) =
0
ans(:,:,2) =
0
ans(:,:,3) =
0
ans(:,:,4) =
0
ans(:,:,5) =
0
ans(:,:,6) =
0
ans(:,:,7) =
-5.5511e-017
function [out]= myfun1(in)
in= importdata('X.dat');
l= length(in);
dt=0.05;
l1=(l-1)*dt+1;
ts = timeseries(in, 1:.05:l1);
ts1= resample(ts, 1:1:l1);
out= ts1.data;
I think you might have a problem with the output as a whole. I assume that you want to resample the vector 'in' at sample points 1:1:l1. Resample should be called as:
resample(timeseriesVector, UpsampleValue, DownsampleValue)
This will generate a new timeseries vector at the rate of UpsampleValue/DownsampleValue.
I am sorry if I have misinterpreted your intentions. If you could provide some basic sample data it might be easier to debug.

how to calculate rolling volatility

I am trying to design a function that will calculate 30 day rolling volatility.
I have a file with 3 columns: date, and daily returns for 2 stocks.
How can I do this? I have a problem in summing the first 30 entries to get my vol.
Edit:
So it will read an excel file, with 3 columns: a date, and daily returns.
daily.ret = read.csv("abc.csv")
e.g. date stock1 stock2
01/01/2000 0.01 0.02
etc etc, with years of data. I want to calculate rolling 30 day annualised vol.
This is my function:
calc_30day_vol = function()
{
stock1 = abc$stock1^2
stock2 = abc$stock1^2
j = 30
approx_days_in_year = length(abc$stock1)/10
vol_1 = 1: length(a1)
vol_2 = 1: length(a2)
for (i in 1 : length(a1))
{
vol_1[j] = sqrt( (approx_days_in_year / 30 ) * rowSums(a1[i:j])
vol_2[j] = sqrt( (approx_days_in_year / 30 ) * rowSums(a2[i:j])
j = j + 1
}
}
So stock1, and stock 2 are the squared daily returns from the excel file, needed to calculate vol. Entries 1-30 for vol_1 and vol_2 are empty since we are calculating 30 day vol. I am trying to use the rowSums function to sum the squared daily returns for the first 30 entries, and then move down the index for each iteration.
So from day 1-30, day 2-31, day 3-32, etc, hence why I have defined "j".
I'm new at R, so apologies if this sounds rather silly.
This should get you started.
First I have to create some data that look like you describe
library(quantmod)
# Download SPY and DIA from Yahoo; the series are referenced below as the
# objects `SPY` and `DIA`
tickers <- c("SPY", "DIA")
getSymbols(tickers, src = "yahoo")
# Rate of change of each adjusted series, merged on the dates present in
# both (all = FALSE). The first row is dropped - presumably the leading NA
# left by ROC; confirm.
m <- merge(
  ROC(Ad(SPY)),
  ROC(Ad(DIA)),
  all = FALSE
)[-1, ]
# Build a plain data frame whose first column is the date as mm/dd/yyyy text
date_txt <- format(index(m), "%m/%d/%Y")
dat <- data.frame(date = date_txt, coredata(m))
# Write it to a temporary csv without row names
tmpfile <- tempfile()
write.csv(dat, file = tmpfile, row.names = FALSE)
Now I have a csv with data in your very specific format.
Use read.zoo to read csv and then convert to an xts object (there are lots of ways to read data into R. See R Data Import/Export)
# Read the csv as a zoo series indexed by its mm/dd/yyyy date column, then
# convert to xts; each column of `r` holds the daily returns of one series
z <- read.zoo(tmpfile, sep = ",", header = TRUE, format = "%m/%d/%Y")
r <- as.xts(z)
# Rolling standard deviation over a 30-observation window, annualised by
# multiplying by the square root of 252
rolling_vol <- function(x) {
  runSD(x, n = 30) * sqrt(252)
}
# Apply the function column by column (MARGIN = 2)
vols.mat <- apply(r, 2, rolling_vol)
# `apply` returns a `matrix`; `reclass` restores the `xts` class using the
# attributes of `r`
vols.xts <- reclass(vols.mat, r)
tail(vols.xts)
# SPY.Adjusted DIA.Adjusted
#2012-06-22 0.1775730 0.1608266
#2012-06-25 0.1832145 0.1640912
#2012-06-26 0.1813581 0.1621459
#2012-06-27 0.1825636 0.1629997
#2012-06-28 0.1824120 0.1630481
#2012-06-29 0.1898351 0.1689990
# Clean-up: remove the temporary csv
unlink(tmpfile)

Resources