Conditionally replace multiple values in multiple columns - arrays

I have a comma-delimited file where some values can be missing, like:
1,f,12,f,t,18
2,t,17,t, ,17
3,t,15, ,f,16
I want to change some of the columns to numeric: f to 0 and t to 1. Here I want to change only columns 2 and 5, leaving column 4 untouched. My result file should look like:
1,0,12,f,1,18
2,1,17,t, ,17
3,1,15, ,0,16
I can use the statement
awk -F, -v OFS=',' '{ if ( $2 ~ /t/ ) { $2 = 1 } else if ( $2 ~ /f/ ) { $2 = 0 }; print}' test.csv
to change individual columns.
I can also use a loop like
awk -F, -v OFS=',' 'BEGIN {
FS = OFS = ","
}
{
for (column = 1; column <= 4; ++column) {
if ($column ~ /t/) {
$column = 1
}
else if($column ~ /f/) {
$column = 0
}
}
print
}
' test.csv
to replace multiple columns when they are adjacent. How do I change the for loop so that it touches only specific columns? I know there is a for-each style loop that should do this, but I couldn't get it to work. Also, how can I assign multiple values to an array in a single statement, like
a =[1, 2, 3, 4]

You can use this awk:
awk 'BEGIN{ FS=OFS=","; a[2]; a[5] }
{ for (i in a) if ($i=="f") $i=0; else if ($i=="t") $i=1 } 1' file
1,0,12,f,1,18
2,1,17,t, ,17
3,1,15, ,0,16
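The answer above hard-codes the column numbers as array keys in BEGIN. To get closer to the a = [1, 2, 3, 4] idea from the question, one option (a sketch, not the only way; the cols variable name is just an example) is to pass the list in as a string and split it into array keys:

awk -v cols='2,5' '
BEGIN {
  FS = OFS = ","
  # turn the "2,5" list into the keys of array a; the values do not matter
  n = split(cols, tmp, ",")
  for (j = 1; j <= n; j++) a[tmp[j]]
}
{
  for (i in a) {
    if ($i == "f") $i = 0
    else if ($i == "t") $i = 1
  }
}
1' test.csv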

Related

Troubleshoot for loop with arithmetic in shell

I'm trying to write a loop that pulls sequencing metrics from column 2 of each txt file (ending in full_results.txt) and writes everything into a combined tsv (ideally with headers as well, but I haven't been able to do that).
The script below makes a tsv with all of the columns I want, but it stopped looping. The loop was working before I added the last two columns, so I'm not sure whether adding the fields with arithmetic changed anything. Let me know if there is a cleaner way to write this! Thanks for your input.
{
for i in *full_results.txt;
do
[ -d "$i" ] && continue
[ -s "$1" ] && continue
total=$(awk ' $1=="NORMALIZED_READ_COUNT" { print $2 } ' $i)
trimmed=$(awk ' $1=="RAW_FRAGMENT_TOTAL" { print $2 } ' $i)
aligned=$(awk ' $1=="RAW_FRAGMENT_FILTERED" { print $2 } ' $i)
molbin=$(awk ' $1=="AVERAGE_UNIQUE_DNA_READS_PER_GSP2" { print $2 } ' $i)
startsite=$(awk ' $1=="AVERAGE_UNIQUE_DNA_START_SITES_PER_GSP2" { print $2 } ' $i)
dedup=$(awk ' $1=="ON_TARGET_DEDUPLICATION_RATIO" { print $2 } ' $i)
printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" "$i" $total $trimmed $aligned $molbin $startsite $dedup $((total - trimmed)) "$(( ( (total - trimmed) * 100) / total))"
done;
} > sequencing_metrics.tsv
Output:
NR02_31_S31_merged_R1_001_full_results.txt 7095319 6207119 6206544 1224.43 391.65 2.74:1 888200 12
Intended output: the same as above but looped for all files in the folder
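No answer is included here, but one likely culprit: if any of the extracted values is empty (say a file without NORMALIZED_READ_COUNT), total expands to nothing, the final arithmetic becomes a division by zero, and bash treats that expansion error as fatal in a script, which would explain the loop stopping. Below is a sketch of a guarded rewrite, one awk pass per file, assuming the metric names are exactly as shown (output file name unchanged):

for i in *full_results.txt; do
  [ -f "$i" ] || continue
  awk -v OFS='\t' -v file="$i" '
    $1 == "NORMALIZED_READ_COUNT"                   { total   = $2 }
    $1 == "RAW_FRAGMENT_TOTAL"                      { trimmed = $2 }
    $1 == "RAW_FRAGMENT_FILTERED"                   { aligned = $2 }
    $1 == "AVERAGE_UNIQUE_DNA_READS_PER_GSP2"       { molbin  = $2 }
    $1 == "AVERAGE_UNIQUE_DNA_START_SITES_PER_GSP2" { start   = $2 }
    $1 == "ON_TARGET_DEDUPLICATION_RATIO"           { dedup   = $2 }
    END {
      # skip incomplete files instead of dividing by zero
      if (total == "" || total == 0) exit
      print file, total, trimmed, aligned, molbin, start, dedup,
            total - trimmed, int((total - trimmed) * 100 / total)
    }' "$i"
done > sequencing_metrics.tsv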

Getting all values of various rows which have the same value in one column with awk

I have a data set (test-file.csv) with three columns:
node,contact,mail
AAAA,Peter,peter#anything.com
BBBB,Hans,hans#anything.com
CCCC,Dieter,dieter#anything.com
ABABA,Peter,peter#anything.com
CCDDA,Hans,hans#anything.com
I would like to extend the header by the column count and rename node to nodes.
Furthermore, all entries should be sorted by the second column (mail).
In the column count I would like to have the number of occurrences of the column mail;
in nodes, all entries having the same value in the column mail should be printed (space-separated and alphabetically sorted).
This is what I try to achieve:
contact,mail,count,nodes
Dieter,dieter#anything.com,1,CCCC
Hans,hans#anything.com,2,BBBB CCDDA
Peter,peter#anything.com,2,AAAA ABABA
I have this awk-command:
awk -F"," '
BEGIN{
FS=OFS=",";
printf "%s,%s,%s,%s\n", "contact","mail","count","nodes"
}
NR>1{
counts[$3]++; # Increment count of lines.
contact[$2]; # contact
}
END {
# Iterate over all third-column values.
for (x in counts) {
printf "%s,%s,%s,%s\n", contact[x],x,counts[x],"nodes"
}
}
' test-file.csv | sort --field-separator="," --key=2 -n
However, this is my result :-(
Nothing but the count of occurrences works.
,Dieter#anything.com,1,nodes
,hans#anything.com,2,nodes
,peter#anything.com,2,nodes
contact,mail,count,nodes
Any help appreciated!
You may use this gnu awk:
awk '
BEGIN {
FS = OFS = ","
printf "%s,%s,%s,%s\n", "contact","mail","count","nodes"
}
NR > 1 {
++counts[$3] # Increment count of lines.
name[$3] = $2
map[$3] = ($3 in map ? map[$3] " " : "") $1
}
END {
# Iterate over all third-column values.
PROCINFO["sorted_in"]="@ind_str_asc";
for (k in counts)
print name[k], k, counts[k], map[k]
}
' test-file.csv
Output:
contact,mail,count,nodes
Dieter,dieter#anything.com,1,CCCC
Hans,hans#anything.com,2,BBBB CCDDA
Peter,peter#anything.com,2,AAAA ABABA
With your shown samples, please try the following, written and tested in GNU awk.
awk '
BEGIN{ FS=OFS="," }
FNR==1{
sub(/^[^,]*,/,"")
$1=$1
print $0,"count,nodes"
}
FNR>1{
nf=$2
mail[nf]=$NF
NF--
arr[nf]++
val[nf]=(val[nf]?val[nf] " ":"")$1
}
END{
for(i in arr){
print i,mail[i],arr[i],val[i] | "sort -t, -k1"
}
}
' Input_file
Explanation: Adding detailed explanation for above.
awk ' ##Starting awk program from here.
BEGIN{ FS=OFS="," } ##In BEGIN section setting FS, OFS as comma here.
FNR==1{ ##if this is first line then do following.
sub(/^[^,]*,/,"") ##Substituting everything till 1st comma here with NULL in current line.
$1=$1 ##Reassigning 1st field to itself.
print $0,"count,nodes" ##Printing headers as per need to terminal.
}
FNR>1{ ##If line is Greater than 1st line then do following.
nf=$2 ##Creating nf with 2nd field value here.
mail[nf]=$NF ##Creating mail with nf as index and value is last field value.
NF-- ##Decreasing value of current number of fields by 1 here.
arr[nf]++ ##Creating arr with index of nf and keep increasing its value with 1 here.
val[nf]=(val[nf]?val[nf] " ":"")$1 ##Creating val with index of nf and keep adding $1 value in it.
}
END{ ##Starting END block of this program from here.
for(i in arr){ ##Traversing through arr in here.
print i,mail[i],arr[i],val[i] | "sort -t, -k1" ##printing values to get expected output and sorting it also by pipe here as per requirement.
}
}
' Input_file ##Mentioning Input_file name here.
2nd solution: in case you want to sort by the 2nd and 3rd fields, then try the following.
awk '
BEGIN{ FS=OFS="," }
FNR==1{
sub(/^[^,]*,/,"")
$1=$1
print $0,"count,nodes"
}
FNR>1{
nf=$2 OFS $3
NF--
arr[nf]++
val[nf]=(val[nf]?val[nf] " ":"")$1
}
END{
for(i in arr){
print i,arr[i],val[i] | "sort -t, -k1"
}
}
' Input_file

How to find maximum value in a column with awk

I have a file with two sets of data divided by a blank line:
a 3
b 2
c 1
e 5
d 8
f 1
Is there a way to find the maximum value of the second column in each set and print the corresponding line with awk? The result should be:
a 3
d 8
Thank you.
Could you please try the following, written and tested in GNU awk on your shown samples.
awk '
!NF{
if(max!=""){ print arr[max],max }
max=""
}
{
max=( (max<$2) || (max=="") ? $2 : max )
arr[$2]=$1
}
END{
if(max!=""){ print arr[max],max }
}
' Input_file
Explanation: Adding detailed explanation for above.
awk ' ##Starting awk program from here.
!NF{ ##if NF is NULL then do following.
if(max!=""){ print arr[max],max } ##Checking if max is SET then print arr[max] and max.
max="" ##Nullifying max here.
}
{
max=( (max<$2) || (max=="") ? $2 : max ) ##Checking condition if max is greater than 2nd field then keep it as max or change max value as 2nd field.
arr[$2]=$1 ##Creating arr with 2nd field index and 1st field as value.
}
END{ ##Starting END block of this program from here.
if(max!=""){ print arr[max],max } ##Checking if max is SET then print arr[max] and max.
}
' Input_file ##mentioning Input_file name here.
You may use this alternate gnu awk:
awk -v RS= '{
  max = ""
  # gawk 4-arg split: the substrings matching the regex land in m[],
  # so m[1], m[3], ... are the labels and m[2], m[4], ... the values
  split($0, a, /[^[:space:]]+/, m)
  for (i=1; i in m; i+=2)
    if (!max || m[i+1] > max) {
      mi = i
      max = m[i+1]
    }
  print m[mi], m[mi+1]
}' file
a 3
d 8
Another awk:
$ awk '
!$0 {
print n
m=n=""
}
$2>m {
m=$2
n=$0
}
END {
print n
}' file
Output:
a 3
d 8
another awk
$ awk '{cmd="sort -k2nr | head -1"} !NF{close(cmd)} {print | cmd}' file
a 3
d 8
runs the command for each block to find the block max.
You could try to separate the data sets by doing:
awk -v RS= 'NR == 1 {print}' yourfile > anotherfile
This will return the first data set; then change NR == 1 to NR == 2 to get the second data set, and find the maximum in each data set as suggested here.
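Building on that suggestion, here is a self-contained paragraph-mode sketch (standard awk, not one of the answers above) that prints the line holding the maximum second-column value in each blank-line-separated block:

awk -v RS= '{
  max = ""
  # with RS="", each block is one record and its fields come in label/value pairs
  for (i = 1; i <= NF; i += 2) {
    if (max == "" || $(i+1) + 0 > max + 0) {
      max = $(i+1)
      line = $i " " $(i+1)
    }
  }
  print line
}' file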

Perl: Inserting values into specific columns of CSV file

I have CSV data of the form:
S.No,Label,Customer1,Customer2,Customer3...
1,label1,Y,N,Y
2,label2,N,Y,N
...
I need to reproduce the "label" to the left of "customer" columns marked with Y - and have nothing ("") to the left of columns marked with N.
Expected output:
S.No,Label,Customer1,Customer1,Customer2,Customer2,Customer3,Customer3...
1,label1,label1,Y,"",N,label1,Y
2,label2,"",N,label2,Y,"",N
When opened using Excel, it would look like this:
S.No  Label   Customer1  Customer1  Customer2  Customer2  Customer3  Customer3...
1     label1  label1     Y                     N          label1     Y
2     label2             N          label2     Y                     N
The two leftmost columns, referring to S.No and the original "Label" column, are constant.
What is the simplest way to do this? I tried the following code:
use strict;
use warnings;
my $nonIncludesFile = "nonIncludes.csv";
open(my $xfh, "+>", $nonIncludesFile) or warn "Unable to open $nonIncludesFile, $!";
chomp( my $header = <$xfh> );
my @names = split ",", $header;
my @names1;
my @fields;
my @fields1;
for(my $j=0; $j< scalar(@names); $j++)
{
$names1[$j] = $names[$j];
}
while(<$xfh>)
{
my $nonIncLine = $_;
$nonIncLine = chomp($nonIncLine);
@fields = split ",", $nonIncLine;
next if $. == 1; #skip the first line
for(my $i = 0; $i < scalar(@fields) -2; $i++) #Number of "customers" = scalar(@fields) -2
{
$fields1[0] = $fields[0];
$fields1[1] = $fields[1];
if('Y' eq $fields[ $i + 2 ])
{
$fields1[$i+2] = 'Y';
substr(@fields1, $i + 1, 0, $fields[1]); #insert the label to the left - HERE
}
else
{
$fields1[$i+2] = 'N';
substr(@fields1, $i + 1, 0, "");
}
}
}
print $xfh @names1;
print $xfh @fields1;
close($xfh);
This however complains of "substr outside of string" at the line marked by "HERE".
What am I doing wrong? And is there any simpler (and better) way to do this?
Something like this maybe?
#!/usr/bin/perl
use strict;
use warnings;
#read the header row
chomp( my ( $sn, $label, @customers ) = split( /,/, <DATA> ) );
#double the 'customers' column headings (one is suffixed "_label")
print join( ",", $sn, $label, map { $_ . "_label", $_ } #customers ), "\n";
#iterate data
while (<DATA>) {
#strip trailing linefeed
chomp;
#extract fields with split - note breaks if you've quoted commas inline.
my ( $sn, $label, @row ) = split /,/;
print "$sn,$label,";
#iterate Y/N values, and either prints "Y" + label, or anything else + blank.
foreach my $value (@row) {
print join( ",", $value eq "Y" ? $label : "", $value ),",";
}
print "\n";
}
__DATA__
S.No,Label,Customer1,Customer2,Customer3
1,label1,Y,N,Y
2,label2,N,Y,N
Assumes you don't have any fruity special characters (e.g. commas) in the fields, because it'll break if you do, and you might want to consider Text::CSV instead.
It is always much better to post some usable test data than to write a question like this one.
However, it looks like your data has no quoted fields or escaped characters, so it looks like you can just use split and join to process the CSV data
Here's a sample Perl program that fulfils your requirement. The example output uses your data as it is. Each line of data has to be processed backwards so that the insertions don't affect the indices of elements that are yet to be processed
use strict;
use warnings 'all';
use feature 'say';
while ( <DATA> ) {
chomp;
my @fields = split /,/;
for ( my $i = $#fields; $i > 1; --$i ) {
my $newval =
$. == 1 ? $fields[$i] :
lc $fields[$i] eq 'y' ? $fields[1] :
'';
splice @fields, $i, 0, $newval;
}
say join ',', @fields;
}
__DATA__
S.No,Label,Customer1,Customer2,Customer3...
1,label1,Y,N,Y
2,label2,N,Y,N
output
S.No,Label,Customer1,Customer1,Customer2,Customer2,Customer3...,Customer3...
1,label1,label1,Y,,N,label1,Y
2,label2,,N,label2,Y,,N
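As an aside that is not part of the original answers: since the rest of this page leans heavily on awk, the same transformation can also be sketched there, under the same assumption that no field contains an embedded comma:

awk 'BEGIN { FS = OFS = "," }
{
  out = $1 OFS $2
  for (i = 3; i <= NF; i++) {
    # header row: repeat the column name; data rows: label before each Y, empty otherwise
    out = out OFS (NR == 1 ? $i : ($i == "Y" ? $2 : "")) OFS $i
  }
  print out
}' file.csv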

Extract columns from every six lines

I have a file that looks like this:
194170,46.9,42.2
194170,47.7,40.0
194170,48.5,42.0
194170,48.6,43.0
194170,49.8,39.2
194170,50.2,43.3
194179,44.9,36.9
194179,45.3,36.3
194179,46.4,36.9
194179,47.5,34.4
194179,48.0,40.0
194179,49.6,37.1
194184,52.8,51.1
194184,52.9,49.8
194184,54.0,51.9
194184,56.8,54.9
194184,57.6,53.6
194184,57.8,52.9
...
For a given line, the first number is an ID, and the second and third number are what I'm interested in. For those lines with the same ID (that is, every six lines), the numbers in the same column are numbers for consecutive years. I want to end up with a file that looks like this:
194170,46.9,47.7,48.5,48.6,49.8,50.2
194170,42.2,40.0,42.0,43.0,39.2,43.3
194179,44.9,45.3,46.4,47.5,48.0,49.6
194179,36.9,36.3,36.9,34.4,40.0,37.1
That is, for lines with the same ID, I want to group the consecutive numbers from the second column together, and likewise with the third column.
Is this possible to do with awk/sed/others?
Another answer with awk:
awk -F, '{a[$1] = a[$1]","$2}END{for(i in a) print i a[i]}' yourfile
For two columns:
awk -F, '{a[$1] = a[$1]","$2;b[$1] = b[$1]","$3}END{for(i in a) print i a[i]"\n"i b[i]}' yourfile
Anyway, I prefer tidyR in R for that kind of task.
With awk:
awk -F',' '{ a[$1] = a[$1] ? a[$1] FS $2 : $2 ; b[$1] = b[$1] ? b[$1] FS $3 : $3}
END { for(idx in a){ print idx,a[idx] ; print idx,b[idx]}}' yourfile
Explanation:
-F Field separator
a[] will have second column values
b[] will have third column values
END{} printing the values
Example:
$ awk -F',' '{ a[$1] = a[$1] ? a[$1] FS $2 : $2 ; b[$1] = b[$1] ? b[$1] FS $3 : $3}
END { for(idx in a){ print idx,a[idx] ; print idx,b[idx]}}' yourfile
194170 46.9,47.7,48.5,48.6,49.8,50.2
194170 42.2,40.0,42.0,43.0,39.2,43.3
194184 52.8,52.9,54.0,56.8,57.6,57.8
194184 51.1,49.8,51.9,54.9,53.6,52.9
194179 44.9,45.3,46.4,47.5,48.0,49.6
194179 36.9,36.3,36.9,34.4,40.0,37.1
Another awk version, which doesn't use arrays and maintains the original order (not using arrays could be an advantage if it's a very large file that you don't want to have to load all of into memory before printing --- otherwise, the array version is fine, assuming you don't care about the ordering).
BEGIN { FS = OFS = "," }
!prev_id { prev_id = $1 }
$1 == prev_id { r1 = r1 OFS $2; r2 = r2 OFS $3 }
$1 != prev_id { print prev_id r1 ORS prev_id r2;
r1 = OFS $2; r2 = OFS $3; prev_id = $1 }
END { print prev_id r1 ORS prev_id r2 }
$ awk -f v3.awk file.txt
194170,46.9,47.7,48.5,48.6,49.8,50.2
194170,42.2,40.0,42.0,43.0,39.2,43.3
194179,44.9,45.3,46.4,47.5,48.0,49.6
194179,36.9,36.3,36.9,34.4,40.0,37.1
194184,52.8,52.9,54.0,56.8,57.6,57.8
194184,51.1,49.8,51.9,54.9,53.6,52.9
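If you like the array approach but also want the groups printed in the order their IDs first appear (rather than awk's unspecified for (i in a) order), one sketch is to record that order in a separate list:

awk -F',' '
!($1 in a) { order[++n] = $1 }        # remember first-seen order of the IDs
{
  a[$1] = (a[$1] ? a[$1] FS : "") $2  # collect second-column values
  b[$1] = (b[$1] ? b[$1] FS : "") $3  # collect third-column values
}
END {
  for (j = 1; j <= n; j++) {
    id = order[j]
    print id FS a[id]
    print id FS b[id]
  }
}' yourfile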
