I need to extract by unique genus (first part of the name of species) in one column but with by biggest number in another column in a CSV file when having multiples of the same name.
So if have multiple genus (same first name) then take the biggest number in the last column to select which will represent that genus.
I have extracted the information into arrays, but I am having trouble with combining the two in order to select. I was using
https://perlmaven.com/unique-values-in-an-array-in-perl
to help but I need to include biggest number in last column when have the same genus situation.
use strict;
use warnings;
open taxa_fh, '<', "$ARGV[0]" or die qq{Failed to open "$ARGV[0]" for input: $!\n};
open match_fh, ">$ARGV[0]_genusLongestLEN.csv" or die qq{Failed to open for output: $!\n};my #unique;
my %seen;
my %hash;
while ( my $line = <taxa_fh> ) {
chomp( $line );
my #parts = split( /,/, $line );
my #name = split( / /, $parts[3]);
my #A = $name[0];
my #B = $parts[5];
#seen{#A} = ();
my #merged = (#A, grep{!exists $seen{$_}} #B);
my #merged = (#A, #B);
#hash{#A} = #B;
print "$line\n";
}
close taxa_fh;
close match_fh;
Input example:
AB179735.1.1711,AB179735.1.1711,278983,Eucyrtidium hexagonatum,0,1600
AB179736.1.1725,AB179736.1.1725,278986,Pterocorys zancleus,0,1763
AB181888.1.1758,AB181888.1.1758,281609,Protoperidinium crassipes,0,1700
AB181890.1.1709,AB181890.1.1709,281610,Protoperidinium denticulatum,0,1800
AB181892.1.1738,AB181892.1.1738,281611,Protoperidinium divergens,0,1800
AB181894.1.1744,AB181894.1.1744,281612,Protoperidinium leonis,0,1500
AB181899.1.1746,AB181899.1.1746,281613,Protoperidinium pallidum,0,1600
AB181902.1.1741,AB181902.1.1741,261845,Protoperidinium pellucidum,0,1750
AB181904.1.1734,AB181904.1.1734,281614,Protoperidinium punctulatum,0,1599
AB181907.1.1687,AB181907.1.1687,281615,Protoperidinium thorianum,0,1600
AB120001.1.1725,AB120001.1.1725,244960,Gyrodinium spirale,0,1500
AB120002.1.1725,AB120002.1.1725,244961,Gyrodinium fusiforme,0,1800
AB120003.1.1724,AB120003.1.1724,244962,Gyrodinium rubrum,0,1700
AB120004.1.1723,AB120004.1.1723,244963,Gyrodinium helveticum,0,1500
AB120309.1.1800,AB120309.1.1800,4442,Camellia sinensis,0,1700
Wanted output:
AB179735.1.1711,AB179735.1.1711,278983,Eucyrtidium hexagonatum,0,1600
AB179736.1.1725,AB179736.1.1725,278986,Pterocorys zancleus,0,1763
AB181890.1.1709,AB181890.1.1709,281610,Protoperidinium denticulatum,0,1800
AB120002.1.1725,AB120002.1.1725,244961,Gyrodinium fusiforme,0,1800
AB120309.1.1800,AB120309.1.1800,4442,Camellia sinensis,0,1700
use Text::CSV_XS qw( );
my $csv = Text::CSV_XS->new({
auto_diag => 2,
binary => 1,
quote_space => 0,
});
my %by_genus;
while ( my $row = $csv->getline(\*ARGV) ) {
my ($genus) = split(' ', $row->[3]);
$by_genus{$genus} = $row
if !$by_genus{$genus}
|| $row->[5] > $by_genus{$genus}[5];
}
$csv->say(select(), $_) for values(%by_genus);
Properly naming the variables makes the code more readable:
#! /usr/bin/perl
use warnings;
use strict;
my %selected;
while (<>) {
my ($species, $value) = (split /,/)[3, 5];
my $genus = (split ' ', $species)[0];
if ($value > ($selected{$genus}{max} || 0)) {
$selected{$genus}{max} = $value;
$selected{$genus}{line} = $_;
}
}
for my $genus (keys %selected) {
print $selected{$genus}{line};
}
The order of the output lines is random.
You can this Perl command line as well
perl -F, -lane ' ($g=$F[3])=~s/(^\S+).*/$1/; if( $mx{$g}<$F[-1])
{ $kv{$g}=$_;$mx{$g}=$F[-1] } END { print $kv{$_} for(keys %kv) } ' file
with the given inputs in cara.txt file, the output is
$ perl -F, -lane ' ($g=$F[3])=~s/(^\S+).*/$1/; if( $mx{$g}<$F[-1])
{ $kv{$g}=$_;$mx{$g}=$F[-1] } END { print $kv{$_} for(keys %kv) } ' cara.txt
AB179736.1.1725,AB179736.1.1725,278986,Pterocorys zancleus,0,1763
AB179735.1.1711,AB179735.1.1711,278983,Eucyrtidium hexagonatum,0,1600
AB120309.1.1800,AB120309.1.1800,4442,Camellia sinensis,0,1700
AB120002.1.1725,AB120002.1.1725,244961,Gyrodinium fusiforme,0,1800
AB181890.1.1709,AB181890.1.1709,281610,Protoperidinium denticulatum,0,1800
$
Not fancy but gets the job done
#!/usr/bin/perl
use strict;
my #data = `cat /var/tmp/test.in`;
my %genuses = ();
foreach my $line ( #data ) {
chomp($line);
my #splitline = split(',', $line);
my $genus = $splitline[3];
my $num = $splitline[5];
my ( $name, $extra ) = split(' ', $genus);
if ( exists $genuses{$name}->{'num'} ) {
if ( $genuses{$name}->{'num'} < $num ) {
$genuses{$name}->{'num'} = $num;
$genuses{$name}->{'line'} = $line;
}
else {
next;
}
}
else {
$genuses{$name}->{'num'} = $num;
$genuses{$name}->{'line'} = $line;
}
}
foreach my $genus ( %genuses ) {
print "$genuses{$genus}->{'line'}";
print "\n";
}
Output:
[root#localhost tmp]# ./test.pl
AB179736.1.1725,AB179736.1.1725,278986,Pterocorys zancleus,0,1763
AB179735.1.1711,AB179735.1.1711,278983,Eucyrtidium hexagonatum,0,1600
AB120309.1.1800,AB120309.1.1800,4442,Camellia sinensis,0,1700
AB120002.1.1725,AB120002.1.1725,244961,Gyrodinium fusiforme,0,1800
AB181890.1.1709,AB181890.1.1709,281610,Protoperidinium denticulatum,0,1800
Don't see an obvious method that you are sorting your output by
I'm trying to save error codes by:
#global space
my #retCodes;
#main
sub BuildInit {
my $actionStr = "";
my $compStr = "";
my #component_dirs;
my #compToBeBuilt;
foreach my $comp (#compList) {
#component_dirs = GetDirs($comp); #populates #component_dirs
}
print "Printing Action List: #actionList\n";
#---------------------------------------
#---- Setup Worker Threads ----------
for ( 1 .. NUM_WORKERS ) {
async {
while ( defined( my $job = $q->dequeue() ) ) {
worker($job);
}
};
}
#-----------------------------------
#---- Enqueue The Work ----------
for my $action (#actionList) {
my $sem = Thread::Semaphore->new(0);
$q->enqueue( [ $_, $action, $sem ] ) for #component_dirs;
$sem->down( scalar #component_dirs );
print "\n------>> Waiting for prior actions to finish up... <<------\n";
}
# Nothing more to do - notify the Queue that we're not adding anything else
$q->end();
$_->join() for threads->list();
return 0;
}
#worker
sub worker {
my ($job) = #_;
my ( $component, $action, $sem ) = #$job;
Build( $component, $action );
$sem->up();
}
#builder method
sub Build {
my ( $comp, $action ) = #_;
my $cmd = "$MAKE $MAKE_INVOCATION_PATH/$comp ";
my $retCode = -1;
given ($action) {
when ("depend") { $cmd .= "$action >nul 2>&1" } #suppress output
when ("clean") { $cmd .= $action }
when ("build") { $cmd .= 'l1' }
when ("link") { $cmd .= '' } #add nothing; default is to link
default { die "Action: $action is unknown to me." }
}
print "\n\t\t*** Performing Action: \'$cmd\' on $comp ***" if $verbose;
if ( $action eq "link" ) {
# hack around potential race conditions -- will only be an issue during linking
my $tries = 1;
until ( $retCode == 0 or $tries == 0 ) {
last if ( $retCode = system($cmd) ) == 2; #compile error; stop trying
$tries--;
}
}
else {
$retCode = system($cmd);
}
push( #retCodes, ( $retCode >> 8 ) );
#testing
if ( $retCode != 0 ) {
print "\n\t\t*** ERROR IN $comp: $# !! ***\n";
print "\t\t*** Action: $cmd -->> Error Level: " . ( $retCode >> 8 ) . "\n";
#exit(-1);
}
return $retCode;
}
Error that gets displayed:
Use of uninitialized value $maxReturnCode in concatenation (.) or
string at C:\script.pl line 66, line 415.
I can see from the first line of output though, that I get things like: Return Code: 0 Return Code: 0 Return Code: 2 ..
The issue here is that the code isn't sharing the array between threads; so because of that, each thread is modifying it's local copy of the array, not the global array as expected. The fix for this problem is to share the variable, and lock it before accessing it during the thread processing:
my #retCodes;
share(#retCodes);
...
#during the thread sub
lock(#retCodes);
push(#retCodes, ($retCode>>8));
Here's a stubbed-out runnable version that you should be able to modify a bit to do what you need:
#!/usr/bin/perl
use strict;
use warnings;
use List::Util 'max';
use threads;
#global space
my #retCodes = ();
share(#retCodes);
sub builder {
my ($comp, $cmd) = ('builder', 'test');
for my $retCode (qw/0 0 256/) {
print "\n\t\tReturn Code: " . ($retCode >>8) . "\n";
lock(#retCodes);
push(#retCodes, ($retCode>>8));
}
}
#main
builder();
# other threads started...
# wait for threads to complete...
printf "Codes: %s\n", join(', ', #retCodes);
my $maxReturnCode = max(#retCodes);
print "Highest Error Code: $maxReturnCode\n"; #<-- crashes with error below
exit($maxReturnCode);
I have this perl script:
my %perMpPerMercHash;
foreach my $sheet () { #proper ranges specified
foreach my $row ( ) { #proper ranges specified
#required variables declared.
push(#{$perMpPerMercHash{join("-", $mercId, $mpId)}}, $mSku);
}
}
#Finally 'perMpPerMercHash' will be a hash of array`
foreach my $perMpPerMerc ( keys %perMpPerMercHash ) {
&genFile($perMpPerMerc, $perMpPerMercHash{$perMpPerMerc});
}
sub genFile {
my ( $outFileName, #skuArr ) = #_;
my $output = new IO::File(">$outFileName");
my $writer = new XML::Writer( OUTPUT => $output, DATA_MODE => 1, DATA_INDENT => 2);
#mpId is generated.
&prepareMessage($writer, $mpId, #skuArr);
}
sub prepareMessage {
my ( $writer, $mpId, #skuArr ) = #_;
my $count = 1;
print Dumper \#skuArr; #Printing correctly, 8-10 values.
foreach my $sku ( #skuArr ) { #not iterating.
print "loop run" , $sku, "\n"; #printed only once.
}
}
Can somebody please help why this is happening. I am new to perl and could not understand this anomaly.
EDIT:
output of Dumper:
$VAR1 = [
'A',
'B',
'C',
];
When you do
&genFile($perMpPerMerc, $perMpPerMercHash{$perMpPerMerc});
You're passing a reference to an array.
So in
sub genFile {
my ( $outFileName, #skuArr ) = #_;
You have to do :
sub genFile {
my ( $outFileName, $skuArr ) = #_;
and then use #$skuArr.
Have a look at references
The modified genFile sub will be:
sub genFile {
my ( $outFileName, $skuArr ) = #_;
my $output = new IO::File(">$outFileName");
my $writer = new XML::Writer( OUTPUT => $output, DATA_MODE => 1, DATA_INDENT => 2);
#mpId is generated.
&prepareMessage($writer, $mpId, #$skuArr);
}
And the other sub don't need to be modified.
Or you can pass always skuArr by reference:
&genFile($perMpPerMerc, $perMpPerMercHash{$perMpPerMerc});
...
sub genFile {
my ( $outFileName, $skuArr ) = #_;
...
&prepareMessage($writer, $mpId, $skuArr);
}
sub prepareMessage {
my ( $writer, $mpId, $skuArr ) = #_;
my $count = 1;
print Dumper $skuArr;
foreach my $sku ( #$skuArr ) {
print "loop run" , $sku, "\n";
}
}
I get this on my website:
Notice: Undefined offset: 1 in ...
Here is the full code, bolded is the part it refers to i think, basically this scripts should grab list of my channels from ther DB and then display info from the twitch:
<?php
defined('_JEXEC') or die('Direct access to this location is not allowed.');
$userList = $params->get('userlist');
$usersArray = explode(',', $userList);
$userGrab = "http://api.justin.tv/api/stream/list.json?channel=";
$checkedOnline = array ();
foreach($usersArray as $i =>$value){
$userGrab .= ",";
$userGrab .= $value;
}
unset($value);
$json_file = file_get_contents($userGrab, 0, null, null);
$json_array = json_decode($json_file, true);
//used to be $viewer = $json_array[$i]['stream_count'];
**foreach($usersArray as $i =>$value){
$title = $json_array[$i]['channel']['channel_url'];
$array = explode('/', $title);
$member = end($array);
$name = $json_array[$i]['name'];
$game = $json_array[$i]['meta_game'];
$viewer = $json_array[$i]['channel_count'];
$topic = $json_array[$i]['title'];
onlinecheck($member, $viewer, $topic, $game);
$checkedOnline[] = signin($member);
}**
unset($value);
unset($i);
function onlinecheck($online, $viewers, $topic, $game)
{
if ($game == "Counter-Strike: Global Offensive")
{
$igra = "csgo12.jpg";
}
else{
$igra = "online.png";
}
if ($online != null)
{
echo ' <img src="./modules/mod_twitchlist/tmpl/'.$igra.'">';
echo ' <strong>'.$online.'</strong>';
echo ' (' .$viewers.') - ';
echo '<strong>'.$topic.'</strong> </br>';
}
}
function signin($person){
if($person != null){
return $person;
}
else{
return null;
}
}
?>
<!-- <hr> -->
<?php
foreach ($usersArray as $i => $value1) {
foreach($checkedOnline as $ii => $value2){
if($value1 == $value2){
unset($usersArray[$i]);
}
}
}
$broj1 = count($usersArray); //neaktivni korisnici
$broj2 = count($checkedOnline); //ukupno streamova
if ($broj1 == $broj2){
echo '<strong><center>Nijedan stream nije aktivan trenutno!</center></strong>';
}
?>
Any hints?