Perl array overriding - arrays

I am trying to do sum of the array in my perl code but I was not able to get the right output.
Here is my sample code
use File::Find::Rule;
use Date::Parse;
my ($dir, $type, $fh, $line, $str_1,
$str_2,$str_3, $str_4);
my #array;
$dir = '/dir/test/';
$type = '*';
$str_1 = 'somestr1';
$str_2 = 'somestr2';
$str_3 = 'somestr3';
$str_4 = 'somestr4';
my #files = File::Find::Rule->file()->name($type)->in($dir);
open $out, '>>', "output_log" or die "Unable to open 'output_log' : $!";
print $out "\Logs \n";
print $out "--------------------------\n";
close $out or die "Unable to finish writing output_log : $!";
for my $file (#files) {
open $fh, '<', $file or die "can't open $file: $!";
open $out, '>>', "output_log" or die "Unable to open 'output_log' : $!";
while ( $line = <$fh> ) {
if ( $line !~ /$str_1/ && $line =~ /$str_2/ )
{
#array = $somevar # result of this loop 2,3
}
if ( $line !~ /$str_3/ && $line =~ /$str_4/ )
{
#array = $somevar #result of this loop 2,3,4,5,6
}
}
close $out or die "Unable to finish writing output_log : $!";
}
So Here is what I want
#array = (2,3,2,3,4,5,6)
and sum of #array
but unfornately,if i print this array that is running each and every line but instead i want to write a loop which stores the result in array from both if blocks..
now the code is overriding the #array in the second if block. Hope I made this clear!! Please help

#array = ... overwrites the contents of the array. Use push to add elements to an existing array.
For sum, see sum (or sum0) in List::Util.

Related

Convert string into array perl

I have a script which takes headers of a multi-fasta file and pushes them into an array. Then I want to loop through this array to find a specific pattern and perform some commands.
open(FH, '<', $ref_seq) or die $!;
while(<FH>){
$line = $_;
chomp $line;
if(m/^>([^\s]+)/){
$ref_header = $1;
print "$ref_header\n";
chomp $header;
if($1 eq $header){
$ref_header = $header;
#print "header is $ref_header\n";
}
}
}
This code prints headers like
chr1
chr2
chr3
How can I push these headers into an array?
I tried following code, but it splits individual letters, instead of $header_array[0] being chr1
#header_array = split(/\n*/, $ref_header);
print ("Here's the first element $header_array[0]");
Any help will be appreciated.
Shorten the code as shown below, removing some extra statements, and use push. You can combine push and the pattern match:
#!/usr/bin/env perl
use strict;
use warnings;
use Carp;
my $in_file = shift;
my #headers;
open my $in_fh, '<', $in_file or croak "cannot open $in_file: $!";
while ( <$in_fh> ) {
push #headers, />(\S+)/;
}
close $in_fh or croak "cannot close $in_file: $!";
print "#headers";
# Now, loop through headers and select the ones you need, for example:
for my $header ( #headers ) {
if ( $header =~ /foo/ ) {
# do something
}
}
A few suggestion on fixing your original code are below:
# Always use strict and use warnings.
# Remove extra parens and make the error message more informative:
open(FH, '<', $ref_seq) or die $!;
while(<FH>){
$line = $_;
chomp $line;
# [^\s] is simply \S:
if(m/^>([^\s]+)/){
$ref_header = $1;
print "$ref_header\n";
# where is $header coming from?
chomp $header;
# if the condition is satisfied, this assignment does not make sense:
# $ref_header is already the same as $header:
if($1 eq $header){
$ref_header = $header;
#print "header is $ref_header\n";
}
}
}
You can use push:
push #header_array, $ref_header;

How can I find the common items from two arrays fast?

I am trying to findy the common lines among two tab separated files based one field.
One line of the first file:
1 52854 s64199.1 A . . . PR GT 0/0
One line of the second file:
chr1 52854 . C T 215.302 . AB=0.692308;ABP=7.18621;AC=1;AF=0.5;AN=2;AO=9;CIGAR=1X;DP=13;DPB=13;DPRA=0;EPP=3.25157;EPPR=3.0103;GTI=0;LEN=1;MEANALT=1;MQM=60;MQMR=60;NS=1;NUMALT=1;ODDS=17.5429;PAIRED=0;PAIREDR=0.25;PAO=0;PQA=0;PQR=0;PRO=0;QA=318;QR=138;RO=4;RPP=3.25157;RPPR=5.18177;RUN=1;SAF=0;SAP=22.5536;SAR=9;SRF=1;SRP=5.18177;SRR=3;TYPE=snp;technology.illumina=1;BVAR GT:DP:RO:QR:AO:QA:GL 0/1:13:4:138:9:318:-5,0,-5
Based on the second field (52854) in this example I have a much.
Here is my code which finds the common ones, but my files are quite large and takes a lot of time.
Is there any way to speed up the process?
Thank you very much in advance.
#!/app/languages/perl/5.14.2/bin/perl
use strict;
use warnings;
my $map_file = $ARGV[0];
my $vcf_file = $ARGV[1];
open my $map_info, $map_file or die "Could not open $map_file: $!";
my #map_array = ();
my #vcf_array = ();
while( my $mline = <$map_info>) {
chomp $mline;
my #data1 = split('\t', $mline);
my $pos1 = $data1[1];
push (#map_array, $pos1);
}
open my $vcf_info, $vcf_file or die "Could not open $vcf_file: $!";
while( my $line = <$vcf_info>) {
if ($line !~ m/^#/) {
push (#vcf_array, $line);
}
}
foreach my $a (#map_array) {
chomp $a;
foreach my $b (#vcf_array) {
chomp $b;
my #data = split('\t', $b);
my $pos2 = $data[1];
my $ref2 = $data[3];
my $allele = $data[4];
my $genotype = $data[9];
if ($a == $pos2) {
print $pos2 . "\t" . $ref2. "\t".$allele."\t".$genotype. "\n";
#print "$b\n";
}
}
}
Here's a version that should run much faster than your own
It reads the map file and stores each pos field in hash %wanted. Then it reads through the second file and checks whether the record is in the list of wanted values. If so then it splits the record and prints the fields you require
Note that I haven't been able to test this beyond making sure that it compiles
use strict;
use warnings;
use 5.010;
use autodie;
my ( $map_file, $vcf_file ) = #ARGV;
my %wanted;
{
open my $map_fh, '<', $map_file;
while ( <$map_fh> ) {
chomp;
my $pos = ( split /\t/, $_, 3 )[1];
++$wanted{$pos};
}
}
{
open my $vcf_fh, '<', $vcf_file;
while ( <$vcf_fh> ) {
next if /^#/;
chomp;
my $pos = ( split /\t/, $_, 3 )[1];
next unless $wanted{$pos};
my ( $ref, $allele, $genotype ) = ( split /\t/ )[3, 4, 9];
print join("\t", $pos, $ref, $allele, $genotype), "\n";
}
}
Below please find minimal modification of your script for hash based searches
use strict;
use warnings;
my $map_file = $ARGV[0];
my $vcf_file = $ARGV[1];
my %vcf_hash;
open( my $vcf_info, $vcf_file) or die "Could not open $vcf_file: $!";
while( my $line = <$vcf_info>) {
next if $line =~ m/^#/; # Skip comment lines
chomp $line;
my (#data) = split(/\t/, $line);
die unless #data >= 10; # Check number of fields in the input line
my ($pos) = $data[1];
# $. - line number in the file
$vcf_hash{$pos}{$.} = \#data;
}
open( my $map_info, $map_file) or die "Could not open $map_file: $!";
while( my $mline = <$map_info>) {
chomp $mline;
my (#data) = split(/\t/, $mline);
die unless #data >= 2; # Check number of fields in the input line
my ($pos) = $data[1];
if( exists $vcf_hash{$pos}) {
my $hash_ref = $vcf_hash{$pos};
for my $n (sort{$a<=>$b} keys %$hash_ref) {
my $array_ref = $hash_ref->{$n};
my $pos2 = $array_ref->[1];
my $ref2 = $array_ref->[3];
my $allele = $array_ref->[4];
my $genotype = $array_ref->[9];
print $pos2 . "\t" . $ref2. "\t".$allele."\t".$genotype. "\n";
}
}
}
The script may be improved further to reduce memory use if you use huge data files.
There is not a need to keep your map_file in memory but just keys. It is good to make them keys in a hash which you use for existence checking. You don't have to keep your vcf_file in memory as well, but you can just make a decision to output or not.
#!/app/languages/perl/5.14.2/bin/perl
use strict;
use warnings;
use autodie;
use constant KEY => 1;
use constant FIELDS => ( 1, 3, 4, 9 );
my ( $map_file, $vcf_file ) = #ARGV;
my %map;
{
my $fh;
open $fh, '<', $map_file;
while (<$fh>) {
$map{ ( split /\t/, $_, KEY + 2 )[KEY] } = undef;
}
}
{
my $fh;
open $fh, '<', $vcf_file;
while (<$fh>) {
next if /^#/;
chomp;
my #data = split /\t/;
print join "\t", #data[FIELDS] if exists $map{ $data[KEY] };
}
}

To remove duplicate elements from an array in Perl

I have a data set
10-101570715-101609901-hsa-mir-3158-1 10-101600739-101609661-ENSG00000166171 10-101588288-101609668-ENSG00000166171 10-101588325-101609447-ENSG00000166171 10-101594702-101609439-ENSG00000166171 10-101570560-101596651-ENSG00000166171
10-103389007-103396515-hsa-mir-1307 10-103389041-103396023-ENSG00000173915 10-103389050-103396074-ENSG00000173915 10-103389050-103396441-ENSG00000173915 10-103389050-103396466-ENSG00000173915 10-103389050-103396466-ENSG00000173915
Except for the first element in each line, I have multiple values, which are redundant and I want to remove the redundant values. I have written a code but I don't feel its working fine.
open (fh, "file1");
while ($line=<fh>)
{
chomp ($line);
#array=$line;
my #unique = ();
my %Seen = ();
foreach my $elem ( #array )
{
next if $Seen{ $elem }++;
push #unique, $elem;
}
print #unique;
}
a hash is for duplicate detection :
my %seen;
my #removeduplicate = grep { !$seen{$_}++ } #array;
For me below code is working fine :
use strict;
use warnings;
my %seen;
open my $fh, "<", 'file.txt' or die "couldn't open : $!";
while ( my $line = <$fh>)
{
chomp $line;
my #array = split (' ', $line);
my #removeduplicate = grep { !$seen{$_}++ } #array;
print "#removeduplicate\n";
}

print specific word staring with in text and count

I like to find word start with sid=word and sid=text and print and count it the same word.
sid=word 2
sid=text 5
I have try make some script
use warnings;
use strict;
my $input = 'input.txt';
my $output = 'output.txt';
open (FILE, "<", $input) or die "Can not open $input $!";
open my $out, '>', $output or die "Can not open $output $!";
while (<FILE>){
foreach my #arr = /(?: ^|\s )(sid=\S*) {
$count{$arr}++;
}
}
foreach my #arr (sort keys %count){
printf "%-31s %s\n", $str, $count{$arr};
}
but show error missing $ on loop variable
anyone can help me out what i miss.
thanks.
This should produce desired output to output.txt, with words in order of appearance
use warnings;
use strict;
my $input = 'input.txt';
my $output = 'output.txt';
open (my $FILE, "<", $input) or die "Can not open $input $!";
open (my $out, ">", $output) or die "Can not open $output $!";
my (%count, #arr);
while (<$FILE>){
if ( /(?: ^|\s )(sid=\S*)/x ) {
push #arr, $1 if !$count{$1};
$count{$1}++;
}
}
foreach my $str (#arr) {
print $out sprintf("%-31s %s\n", $str, $count{$str});
}

Create CSV file from 2d array perl

I am trying to load a csv file, transpose it and write a new one. I have everything working correctly except writing a new file. I have looked around online without success.
use strict;
use warnings;
use Text::CSV;
use Data::Dump qw(dump);
use Array::Transpose;
my #data; # 2D array for CSV data
my $file = 'sample_array.csv';
my $csv = Text::CSV->new;
open my $fh, '<', $file or die "Could not open $file: $!";
while( my $row = $csv->getline( $fh ) ) {
shift #$row; # throw away first value
push #data, $row;
}
#data=transpose(\#data);
dump(#data);
The output here is the transposed array #data (["blah", 23, 22, 43], ["tk1", 1, 11, 15],["huh", 5, 55, 55]). I need that output to be written to a new CSV file.
CSV file:
text,blah,tkl,huh
14,23,1,5
12,22,11,55
23,42,15,55
Refer to the code after dump. This was derived from the Text::CSV SYNOPSIS:
use strict;
use warnings;
use Text::CSV;
use Data::Dump qw(dump);
use Array::Transpose;
my #data; # 2D array for CSV data
my $file = 'sample_array.csv';
my $csv = Text::CSV->new;
open my $fh, '<', $file or die "Could not open $file: $!";
while( my $row = $csv->getline( $fh ) ) {
shift #$row; # throw away first value
push #data, $row;
}
#data=transpose(\#data);
dump(#data);
open $fh, ">:encoding(utf8)", "new.csv" or die "new.csv: $!";
for (#data) {
$csv->print($fh, $_);
print $fh "\n";
}
close $fh or die "new.csv: $!";
Along with Toolic's addition I had to make some edits due to the specific type of data I was dealing with. This was an extremely large set with engineering symbols & units and negative numbers with long decimals. For reference, my final code is below.
use strict;
use warnings;
use Text::CSV;
use Data::Dump qw(dump);
use Array::Transpose;
my #data; # 2D array for CSV data
my $file = 'rawdata.csv';
my $csv = Text::CSV->new({ binary => 1, quote_null => 0 });
open my $fh, '<', $file or die "Could not open $file: $!";
while( my $row = $csv->getline( $fh ) ) {
#shift #$row; # throw away first value, I needed the first values.
push #data, $row;
}
#data=transpose(\#data);
open $fh, ">:encoding(utf8)", "rawdata_trans.csv" or die "rawdata_trans.csv: $!";
for (#data) {
$csv->print($fh, $_);
print $fh "\n";
}
close $fh or die "rawdata_trans.csv: $!";

Resources