Perl and Foreach loop with Splitting - arrays

I have a Problem about the Foreach Loop and Splitting with Perl.
I want to loop the Arrays and split it to name and value.
1.) I read the File and save to an String.
2.) I split the empty line and save to an Array.
My Script:
#!/usr/bin/perl
use strict;
my $pathconfigfile = 'config.conf';
my #configline;
open(my $configfile, "<", $pathconfigfile);
local $/;
my #configdata = split("\n\n",<$configfile>);
#print $configdata[0], "\n";
#print $configdata[1], "\n";
#print $configdata[2], "\n";
foreach my $data (#configdata){
my #editing = split /#/, $data;
my ($name, $value) = #editing[0,1];
print $name "\n";
print $value "\n";
}
close $configfile;
Configfile:
Testingtttttttttttttttttttttttt
############################################
0987654345678909876MN09uz6t56789oiuhgölkjhgfr
0987654323456789098765fgnloiuztlkjhgfrtzuiknb
MegaMixoiuzt
############################################
09876543457890098765NSUDlkjhzgtfr67899ztz9098
098765435678987t87656789876567898765679097658
TESTINGPARTS
############################################
0987654567890098765hzzasza654567uhgdjdjfacdaa
9876545678987654mchfuiaq754567898765434567876
My wish result:
$name = Testingtttttttttttttttttttttttt
$value = 0987654345678909876MN09uz6t56789oiuhgölkjhgfr
0987654323456789098765fgnloiuztlkjhgfrtzuiknb
$name = MegaMixoiuzt
$value = 09876543457890098765NSUDlkjhzgtfr67899ztz9098
098765435678987t8765678987656789876567909765
$name = TESTINGPARTS
$value = 0987654567890098765hzzasza654567uhgdjdjfacdaa
9876545678987654mchfuiaq754567898765434567876

split /#/ # A "#" separates the two.
should be
split /\n#+\n/ # A line of "#" separates the two.
With other improvements:
#!/usr/bin/perl
use strict;
use warnings;
my $config_qfn = 'config.conf';
open(my $config_fh, "<", $config_qfn )
or die("Can't open \"$config_qfn\": $!\n");
local $/ = ""; # Paragraph mode
while (my $rec = <$config_fh>) {
my ($name, $value) = split(/\n#+\n/, $rec);
print "\$name = $name\n";
print "\$value = $value\n";
}

Also something like this without foreach:
use strict;
use warnings;
open my $fh, '<config.conf' or die "$!"; my $data = join '', <$fh>; close $fh;
my %hash = $data =~ /^(.+)\n#+\n(\S+\n\S+)/mg;
print "NAME: $_\nVALUE: $hash{$_}\n\n" for keys %hash

You want to split by multiple # so use #+
+ match one or more times.
Try it
#!/usr/bin/perl
use strict;
my $pathconfigfile = 'config.conf';
my #configline;
open(my $configfile, "<", $pathconfigfile);
local $/;
my #configdata = split("\n\n",<$configfile>);
foreach my $data (#configdata){
my ($name,$value) = split /\n#+\n/, $data;
print "$name $value\n\n";
}

Related

How can I find the common items from two arrays fast?

I am trying to findy the common lines among two tab separated files based one field.
One line of the first file:
1 52854 s64199.1 A . . . PR GT 0/0
One line of the second file:
chr1 52854 . C T 215.302 . AB=0.692308;ABP=7.18621;AC=1;AF=0.5;AN=2;AO=9;CIGAR=1X;DP=13;DPB=13;DPRA=0;EPP=3.25157;EPPR=3.0103;GTI=0;LEN=1;MEANALT=1;MQM=60;MQMR=60;NS=1;NUMALT=1;ODDS=17.5429;PAIRED=0;PAIREDR=0.25;PAO=0;PQA=0;PQR=0;PRO=0;QA=318;QR=138;RO=4;RPP=3.25157;RPPR=5.18177;RUN=1;SAF=0;SAP=22.5536;SAR=9;SRF=1;SRP=5.18177;SRR=3;TYPE=snp;technology.illumina=1;BVAR GT:DP:RO:QR:AO:QA:GL 0/1:13:4:138:9:318:-5,0,-5
Based on the second field (52854) in this example I have a much.
Here is my code which finds the common ones, but my files are quite large and takes a lot of time.
Is there any way to speed up the process?
Thank you very much in advance.
#!/app/languages/perl/5.14.2/bin/perl
use strict;
use warnings;
my $map_file = $ARGV[0];
my $vcf_file = $ARGV[1];
open my $map_info, $map_file or die "Could not open $map_file: $!";
my #map_array = ();
my #vcf_array = ();
while( my $mline = <$map_info>) {
chomp $mline;
my #data1 = split('\t', $mline);
my $pos1 = $data1[1];
push (#map_array, $pos1);
}
open my $vcf_info, $vcf_file or die "Could not open $vcf_file: $!";
while( my $line = <$vcf_info>) {
if ($line !~ m/^#/) {
push (#vcf_array, $line);
}
}
foreach my $a (#map_array) {
chomp $a;
foreach my $b (#vcf_array) {
chomp $b;
my #data = split('\t', $b);
my $pos2 = $data[1];
my $ref2 = $data[3];
my $allele = $data[4];
my $genotype = $data[9];
if ($a == $pos2) {
print $pos2 . "\t" . $ref2. "\t".$allele."\t".$genotype. "\n";
#print "$b\n";
}
}
}
Here's a version that should run much faster than your own
It reads the map file and stores each pos field in hash %wanted. Then it reads through the second file and checks whether the record is in the list of wanted values. If so then it splits the record and prints the fields you require
Note that I haven't been able to test this beyond making sure that it compiles
use strict;
use warnings;
use 5.010;
use autodie;
my ( $map_file, $vcf_file ) = #ARGV;
my %wanted;
{
open my $map_fh, '<', $map_file;
while ( <$map_fh> ) {
chomp;
my $pos = ( split /\t/, $_, 3 )[1];
++$wanted{$pos};
}
}
{
open my $vcf_fh, '<', $vcf_file;
while ( <$vcf_fh> ) {
next if /^#/;
chomp;
my $pos = ( split /\t/, $_, 3 )[1];
next unless $wanted{$pos};
my ( $ref, $allele, $genotype ) = ( split /\t/ )[3, 4, 9];
print join("\t", $pos, $ref, $allele, $genotype), "\n";
}
}
Below please find minimal modification of your script for hash based searches
use strict;
use warnings;
my $map_file = $ARGV[0];
my $vcf_file = $ARGV[1];
my %vcf_hash;
open( my $vcf_info, $vcf_file) or die "Could not open $vcf_file: $!";
while( my $line = <$vcf_info>) {
next if $line =~ m/^#/; # Skip comment lines
chomp $line;
my (#data) = split(/\t/, $line);
die unless #data >= 10; # Check number of fields in the input line
my ($pos) = $data[1];
# $. - line number in the file
$vcf_hash{$pos}{$.} = \#data;
}
open( my $map_info, $map_file) or die "Could not open $map_file: $!";
while( my $mline = <$map_info>) {
chomp $mline;
my (#data) = split(/\t/, $mline);
die unless #data >= 2; # Check number of fields in the input line
my ($pos) = $data[1];
if( exists $vcf_hash{$pos}) {
my $hash_ref = $vcf_hash{$pos};
for my $n (sort{$a<=>$b} keys %$hash_ref) {
my $array_ref = $hash_ref->{$n};
my $pos2 = $array_ref->[1];
my $ref2 = $array_ref->[3];
my $allele = $array_ref->[4];
my $genotype = $array_ref->[9];
print $pos2 . "\t" . $ref2. "\t".$allele."\t".$genotype. "\n";
}
}
}
The script may be improved further to reduce memory use if you use huge data files.
There is not a need to keep your map_file in memory but just keys. It is good to make them keys in a hash which you use for existence checking. You don't have to keep your vcf_file in memory as well, but you can just make a decision to output or not.
#!/app/languages/perl/5.14.2/bin/perl
use strict;
use warnings;
use autodie;
use constant KEY => 1;
use constant FIELDS => ( 1, 3, 4, 9 );
my ( $map_file, $vcf_file ) = #ARGV;
my %map;
{
my $fh;
open $fh, '<', $map_file;
while (<$fh>) {
$map{ ( split /\t/, $_, KEY + 2 )[KEY] } = undef;
}
}
{
my $fh;
open $fh, '<', $vcf_file;
while (<$fh>) {
next if /^#/;
chomp;
my #data = split /\t/;
print join "\t", #data[FIELDS] if exists $map{ $data[KEY] };
}
}

To remove duplicate elements from an array in Perl

I have a data set
10-101570715-101609901-hsa-mir-3158-1 10-101600739-101609661-ENSG00000166171 10-101588288-101609668-ENSG00000166171 10-101588325-101609447-ENSG00000166171 10-101594702-101609439-ENSG00000166171 10-101570560-101596651-ENSG00000166171
10-103389007-103396515-hsa-mir-1307 10-103389041-103396023-ENSG00000173915 10-103389050-103396074-ENSG00000173915 10-103389050-103396441-ENSG00000173915 10-103389050-103396466-ENSG00000173915 10-103389050-103396466-ENSG00000173915
Except for the first element in each line, I have multiple values, which are redundant and I want to remove the redundant values. I have written a code but I don't feel its working fine.
open (fh, "file1");
while ($line=<fh>)
{
chomp ($line);
#array=$line;
my #unique = ();
my %Seen = ();
foreach my $elem ( #array )
{
next if $Seen{ $elem }++;
push #unique, $elem;
}
print #unique;
}
a hash is for duplicate detection :
my %seen;
my #removeduplicate = grep { !$seen{$_}++ } #array;
For me below code is working fine :
use strict;
use warnings;
my %seen;
open my $fh, "<", 'file.txt' or die "couldn't open : $!";
while ( my $line = <$fh>)
{
chomp $line;
my #array = split (' ', $line);
my #removeduplicate = grep { !$seen{$_}++ } #array;
print "#removeduplicate\n";
}

Compare two hashes in perl and list which records are extra?

I have two text files that contain user records. I have to compare these two files and figure out which users are missing from File1. And delete these Orphans from file2.
#!/usr/local/bin/perl -w
use strict;
use warnings;
use autodie;
use Text::Diff;
use List::Compare;
use Data::Dumper;
my $Users1 = "Users1.txt";
my $Users2 ="Users2.txt";
my %hash1;
my %hash2;
my %new_hash;
my #sorted_1;
my #sorted_2;
my #list_keys1;
my #list_keys2;
open(my $fh1, '<:encoding(UTF-8)', $Users1) or die "Colud not open the file!";
while(my $record1 = <$fh1>)
{
chomp $record1;
my #list1 = split( '/', $record1);
foreach my $item(#list1)
{
$new_hash{$list1[1]} = $list1[0];
$hash1{$list1[1]} = $list1[0];
}
while ( my ($key, $value) = each(%hash1) ) {
push (#list_keys1, $key);
#sorted_1 = sort #list_keys1;
}
}
print "\t\tHash values for USERS1:\n";
print Dumper \%hash1;
open(my $fh2, '<:encoding(UTF-8)', $Users2) or die "Colud not open the file!";
while(my $record2 = <$fh2>)
{
chomp $record2;
my #list2 = split( '/', $record2);
foreach my $item(#list2)
{
$hash2{$list2[1]} = $list2[0];
}
while ( my ($key, $value) = each(%hash2) )
{
push (#list_keys2, $key);
#sorted_2 = sort #list_keys2;
}
}
print "\n\n\t\tHash values for Users2:\n";
print Dumper \%hash2;
#hash1{#list_keys1} = 1;
#hash2{#list_keys2} = 1;
foreach(keys %hash2)
{
print "\nThis user does not exist(to be deleted): $_\n" unless exists $hash1{$_};
}
foreach (keys %hash1)
{
print "\nNew User (to be added):$_\n" unless exists $hash2{$_};
}
close ($fh1);
close ($fh2);
Questions:
I am not able to sort the user ID (String) alphabetically(here, USER IDs are random strings of length 7). Is there any limitations when it comes to sorting array/hashes in Perl?
I am not able to compare two hashes and get the differences. What would be the most efficient way to do that?
Are there any additional libraries that I need to install in order to handle this part of code?
Sample records from file:
File1:
ASIA/ASEDF46
INDIA/PSDfT5V
CHINA/FSDfT5V
INDIA/AA44TYB
USA/BBRTT67
File 2:
INDIA/PSDfT5V
CHINA/FSDfT5V
INDIA/AA44TYB
USA/BBRTT67
UK/ZK9EELO
use strict;
use warnings;
use autodie;
open my $in, '<', 'in.txt';
open my $in2, '<', 'in_2.txt';
my (%data1, %data2);
while(<$in>){
chomp;
my #split = split/\//;
$data1{$split[0]} = $split[1];
}
while(<$in2>){
chomp;
my #split = split/\//;
$data2{$split[0]} = $split[1];
}
foreach(sort keys %data1){
print "User: $_ Value: $data1{$_}\n" if $data2{$_};
}

Perl Hashes of Arrays and Some issues

I currently have a csv file that looks like this:
a,b
a,d
a,f
c,h
c,d
So I saved these into a hash such that the key "a" is an array with "b,d,f" and the key "c" is an array with "h,d"... this is what I used for that:
while(<$fh>)
{
chomp;
my #row = split /,/;
my $cat = shift #row;
$category = $cat if (!($cat eq $category)) ;
push #{$hash{$category}}, #row;
}
close($fh);
Not sure about the efficiency but it seems to work when I do a Data Dump...
Now, the issue I'm having is this; I want to create a new file for each key, and in each of those files I want to print every element in the key, as such:
file "a" would look like this:
b
d
f
<end of file>
Any ideas? Everything I've tried isn't working, I'm not too familiar / experienced with hashes...
Thanks in advance :)
The output process is very simple using the each iterator, which provides the key and value pair for the next hash element in a single call
use strict;
use warnings;
use autodie;
open my $fh, '<', 'myfile.csv';
my %data;
while (<$fh>) {
chomp;
my ($cat, $val) = split /,/;
push #{ $data{$cat} }, $val;
}
while (my ($cat, $values) = each %data) {
open my $out_fh, '>', $cat;
print $out_fh "$_\n" for #$values;
}
#!/usr/bin/perl
use strict;
use warnings;
my %foos_by_cat;
{
open(my $fh_in, '<', ...) or die $!;
while (<$fh_in>) {
chomp;
my ($cat, $foo) = split /,/;
push #{ $foos_by_cat{$cat} }, $foo;
}
}
for my $cat (keys %foos_by_cat) {
open(my $fh_out, '>', $cat) or die $!;
for my $foo (#{ $foos_by_cat{$cat} }) {
print($fh_out "$foo\n");
}
}
I wrote the inner loop as I did to show the symmetry between reading and writing, but it can also be written as follows:
print($fh_out "$_\n") for #{ $foos_by_cat{$cat} };

Trimming an array (filled via DBI)

I'm trying to read data from SQL Server database using Perl and the DBI module. My intention is to read the data and print it into a text file (comma separated). When I do this, I get the result like this:
var1,var2,var3
40406,20 ,783
50230,78 ,680
50230,78 ,680
50230,78 ,680
50230,78 ,680
So there is a whitespace between the second variable data and the comma. I tried to trim this using the code below, but it did not work. How should I modify my code to get rid of those whitespaces?
My code is here:
#!/bin/perl
use warnings;
use strict;
use DBI;
sub trim;
my $dbs = "dbi:ODBC:DRIVER={SQL Server};SERVER={xxxx}";
my ($username, $password) = ('un', 'pwd');
my $dbh = DBI->connect($dbs, $username, $password)
or die "Can't connect to $dbs: $DBI::errstr";
my $sth = $dbh->prepare("select var1, var2, var3 from db.dbo.table")
or die "Can't prepare statement: $DBI::errstr";
$sth->execute();
my $outfile = 'temp.txt';
open OUTFILE, '>', $outfile or die "Unable to open $outfile: $!";
print OUTFILE join(",", #{$sth->{NAME}}), "\n";
while (my #re = $sth->fetchrow_array) {
print OUTFILE join(",", trim(#re)), "\n";
}
close OUTFILE;
$sth->finish();
$dbh->disconnect();
############## subroutines ##################
sub trim($) {
my $string = shift;
$string =~ s/^\s+//;
$string =~ s/\s+$//;
return $string;
}
Your trim() function does not modify the list in place (nor it handles a list).
So, in real TIMTOWTDI fashion, you should either modify the function to return a new array:
sub trimArray {
my #arr = #_;
my #rv;
for my $val (#arr) {
$val =~ s/^\s+//;
$val =~ s/\s+$//;
push #rv, $val;
}
return #rv;
}
#and then
print OUTFILE join(",", trimArray(#re)), "\n";
or pass a reference to your function and then modify the array in place
sub trimInPlace {
my $arrRef = shift;
for my $val (#$arrRef) {
$val =~ s/^\s+//;
$val =~ s/\s+$//;
}
}
#and then
trimInPlace(\#re); #Note the \
print OUTFILE join(",", #re), "\n";
or use map
#!/bin/perl
use warnings;
use strict;
use DBI;
#... the same
while (my #re = $sth->fetchrow_array) {
print OUTFILE join(",", map { trim($_); } #re), "\n"; #Applies
#trim() to each element
}
#...
############## subroutines ##################
sub trim { #Don't use prototypes
my $string = shift;
$string =~ s/^\s+//;
$string =~ s/\s+$//;
return $string;
}
or try using chomp, by modifying $/, which will only remove a trailing space, nothing more.
#!/bin/perl
use warnings;
use strict;
use DBI;
#... the same
my $old_sep = $/;
$/ = " ";
while (my #re = $sth->fetchrow_array) {
chomp(#re); #Modifies in place, returning number of changes
print OUTFILE join(",", #re), "\n";
}
$/ = $old_sep;
You could also check to see if DBD::ODBC supports the ChopBlanks attribute:
my $dbh = DBI->connect($dbs, $username, $password, { ChopBlanks => 1 } )
the ChopBlanks attribute trims the trailing whitespace of any CHAR fields (that is if your driver supports it ... I'm not sure if DBD::ODBC does).
Why does that field have trailing whitespace? Usually that points to some sort of problem with the database model. Besides your trim() function, you might investigate why the data is dirty.

Resources