I need help figuring out how to accomodate situations in which $hash {$i} is loaded with #headers array of different sizes.
use strict;
use warnings;
my $file = "list.csv";
open (FILE,"$file") || die "Can't open file: $!\n";
my (#lines) = <FILE>;
close(FILE);
my #headers = split(',',$lines[0]);#split up header line
my %hash;
for (my $i=1; $i < scalar(#lines); $i++)
{
my #strings = split(',',$lines[$i];
# NEED help here
$hash{$i} = {
$headers[0] => $strings[0],
$headers[1] => $strings[0],
$headers[2] => $strings[0],
$headers[3] => $strings[0],
$headers[4] => $strings[0],
$headers[5] => $strings[0]
};
}
Is there a way to load up hash at index for in situations when scalar(#headers)=5,6,7 ... etc? Is there a programatic equivalent for something like...
$hash{$i} = {
$headers[0] => $strings[0],
...
$headers[n] => $strings[n]
};
or
$hash{$i} = {#headers => #strings);
The idiom you want is:
#{ $hash{$i} }{ #headers } = #strings;
This is known as slicing.
Given that you're reading CSV data you might look at some of the CPAN modules out there for this sort of thing, Text::CSV, for example.
TIMTOWTDI
#!/usr/bin/perl
use strict;
use warnings;
my $file = "list.csv";
# Use lexical filehandles, not globals; use 3-arg open; don't quote filename
open ( my $fh, '<', $file ) or die "Can't open file: $!\n";
my( #lines ) = <$fh>;
close( $fh );
# split takes a regex; also, notice the shift
my #headers = split( /,/, shift #lines );
my %hash;
# Use perly for loops here
foreach my $i ( 0..$#lines )
# This works, too
#for my $i ( 0..$#lines )
{
# split takes a regex
my #strings = split( /,/, $lines[$i] );
# One way (probably best)
#{ $hash{$i} }{ #headers } = #strings;
# Another way
#$hash{$i} = { map { $headers[$_] => $strings[$_] } ( 0 .. $#strings ) };
# Another way
#$hash{$i}{ $headers[$_] } = $strings[$_] = for(0..$#strings);
}
#use Data::Dumper;
#print Dumper \%hash;
But yes, using Text::CSV (or the faster Text::CSV_XS) would be even better than trying to manually split the CSV yourself (what happens if there are spaces? what happens if the fields and/or headers are quoted? It's a solved problem.)
Related
I have a Problem about the Foreach Loop and Splitting with Perl.
I want to loop the Arrays and split it to name and value.
1.) I read the File and save to an String.
2.) I split the empty line and save to an Array.
My Script:
#!/usr/bin/perl
use strict;
my $pathconfigfile = 'config.conf';
my #configline;
open(my $configfile, "<", $pathconfigfile);
local $/;
my #configdata = split("\n\n",<$configfile>);
#print $configdata[0], "\n";
#print $configdata[1], "\n";
#print $configdata[2], "\n";
foreach my $data (#configdata){
my #editing = split /#/, $data;
my ($name, $value) = #editing[0,1];
print $name "\n";
print $value "\n";
}
close $configfile;
Configfile:
Testingtttttttttttttttttttttttt
############################################
0987654345678909876MN09uz6t56789oiuhgölkjhgfr
0987654323456789098765fgnloiuztlkjhgfrtzuiknb
MegaMixoiuzt
############################################
09876543457890098765NSUDlkjhzgtfr67899ztz9098
098765435678987t87656789876567898765679097658
TESTINGPARTS
############################################
0987654567890098765hzzasza654567uhgdjdjfacdaa
9876545678987654mchfuiaq754567898765434567876
My wish result:
$name = Testingtttttttttttttttttttttttt
$value = 0987654345678909876MN09uz6t56789oiuhgölkjhgfr
0987654323456789098765fgnloiuztlkjhgfrtzuiknb
$name = MegaMixoiuzt
$value = 09876543457890098765NSUDlkjhzgtfr67899ztz9098
098765435678987t8765678987656789876567909765
$name = TESTINGPARTS
$value = 0987654567890098765hzzasza654567uhgdjdjfacdaa
9876545678987654mchfuiaq754567898765434567876
split /#/ # A "#" separates the two.
should be
split /\n#+\n/ # A line of "#" separates the two.
With other improvements:
#!/usr/bin/perl
use strict;
use warnings;
my $config_qfn = 'config.conf';
open(my $config_fh, "<", $config_qfn )
or die("Can't open \"$config_qfn\": $!\n");
local $/ = ""; # Paragraph mode
while (my $rec = <$config_fh>) {
my ($name, $value) = split(/\n#+\n/, $rec);
print "\$name = $name\n";
print "\$value = $value\n";
}
Also something like this without foreach:
use strict;
use warnings;
open my $fh, '<config.conf' or die "$!"; my $data = join '', <$fh>; close $fh;
my %hash = $data =~ /^(.+)\n#+\n(\S+\n\S+)/mg;
print "NAME: $_\nVALUE: $hash{$_}\n\n" for keys %hash
You want to split by multiple # so use #+
+ match one or more times.
Try it
#!/usr/bin/perl
use strict;
my $pathconfigfile = 'config.conf';
my #configline;
open(my $configfile, "<", $pathconfigfile);
local $/;
my #configdata = split("\n\n",<$configfile>);
foreach my $data (#configdata){
my ($name,$value) = split /\n#+\n/, $data;
print "$name $value\n\n";
}
I am trying to findy the common lines among two tab separated files based one field.
One line of the first file:
1 52854 s64199.1 A . . . PR GT 0/0
One line of the second file:
chr1 52854 . C T 215.302 . AB=0.692308;ABP=7.18621;AC=1;AF=0.5;AN=2;AO=9;CIGAR=1X;DP=13;DPB=13;DPRA=0;EPP=3.25157;EPPR=3.0103;GTI=0;LEN=1;MEANALT=1;MQM=60;MQMR=60;NS=1;NUMALT=1;ODDS=17.5429;PAIRED=0;PAIREDR=0.25;PAO=0;PQA=0;PQR=0;PRO=0;QA=318;QR=138;RO=4;RPP=3.25157;RPPR=5.18177;RUN=1;SAF=0;SAP=22.5536;SAR=9;SRF=1;SRP=5.18177;SRR=3;TYPE=snp;technology.illumina=1;BVAR GT:DP:RO:QR:AO:QA:GL 0/1:13:4:138:9:318:-5,0,-5
Based on the second field (52854) in this example I have a much.
Here is my code which finds the common ones, but my files are quite large and takes a lot of time.
Is there any way to speed up the process?
Thank you very much in advance.
#!/app/languages/perl/5.14.2/bin/perl
use strict;
use warnings;
my $map_file = $ARGV[0];
my $vcf_file = $ARGV[1];
open my $map_info, $map_file or die "Could not open $map_file: $!";
my #map_array = ();
my #vcf_array = ();
while( my $mline = <$map_info>) {
chomp $mline;
my #data1 = split('\t', $mline);
my $pos1 = $data1[1];
push (#map_array, $pos1);
}
open my $vcf_info, $vcf_file or die "Could not open $vcf_file: $!";
while( my $line = <$vcf_info>) {
if ($line !~ m/^#/) {
push (#vcf_array, $line);
}
}
foreach my $a (#map_array) {
chomp $a;
foreach my $b (#vcf_array) {
chomp $b;
my #data = split('\t', $b);
my $pos2 = $data[1];
my $ref2 = $data[3];
my $allele = $data[4];
my $genotype = $data[9];
if ($a == $pos2) {
print $pos2 . "\t" . $ref2. "\t".$allele."\t".$genotype. "\n";
#print "$b\n";
}
}
}
Here's a version that should run much faster than your own
It reads the map file and stores each pos field in hash %wanted. Then it reads through the second file and checks whether the record is in the list of wanted values. If so then it splits the record and prints the fields you require
Note that I haven't been able to test this beyond making sure that it compiles
use strict;
use warnings;
use 5.010;
use autodie;
my ( $map_file, $vcf_file ) = #ARGV;
my %wanted;
{
open my $map_fh, '<', $map_file;
while ( <$map_fh> ) {
chomp;
my $pos = ( split /\t/, $_, 3 )[1];
++$wanted{$pos};
}
}
{
open my $vcf_fh, '<', $vcf_file;
while ( <$vcf_fh> ) {
next if /^#/;
chomp;
my $pos = ( split /\t/, $_, 3 )[1];
next unless $wanted{$pos};
my ( $ref, $allele, $genotype ) = ( split /\t/ )[3, 4, 9];
print join("\t", $pos, $ref, $allele, $genotype), "\n";
}
}
Below please find minimal modification of your script for hash based searches
use strict;
use warnings;
my $map_file = $ARGV[0];
my $vcf_file = $ARGV[1];
my %vcf_hash;
open( my $vcf_info, $vcf_file) or die "Could not open $vcf_file: $!";
while( my $line = <$vcf_info>) {
next if $line =~ m/^#/; # Skip comment lines
chomp $line;
my (#data) = split(/\t/, $line);
die unless #data >= 10; # Check number of fields in the input line
my ($pos) = $data[1];
# $. - line number in the file
$vcf_hash{$pos}{$.} = \#data;
}
open( my $map_info, $map_file) or die "Could not open $map_file: $!";
while( my $mline = <$map_info>) {
chomp $mline;
my (#data) = split(/\t/, $mline);
die unless #data >= 2; # Check number of fields in the input line
my ($pos) = $data[1];
if( exists $vcf_hash{$pos}) {
my $hash_ref = $vcf_hash{$pos};
for my $n (sort{$a<=>$b} keys %$hash_ref) {
my $array_ref = $hash_ref->{$n};
my $pos2 = $array_ref->[1];
my $ref2 = $array_ref->[3];
my $allele = $array_ref->[4];
my $genotype = $array_ref->[9];
print $pos2 . "\t" . $ref2. "\t".$allele."\t".$genotype. "\n";
}
}
}
The script may be improved further to reduce memory use if you use huge data files.
There is not a need to keep your map_file in memory but just keys. It is good to make them keys in a hash which you use for existence checking. You don't have to keep your vcf_file in memory as well, but you can just make a decision to output or not.
#!/app/languages/perl/5.14.2/bin/perl
use strict;
use warnings;
use autodie;
use constant KEY => 1;
use constant FIELDS => ( 1, 3, 4, 9 );
my ( $map_file, $vcf_file ) = #ARGV;
my %map;
{
my $fh;
open $fh, '<', $map_file;
while (<$fh>) {
$map{ ( split /\t/, $_, KEY + 2 )[KEY] } = undef;
}
}
{
my $fh;
open $fh, '<', $vcf_file;
while (<$fh>) {
next if /^#/;
chomp;
my #data = split /\t/;
print join "\t", #data[FIELDS] if exists $map{ $data[KEY] };
}
}
I have a data set
10-101570715-101609901-hsa-mir-3158-1 10-101600739-101609661-ENSG00000166171 10-101588288-101609668-ENSG00000166171 10-101588325-101609447-ENSG00000166171 10-101594702-101609439-ENSG00000166171 10-101570560-101596651-ENSG00000166171
10-103389007-103396515-hsa-mir-1307 10-103389041-103396023-ENSG00000173915 10-103389050-103396074-ENSG00000173915 10-103389050-103396441-ENSG00000173915 10-103389050-103396466-ENSG00000173915 10-103389050-103396466-ENSG00000173915
Except for the first element in each line, I have multiple values, which are redundant and I want to remove the redundant values. I have written a code but I don't feel its working fine.
open (fh, "file1");
while ($line=<fh>)
{
chomp ($line);
#array=$line;
my #unique = ();
my %Seen = ();
foreach my $elem ( #array )
{
next if $Seen{ $elem }++;
push #unique, $elem;
}
print #unique;
}
a hash is for duplicate detection :
my %seen;
my #removeduplicate = grep { !$seen{$_}++ } #array;
For me below code is working fine :
use strict;
use warnings;
my %seen;
open my $fh, "<", 'file.txt' or die "couldn't open : $!";
while ( my $line = <$fh>)
{
chomp $line;
my #array = split (' ', $line);
my #removeduplicate = grep { !$seen{$_}++ } #array;
print "#removeduplicate\n";
}
I have two text files that contain user records. I have to compare these two files and figure out which users are missing from File1. And delete these Orphans from file2.
#!/usr/local/bin/perl -w
use strict;
use warnings;
use autodie;
use Text::Diff;
use List::Compare;
use Data::Dumper;
my $Users1 = "Users1.txt";
my $Users2 ="Users2.txt";
my %hash1;
my %hash2;
my %new_hash;
my #sorted_1;
my #sorted_2;
my #list_keys1;
my #list_keys2;
open(my $fh1, '<:encoding(UTF-8)', $Users1) or die "Colud not open the file!";
while(my $record1 = <$fh1>)
{
chomp $record1;
my #list1 = split( '/', $record1);
foreach my $item(#list1)
{
$new_hash{$list1[1]} = $list1[0];
$hash1{$list1[1]} = $list1[0];
}
while ( my ($key, $value) = each(%hash1) ) {
push (#list_keys1, $key);
#sorted_1 = sort #list_keys1;
}
}
print "\t\tHash values for USERS1:\n";
print Dumper \%hash1;
open(my $fh2, '<:encoding(UTF-8)', $Users2) or die "Colud not open the file!";
while(my $record2 = <$fh2>)
{
chomp $record2;
my #list2 = split( '/', $record2);
foreach my $item(#list2)
{
$hash2{$list2[1]} = $list2[0];
}
while ( my ($key, $value) = each(%hash2) )
{
push (#list_keys2, $key);
#sorted_2 = sort #list_keys2;
}
}
print "\n\n\t\tHash values for Users2:\n";
print Dumper \%hash2;
#hash1{#list_keys1} = 1;
#hash2{#list_keys2} = 1;
foreach(keys %hash2)
{
print "\nThis user does not exist(to be deleted): $_\n" unless exists $hash1{$_};
}
foreach (keys %hash1)
{
print "\nNew User (to be added):$_\n" unless exists $hash2{$_};
}
close ($fh1);
close ($fh2);
Questions:
I am not able to sort the user ID (String) alphabetically(here, USER IDs are random strings of length 7). Is there any limitations when it comes to sorting array/hashes in Perl?
I am not able to compare two hashes and get the differences. What would be the most efficient way to do that?
Are there any additional libraries that I need to install in order to handle this part of code?
Sample records from file:
File1:
ASIA/ASEDF46
INDIA/PSDfT5V
CHINA/FSDfT5V
INDIA/AA44TYB
USA/BBRTT67
File 2:
INDIA/PSDfT5V
CHINA/FSDfT5V
INDIA/AA44TYB
USA/BBRTT67
UK/ZK9EELO
use strict;
use warnings;
use autodie;
open my $in, '<', 'in.txt';
open my $in2, '<', 'in_2.txt';
my (%data1, %data2);
while(<$in>){
chomp;
my #split = split/\//;
$data1{$split[0]} = $split[1];
}
while(<$in2>){
chomp;
my #split = split/\//;
$data2{$split[0]} = $split[1];
}
foreach(sort keys %data1){
print "User: $_ Value: $data1{$_}\n" if $data2{$_};
}
I currently have a csv file that looks like this:
a,b
a,d
a,f
c,h
c,d
So I saved these into a hash such that the key "a" is an array with "b,d,f" and the key "c" is an array with "h,d"... this is what I used for that:
while(<$fh>)
{
chomp;
my #row = split /,/;
my $cat = shift #row;
$category = $cat if (!($cat eq $category)) ;
push #{$hash{$category}}, #row;
}
close($fh);
Not sure about the efficiency but it seems to work when I do a Data Dump...
Now, the issue I'm having is this; I want to create a new file for each key, and in each of those files I want to print every element in the key, as such:
file "a" would look like this:
b
d
f
<end of file>
Any ideas? Everything I've tried isn't working, I'm not too familiar / experienced with hashes...
Thanks in advance :)
The output process is very simple using the each iterator, which provides the key and value pair for the next hash element in a single call
use strict;
use warnings;
use autodie;
open my $fh, '<', 'myfile.csv';
my %data;
while (<$fh>) {
chomp;
my ($cat, $val) = split /,/;
push #{ $data{$cat} }, $val;
}
while (my ($cat, $values) = each %data) {
open my $out_fh, '>', $cat;
print $out_fh "$_\n" for #$values;
}
#!/usr/bin/perl
use strict;
use warnings;
my %foos_by_cat;
{
open(my $fh_in, '<', ...) or die $!;
while (<$fh_in>) {
chomp;
my ($cat, $foo) = split /,/;
push #{ $foos_by_cat{$cat} }, $foo;
}
}
for my $cat (keys %foos_by_cat) {
open(my $fh_out, '>', $cat) or die $!;
for my $foo (#{ $foos_by_cat{$cat} }) {
print($fh_out "$foo\n");
}
}
I wrote the inner loop as I did to show the symmetry between reading and writing, but it can also be written as follows:
print($fh_out "$_\n") for #{ $foos_by_cat{$cat} };