print specific word staring with in text and count - arrays

I like to find word start with sid=word and sid=text and print and count it the same word.
sid=word 2
sid=text 5
I have try make some script
use warnings;
use strict;
my $input = 'input.txt';
my $output = 'output.txt';
open (FILE, "<", $input) or die "Can not open $input $!";
open my $out, '>', $output or die "Can not open $output $!";
while (<FILE>){
foreach my #arr = /(?: ^|\s )(sid=\S*) {
$count{$arr}++;
}
}
foreach my #arr (sort keys %count){
printf "%-31s %s\n", $str, $count{$arr};
}
but show error missing $ on loop variable
anyone can help me out what i miss.
thanks.

This should produce desired output to output.txt, with words in order of appearance
use warnings;
use strict;
my $input = 'input.txt';
my $output = 'output.txt';
open (my $FILE, "<", $input) or die "Can not open $input $!";
open (my $out, ">", $output) or die "Can not open $output $!";
my (%count, #arr);
while (<$FILE>){
if ( /(?: ^|\s )(sid=\S*)/x ) {
push #arr, $1 if !$count{$1};
$count{$1}++;
}
}
foreach my $str (#arr) {
print $out sprintf("%-31s %s\n", $str, $count{$str});
}

Related

Definition error when using columns in one file to find matching columns in another file with perl

I have a tab delimited input file in the format:
+ Chr1 www
- Chr2 zzz
...
I would like to go line by line against a reference tab delimited file (TRANSCRIPTS in the code below) in the format of:
Chr1 + xxx UsefulInfo1
Chr2 - yyy UsefulInfo2
...
And would like an output that looks like:
+ Chr1 UsefulInfo1
- Chr2 UsefulInfo2
...
Here is my attempt to take variable names from the command line, grab certain info from the input file, and append the useful info from the reference file:
#!/usr/bin/perl
use strict;
use warnings;
use diagnostics;
my $inFile = $ARGV[0];
my $outFile = $ARGV[1];
open(INFILE, "<$inFile") || die("Couldn't open $inFile: $!\n");
open(OUTFILE, ">$outFile") || die("Couldn't create $outFile: $!\n");
open(TRANSCRIPTS, "</path/TranscriptInfo") || die("Couldn't open reference file!");
my #transcripts = split(/\t+/, <TRANSCRIPTS>);
chomp #transcripts;
#Define desired information from input for later
while (my #columns = split(/\t+/, <INFILE>)) {
chomp #columns;
my $strand = $columns[0];
my $chromosome = $columns[1];
#Attempt to search reference file line by line for matching criteria and copying a column of matching lines
foreach my $reference(#transcripts) {
my $refChr = $reference[0]; #Error for this line
my $refStrand = $reference[1]; #Error for this line
if ($refChr eq $chromosome && $refStrand eq $strand) {
my $info = $reference[3]; #Error for this line
print OUTFILE "$strand\t$chromosome\t\$info\n";
}
}
}
close(OUTFILE); close(INFILE);
At the moment I receive "Global symbol "#reference" requires explicit package name." What is the proper way to define this? I'm not even entirely sure my foreach loop functions as desired even once defining the symbol properly.
Fixed:
use strict;
use warnings;
use feature qw( say );
my $in_qfn = $ARGV[0];
my $out_qfn = $ARGV[1];
my $transcripts_qfn = "/path/TranscriptInfo";
my #transcripts;
{
open(my $transcripts_fh, "<", $transcripts_qfn)
or die("Can't open \"$transcripts_qfn\": $!\n");
while (<$transcripts_fh>) {
chomp;
push #transcripts, [ split(/\t/, $_, -1) ];
}
}
{
open(my $in_fh, "<", $in_qfn)
or die("Can't open \"$in_qfn\": $!\n");
open(my $out_fh, ">", $out_qfn)
or die("Can't create \"$out_qfn\": $!\n");
while (<$in_fh>) {
chomp;
my ($strand, $chr) = split(/\t/, $_, -1);
for my $transcript (#transcripts) {
my $ref_chr = $transcript->[0];
my $ref_strand = $transcript->[1];
if ($chr eq $ref_chr && $strand eq $ref_strand) {
my $info = $transcript->[2];
say $out_fh join("\t", $strand, $chr, $info);
}
}
}
}
That said, the above is very inefficient. Let's call N the number of lines in $transcript_qfn, and let's call M the number of lines in $in_qfn. The inner loop executes a number of times equal to N*M. In fact, it needs only execute N times.
use strict;
use warnings;
use feature qw( say );
my $in_qfn = $ARGV[0];
my $out_qfn = $ARGV[1];
my $transcripts_qfn = "/path/TranscriptInfo";
my %to_print;
{
open(my $in_fh, "<", $in_qfn)
or die("Can't open \"$in_qfn\": $!\n");
while (<$in_fh>) {
chomp;
my ($strand, $chr) = split(/\t/, $_, -1);
++$to_print{$strand}{$chr};
}
}
{
open(my $transcript_fh, "<", $transcript_qfn)
or die("Can't open \"$transcript_qfn\": $!\n");
open(my $out_fh, ">", $out_qfn)
or die("Can't create \"$out_qfn\": $!\n");
while (<$transcript_fh>) {
chomp;
my ($ref_chr, $ref_strand, $info) = split(/\t/, $_, -1);
next if !$to_print{$ref_strand};
next if !$to_print{$ref_strand}{$ref_chr};
say $out_fh join("\t", $ref_strand, $ref_chr, $info);
}
}

Perl array overriding

I am trying to do sum of the array in my perl code but I was not able to get the right output.
Here is my sample code
use File::Find::Rule;
use Date::Parse;
my ($dir, $type, $fh, $line, $str_1,
$str_2,$str_3, $str_4);
my #array;
$dir = '/dir/test/';
$type = '*';
$str_1 = 'somestr1';
$str_2 = 'somestr2';
$str_3 = 'somestr3';
$str_4 = 'somestr4';
my #files = File::Find::Rule->file()->name($type)->in($dir);
open $out, '>>', "output_log" or die "Unable to open 'output_log' : $!";
print $out "\Logs \n";
print $out "--------------------------\n";
close $out or die "Unable to finish writing output_log : $!";
for my $file (#files) {
open $fh, '<', $file or die "can't open $file: $!";
open $out, '>>', "output_log" or die "Unable to open 'output_log' : $!";
while ( $line = <$fh> ) {
if ( $line !~ /$str_1/ && $line =~ /$str_2/ )
{
#array = $somevar # result of this loop 2,3
}
if ( $line !~ /$str_3/ && $line =~ /$str_4/ )
{
#array = $somevar #result of this loop 2,3,4,5,6
}
}
close $out or die "Unable to finish writing output_log : $!";
}
So Here is what I want
#array = (2,3,2,3,4,5,6)
and sum of #array
but unfornately,if i print this array that is running each and every line but instead i want to write a loop which stores the result in array from both if blocks..
now the code is overriding the #array in the second if block. Hope I made this clear!! Please help
#array = ... overwrites the contents of the array. Use push to add elements to an existing array.
For sum, see sum (or sum0) in List::Util.

How to count the number of keys that exist in a hash?

I am working with an input file that contains tab delimitated sequences. Groups of sequences are separated by line breaks. The file looks like:
TAGC TAGC TAGC HELP
TAGC TAGC TAGC
TAGC HELP
TAGC
Here is the code I have:
use strict;
use warnings;
open(INFILE, "<", "/path/to/infile.txt") or die $!;
my %hash = (
TAGC => 'THIS_EXISTS',
GCTA => 'THIS_DOESNT_EXIST',
);
while (my $line = <INFILE>){
chomp $line;
my $hash;
my #elements = split "\t", $line;
open my $out, '>', "/path/to/outfile.txt" or die $!;
foreach my $sequence(#elements){
if (exists $hash{$sequence}){
print $out ">$sequence\n$hash{$sequence}\n";
}
else
}
$count++;
print "Doesn't exist ", $count, "\n";
}
}
}
How can I tell how many sequences exist before I print? I need to put that information into the name of the output file.
Ideally, I would have a variable that I could include in the name of the file. Unfortunately, I can't just take the scalar of #elements because there are some sequences that won't get printed out. When I try to push the keys that exist into an array and then print the scalar of that array, I still don't get the results I need. Here is what I tried (all variables that need to be global are):
open my $out, '>', "/path/to/file.$number.txt" or die $!;
foreach my $sequence(#elements){
if (exists $hash{$sequence}){
push(#Array, $hash{$sequence}, "\n");
my $number = #Array;
print $out ">$sequence\n$hash{$sequence}\n";
#....
Thanks for the help. Really appreciate it.
my $sequences = grep exists $hash{$_}, #elements;
open my $out, '>', "/path/to/outfile_containing_$sequences.txt" or die $!;
In list context, grep filters a list by a criterion; in scalar context, it returns a count of elements that met the criterion.
The easiest way would be to keep track of how many keys you are printing in a variable and once your loop finish, just rename the file with the number you calculated. Perl comes with a built-in function to do this. The code would be something like this:
use strict;
use warnings;
open(INFILE, "<", "/path/to/infile.txt") or die $!;
my %hash = (
TAGC => 'THIS_EXISTS',
GCTA => 'THIS_DOESNT_EXIST',
);
my $ammt;
while (my $line = <INFILE>){
chomp $line;
my $hash;
my #elements = split "\t", $line;
open my $out, '>', "/path/to/outfile.txt" or die $!;
foreach my $sequence(#elements){
if (exists $hash{$sequence}){
print $out ">$sequence\n$hash{$sequence}\n";
$ammt++;
}
else
}
print "Doesn't exist ", $count, "\n";
}
}
}
rename "/path/to/outfile.txt", "/path/to/outfile${ammt}.txt" or die $!;
I removed the $count variable, since it's not declared in your code (strict would complain about that). Here's the official doc for rename. Since it returns True or False, you can check that it was successful or not.
By the way, be aware that:
push(#Array, $hash{$sequence}, "\n");
is storing two items ($hash{$sequence} and \n), so that count would be twice as it should be.

Simple Perl: can't compare array elements to strings

I initially wrote a script to convert a wordlist(each line is just 1 word) into an array of words #keywords(each line an element) using:
-------------------
open (FH, "< $keyword_file") or die "Can't open $keyword_file for read: $!";
my #keywords;
while (<FH>) {
push (#keywords, $_);
}
close FH or die "Cannot close $keyword_file: $!";
--------------------
I am now trying to use regex to compare this with other strings, but i just keep getting false results for some reason?
-----------------------
FULL PROGRAM
-----------------------------------
#!/usr/bin/env perl
use strict;
use warnings;
use Data::Dumper;
print "\n[Keywords]";
my $keyword_file = "keywords.txt";
#read keywords
my #keywords;
open (FH, "$keyword_file") or die "Can't open $keyword_file for read: $!";
while (<FH>) {
chomp;
push (#keywords, $_);
}
close FH or die "Cannot close $keyword_file: $!";
#pattern match
foreach(#keywords)
{
if ("print" =~ m/$_/) {
print "match found\n";
}
}
----------------------------------
the above argument is supposed to be true but it just keeps returning false. What am i doing wrong? Is this because the array is storing nextlines(enter) as well (sorry if i sound ignorant for thinking this :p)?
Yes, you need to chomp your array to remove line endings:
chomp(#keywords);
You forget to remove the newline in your keywords, try this:
while (<FH>) {
chomp;
push (#keywords, $_);
}
Here is a test program:
#!/usr/bin/perl
use strict;
use warnings;
my #keyword;
while (<DATA>) {
chomp;
push #keyword, $_;
}
foreach (#keyword) {
if ("print exit write" =~ m/$_/) {
print "match found\n";
}
}
__DATA__
print
exit
write

Create CSV file from 2d array perl

I am trying to load a csv file, transpose it and write a new one. I have everything working correctly except writing a new file. I have looked around online without success.
use strict;
use warnings;
use Text::CSV;
use Data::Dump qw(dump);
use Array::Transpose;
my #data; # 2D array for CSV data
my $file = 'sample_array.csv';
my $csv = Text::CSV->new;
open my $fh, '<', $file or die "Could not open $file: $!";
while( my $row = $csv->getline( $fh ) ) {
shift #$row; # throw away first value
push #data, $row;
}
#data=transpose(\#data);
dump(#data);
The output here is the transposed array #data (["blah", 23, 22, 43], ["tk1", 1, 11, 15],["huh", 5, 55, 55]). I need that output to be written to a new CSV file.
CSV file:
text,blah,tkl,huh
14,23,1,5
12,22,11,55
23,42,15,55
Refer to the code after dump. This was derived from the Text::CSV SYNOPSIS:
use strict;
use warnings;
use Text::CSV;
use Data::Dump qw(dump);
use Array::Transpose;
my #data; # 2D array for CSV data
my $file = 'sample_array.csv';
my $csv = Text::CSV->new;
open my $fh, '<', $file or die "Could not open $file: $!";
while( my $row = $csv->getline( $fh ) ) {
shift #$row; # throw away first value
push #data, $row;
}
#data=transpose(\#data);
dump(#data);
open $fh, ">:encoding(utf8)", "new.csv" or die "new.csv: $!";
for (#data) {
$csv->print($fh, $_);
print $fh "\n";
}
close $fh or die "new.csv: $!";
Along with Toolic's addition I had to make some edits due to the specific type of data I was dealing with. This was an extremely large set with engineering symbols & units and negative numbers with long decimals. For reference, my final code is below.
use strict;
use warnings;
use Text::CSV;
use Data::Dump qw(dump);
use Array::Transpose;
my #data; # 2D array for CSV data
my $file = 'rawdata.csv';
my $csv = Text::CSV->new({ binary => 1, quote_null => 0 });
open my $fh, '<', $file or die "Could not open $file: $!";
while( my $row = $csv->getline( $fh ) ) {
#shift #$row; # throw away first value, I needed the first values.
push #data, $row;
}
#data=transpose(\#data);
open $fh, ">:encoding(utf8)", "rawdata_trans.csv" or die "rawdata_trans.csv: $!";
for (#data) {
$csv->print($fh, $_);
print $fh "\n";
}
close $fh or die "rawdata_trans.csv: $!";

Resources