Parent Directory
|
Revision Log
Reverted to 1.8 so that gaps at start of alignment have gff coord of 1. Added notes.
#!/usr/bin/perl -w # # scan2gff.pl # Converts the output of pfoldscan.pl to gff output. # The orthology map coordinates are 0 based and half open, meaning the # rightmost coordinate is one greater than the sequence length. # (See http://hanuman.math.berkeley.edu/genomes/fileformats.html.) # The gff format is 1 based and inclusive. # (See http://www.sanger.ac.uk/Software/formats/GFF/GFF_Spec.shtml.) # Thus the conversion of coordinates is: # map:[1,3) -> gff:[2,3] # The pfoldscan column coordinates are 0 based and are created from each alignment segment # file, so they include gaps (they are not the alignment coordinates, however). # The map and gff coordinates refer to ungapped sequences. # Note: Windows that are all gaps for the reference sequence will not be output. # command-line options my $specname = "DroMel_4"; my $mapfile = "map"; my $genomefile = "genomes"; my $verbose = 0; my $debug = 0; # constants my $pfoldsuffix = ".pfoldscan"; my $alignsuffix = ".fa"; my $NA = "NA"; # text used to indicate missing sequence in mapfiles my $source = "pfold"; my $feature = "RNA"; my $usage = "Usage: $0 [options] <pfoldscan directory> <alignment directory>\n"; $usage .= " [-s <species name>] species name in alignment file (default is '$specname')\n"; $usage .= " [-m <mapfile name>] name of mapfile in alignment directory (default is '$mapfile')\n"; $usage .= " [-g <genomefile name>] name of mapfile in alignment directory (default is '$genomefile')\n"; $usage .= " [-v] verbose warnings\n"; my @argv; while (@ARGV) { my $arg = shift; if ($arg =~ /^-/) { if ($arg eq "-s") { defined ($specname = shift) or die $usage } elsif ($arg eq "-m") { defined ($mapfile = shift) or die $usage } elsif ($arg eq "-v") { $verbose = 1 } elsif ($arg eq "-d") { $debug = 1 } else { die $usage } } else { push @argv, $arg; } } die $usage unless @argv == 2; my $scandir = shift(@argv); my ($aligndir) = @argv; # get order of genomes from genome file open GENOME, "<$aligndir/$genomefile" or die "Couldn't open '$genomefile': $!"; my $species = <GENOME>; chomp $species; my @species = split /\s+/, $species; my $specnum; # the position of our species in the genomefile determines which fields to look at in the map file for ($specnum = @species - 1; $specnum >= 0 && $species[$specnum] ne $specname; --$specnum) { } close GENOME; unless ($specnum >= 0) { die "Couldn't find '$specname' in '$genomefile'\n" } # loop through mapfile open MAP, "<$aligndir/$mapfile" or die "Couldn't open '$mapfile': $!"; my $scanfiles_parsed = 0; while (<MAP>) { # read in mapfile line. The first field is line number, save in $n. my ($n, @map) = split; my $mapline = $_; # make a copy of the map line for later error reporting # make filenames my $pfoldfile = "$scandir/$n$pfoldsuffix"; my $alignfile = "$aligndir/$n$alignsuffix"; unless (-e $pfoldfile) { warn "Couldn't find scanfile '$pfoldfile'; skipping '$alignfile'\n" if $verbose; next } # read in species row from alignment file my @col; open ALIGN, "<$alignfile" or die "Couldn't open '$alignfile': $!"; my $specflag = 0; my $row = ""; my @gapSeq; my @noGapSeq; while (<ALIGN>) { if (/^\s*>(\S+)/) { my $rowname = $1; last if $specflag; # exit loop if already have found our species $specflag = 1 if $rowname eq $specname; } elsif ($specflag) { chomp; # Operates on $_ s/\s//g; $row .= $_; } } close ALIGN; unless ($specflag) { warn "Couldn't find '$specname' in '$alignfile'; skipping\n" if $verbose; next } @gapSeq = split(//,$row); # get coords for our species my ($name, $start, $end, $strand) = @map[$specnum*4..$specnum*4+3]; if ($strand eq $NA) { warn "Alignment '$alignfile' contains no sequence data for '$specname'; skipping\n" if $verbose; next } # Add one to the mapfile start coordinate since it is 0 based $start++; # Map alignment rows to sequence coords. # The coordinates given in the mapfile are relative to the positive strand. For a negative strand # map file entry, the corresponding sequence in the alignment file is the reverse complement # of the positive strand. This is why the map file start/end coordinates are reversed. # # Note: Any gaps preceding a character will have the same gff coordinate as the character. # Gaps following a character will have a gff coordinate greater by 1. # my @col2seqpos; my $step = $strand . '1'; # convert strand ("+" or "-") into integer multiplier ("+1" or "-1") my ($seqpos, $endpos) = $step > 0 ? ($start, $end) : ($end, $start); my $charcount = 0; for (my $col = 0; $col < length($row); ++$col) { push @col2seqpos, $seqpos; my $char = substr ($row, $col, 1); # Do not increment the sequence position counter if there is a gap. unless ($char eq '-' || $char eq '.') { $seqpos += $step; ++$charcount; if ($debug) { push @noGapSeq, $char; } } } if ($debug) { print "charcount=$charcount; start=$start, end=$end\n"; print "Upgapped length=" . ($#noGapSeq+1) . "; Gapped Length=" . ($#gapSeq+1) . "\n"; } $mapSeqLength = $end+1-$start; unless ($mapSeqLength == $charcount) { warn "FAILED SEQUENCE CO-ORDINATE INTEGRITY CHECK\nNumber of non-gap characters in sequence data for '$specname' in file '$alignfile' ($charcount)\n does not match number of characters implied by entry \#", $specnum+1, " in line $n of file '$aligndir/$mapfile' (", $mapSeqLength, ")\n [start=$start, end=$end, strand='$strand']\nMapfile line:\n$mapline\n"; next; } # loop through pfoldscan file open PFOLDSCAN, "<$pfoldfile" or die "Couldn't open '$pfoldfile': $!"; while (<PFOLDSCAN>) { next if /unparseable/; # Skip line if no score was returned my ($startcol, $endcol, $score, $prob) = split; # If the subsequence is all gaps, skip. my $gapSeq = join("",@gapSeq[$startcol..$endcol]); next if $gapSeq =~ /^-+$/; # Get a 2 element slice of col2seqpos my ($startpos, $endpos) = @col2seqpos[$step > 0 ? ($startcol, $endcol) : ($endcol, $startcol)]; # output a GFF line print join ("\t", $name, $source, $feature, $startpos, $endpos, $score, $strand, '.', "Probability $prob Alignment $alignfile"), "\n"; if ($debug) { print "Gapped Seq ($startcol, $endcol): @gapSeq[$startcol..$endcol]\n"; print "Ungapped seq ($startpos-$start, $endpos-$start): @noGapSeq[($startpos-$start)..($endpos-$start)]\n"; } } close PFOLDSCAN; ++$scanfiles_parsed; } close MAP; warn "Warning: didn't parse any pfoldscan files" if $verbose && $scanfiles_parsed == 0;
| Questions? Mail ihh at fruitfly dot org | ViewVC Help |
| Powered by ViewVC 1.0.3 |