[cvs] / pfold / scan2gff.pl Repository:
ViewVC logotype

View of /pfold/scan2gff.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (download) (as text) (annotate)
Thu Apr 6 02:01:07 2006 UTC (4 years, 5 months ago) by ybendana
Branch: MAIN
CVS Tags: HEAD
Changes since 1.9: +13 -26 lines
Reverted to 1.8 so that gaps at start of alignment have gff coord of 1.  Added notes.
#!/usr/bin/perl -w

#
# scan2gff.pl
# Converts the output of pfoldscan.pl to gff output. 
# The orthology map coordinates are 0 based and half open, meaning the 
# rightmost coordinate is one greater than the sequence length.
#    (See http://hanuman.math.berkeley.edu/genomes/fileformats.html.)
# The gff format is 1 based and inclusive.  
#    (See http://www.sanger.ac.uk/Software/formats/GFF/GFF_Spec.shtml.)  
# Thus the conversion of coordinates is:
#    map:[1,3) -> gff:[2,3]
# The pfoldscan column coordinates are 0 based and are created from each alignment segment
#    file, so they include gaps (they are not the alignment coordinates, however).  
#    The map and gff coordinates refer to ungapped sequences.
# Note: Windows that are all gaps for the reference sequence will not be output.

# command-line options
my $specname = "DroMel_4";
my $mapfile = "map";
my $genomefile = "genomes";
my $verbose = 0;
my $debug = 0;

# constants
my $pfoldsuffix = ".pfoldscan";
my $alignsuffix = ".fa";
my $NA = "NA";   # text used to indicate missing sequence in mapfiles

my $source = "pfold";
my $feature = "RNA";

my $usage = "Usage: $0 [options] <pfoldscan directory> <alignment directory>\n";
$usage .= "      [-s <species name>] species name in alignment file (default is '$specname')\n";
$usage .= "      [-m <mapfile name>] name of mapfile in alignment directory (default is '$mapfile')\n";
$usage .= "   [-g <genomefile name>] name of mapfile in alignment directory (default is '$genomefile')\n";
$usage .= "                     [-v] verbose warnings\n";

my @argv;
while (@ARGV) {
    my $arg = shift;
    if ($arg =~ /^-/) {
	if ($arg eq "-s") { defined ($specname = shift) or die $usage }
	elsif ($arg eq "-m") { defined ($mapfile = shift) or die $usage }
	elsif ($arg eq "-v") { $verbose = 1 }
	elsif ($arg eq "-d") { $debug = 1 }
	else { die $usage }
    } else {
	push @argv, $arg;
    }
}

die $usage unless @argv == 2;
my $scandir = shift(@argv);
my ($aligndir) = @argv;

# get order of genomes from genome file
open GENOME, "<$aligndir/$genomefile" or die "Couldn't open '$genomefile': $!";
my $species = <GENOME>;
chomp $species;
my @species = split /\s+/, $species;
my $specnum;   # the position of our species in the genomefile determines which fields to look at in the map file
for ($specnum = @species - 1; $specnum >= 0 && $species[$specnum] ne $specname; --$specnum) { }
close GENOME;
unless ($specnum >= 0) { die "Couldn't find '$specname' in '$genomefile'\n" }

# loop through mapfile
open MAP, "<$aligndir/$mapfile" or die "Couldn't open '$mapfile': $!";
my $scanfiles_parsed = 0;
while (<MAP>) {
    # read in mapfile line.  The first field is line number, save in $n.
    my ($n, @map) = split;
    my $mapline = $_;   # make a copy of the map line for later error reporting

    # make filenames
    my $pfoldfile = "$scandir/$n$pfoldsuffix";
    my $alignfile = "$aligndir/$n$alignsuffix";
    unless (-e $pfoldfile) { warn "Couldn't find scanfile '$pfoldfile'; skipping '$alignfile'\n" if $verbose; next }

    # read in species row from alignment file
    my @col;
    open ALIGN, "<$alignfile" or die "Couldn't open '$alignfile': $!";
    my $specflag = 0;
    my $row = "";
    my @gapSeq;
    my @noGapSeq;
    while (<ALIGN>) {
	if (/^\s*>(\S+)/) {
	    my $rowname = $1;
	    last if $specflag; # exit loop if already have found our species
	    $specflag = 1 if $rowname eq $specname;
	} elsif ($specflag) {
	    chomp; # Operates on $_
	    s/\s//g;
	    $row .= $_;
	}
    }
    close ALIGN;
    unless ($specflag) { warn "Couldn't find '$specname' in '$alignfile'; skipping\n" if $verbose; next }
    @gapSeq = split(//,$row);

    # get coords for our species
    my ($name, $start, $end, $strand) = @map[$specnum*4..$specnum*4+3];
    if ($strand eq $NA) { warn "Alignment '$alignfile' contains no sequence data for '$specname'; skipping\n" if $verbose; next }

    # Add one to the mapfile start coordinate since it is 0 based
    $start++;

    # Map alignment rows to sequence coords.
    # The coordinates given in the mapfile are relative to the positive strand. For a negative strand 
    # map file entry, the corresponding sequence in the alignment file is the reverse complement
    # of the positive strand.  This is why the map file start/end coordinates are reversed.
    #
    # Note: Any gaps preceding a character will have the same gff coordinate as the character.
    #       Gaps following a character will have a gff coordinate greater by 1.
    #
    my @col2seqpos;
    my $step = $strand . '1';  # convert strand ("+" or "-") into integer multiplier ("+1" or "-1")
    my ($seqpos, $endpos) = $step > 0 ? ($start, $end) : ($end, $start);
    my $charcount = 0;
    for (my $col = 0; $col < length($row); ++$col) {
	push @col2seqpos, $seqpos;
	my $char = substr ($row, $col, 1);
	# Do not increment the sequence position counter if there is a gap.
	unless ($char eq '-' || $char eq '.') {
	    $seqpos += $step;
	    ++$charcount;
	    if ($debug)
	    {
	      push @noGapSeq, $char;
	    }
	}
    }
    if ($debug)
    {
      print "charcount=$charcount; start=$start, end=$end\n";
      print "Upgapped length=" . ($#noGapSeq+1) . "; Gapped Length=" . ($#gapSeq+1) . "\n";
    }
    $mapSeqLength = $end+1-$start;
    unless ($mapSeqLength == $charcount) {
	warn "FAILED SEQUENCE CO-ORDINATE INTEGRITY CHECK\nNumber of non-gap characters in sequence data for '$specname' in file '$alignfile' ($charcount)\n does not match number of characters implied by entry \#", $specnum+1, " in line $n of file '$aligndir/$mapfile' (", $mapSeqLength, ")\n [start=$start, end=$end, strand='$strand']\nMapfile line:\n$mapline\n";
	next;
    }

    # loop through pfoldscan file
    open PFOLDSCAN, "<$pfoldfile" or die "Couldn't open '$pfoldfile': $!";
    while (<PFOLDSCAN>) {
	next if /unparseable/; # Skip line if no score was returned
	my ($startcol, $endcol, $score, $prob) = split;
	# If the subsequence is all gaps, skip.
	my $gapSeq = join("",@gapSeq[$startcol..$endcol]);
	next if $gapSeq =~ /^-+$/;
	# Get a 2 element slice of col2seqpos
	my ($startpos, $endpos) = @col2seqpos[$step > 0 ? ($startcol, $endcol) : ($endcol, $startcol)];

	# output a GFF line
	print join ("\t", $name, $source, $feature, $startpos, $endpos, $score, $strand, '.', "Probability $prob Alignment $alignfile"), "\n";
	if ($debug)
	{
	  print "Gapped Seq ($startcol, $endcol): @gapSeq[$startcol..$endcol]\n";
	  print "Ungapped seq ($startpos-$start, $endpos-$start): @noGapSeq[($startpos-$start)..($endpos-$start)]\n";
	}
    }
    close PFOLDSCAN;
    ++$scanfiles_parsed;
}
close MAP;

warn "Warning: didn't parse any pfoldscan files"  if $verbose && $scanfiles_parsed == 0;

Questions? Mail ihh at fruitfly dot org
ViewVC Help
Powered by ViewVC 1.0.3