#!/usr/local/bin/perl
# doodle2gff
# this script converts .ptt.doodle files to GFF format
#
# Authors: Xuemei Tang <txmei@hotmail.com>
# Leonardo Marino-Ramirez <marino@tofu.tamu.edu>
#
# Please cite the authors in any work or product based on this material.
#
use strict;
use Getopt::Std;
use File::Basename;
use vars qw($file @start @end @strand @length @pid @gene @synonym @spid
@cog @product $size $infile $length $organism
$opt_i $opt_l $opt_o $opt_h
);
## Check command line
my $prog = basename($0);
# getopts (rather than getopt) is used so that -h is a plain switch while
# -i, -l and -o each consume the argument that follows them.
if (!getopts('hi:l:o:')) {
    usage($prog);
    exit(1);
}
if ($opt_h) {
    usage($prog);
    exit(0);
}
# The input file, the genome size and the organism name are all used when the
# GFF is written, so refuse to run when any one of them is missing (previously
# only a missing -o was detected).
for my $required ($opt_i, $opt_l, $opt_o) {
    if (!defined $required || $required eq '') {
        usage($prog);
        exit(1);
    }
}
$infile = $opt_i;
# include genome size here
$length = $opt_l;
$organism = $opt_o;
# get input file
get_input();
# get output file
get_output();
# get input file function
#
# Reads the tab-separated file named by the global $infile and fills the
# parallel per-record arrays @pid, @gene, @start, @end, @strand, @length,
# @synonym, @spid, @product and @cog (in that column order). On return the
# global $size holds the number of records read.
#
# Clean-up applied to every record:
#   - the line terminator is removed before splitting, so a short line does
#     not leave a newline inside its last field;
#   - every space is removed from the strand;
#   - every ";" in the product is replaced by ",";
#   - a missing or empty SwissProt id becomes "-".
# Lines containing no non-blank character are skipped.
#
# Dies when no input file name is set or the file cannot be opened.
sub get_input {
    if (!defined $infile || $infile eq '') {
        die "can't open input: no input file given\n";
    }
    open(my $in, '<', $infile) or die "can't open input '$infile': $!";

    # Start from empty columns so a repeated call never mixes old and new
    # records.
    for my $column (\@pid, \@gene, \@start, \@end, \@strand, \@length,
                    \@synonym, \@spid, \@product, \@cog) {
        @{$column} = ();
    }

    while (my $line = <$in>) {
        # Strip the line ending (LF or CRLF) before the fields are split.
        $line =~ s/\r?\n\z//;
        next if $line !~ /\S/;

        # A limit of -1 keeps trailing empty fields; anything beyond the
        # tenth column is ignored.
        my ($r_pid, $r_gene, $r_start, $r_end, $r_strand, $r_len,
            $r_synonym, $r_spid, $r_product, $r_cog) = split(/\t/, $line, -1);

        # get rid of " "
        $r_strand =~ tr/ //d if defined $r_strand;
        # replace ";" with ","
        $r_product =~ tr/;/,/ if defined $r_product;
        # tidy up null spids
        $r_spid = "-" if !defined $r_spid || $r_spid eq "";

        push @pid,     $r_pid;
        push @gene,    $r_gene;
        push @start,   $r_start;
        push @end,     $r_end;
        push @strand,  $r_strand;
        push @length,  $r_len;
        push @synonym, $r_synonym;
        push @spid,    $r_spid;
        push @product, $r_product;
        push @cog,     $r_cog;
    }
    close($in);

    # get the number of records read
    $size = scalar @pid;
    return;
}
# get output file function
#
# Writes "doodle.gff" in the current directory from the globals filled in
# earlier: one Component line for the whole sequence ($organism, $length),
# then, for each of the $size records, a gene line followed by ORF, SwissProt
# and COG lines built from @start, @end, @strand, @gene, @product, @synonym,
# @spid and @cog.
#
# Dies when the output file cannot be opened or closed.
sub get_output {
    my $out_name = 'doodle.gff';
    open(my $out, '>', $out_name) or die "can't open output $out_name: $!";

    # $organism is always passed as a %s argument, never spliced into the
    # format itself, so a "%" in the organism name is written literally.
    printf {$out} "%s\tchromosome\tComponent\t1\t%d\t.\t.\t.\tSequence \"%s\"\n",
        $organism, $length, $organism;

    for my $i (0 .. $size - 1) {
        printf {$out} "%s\tdoodle\tgene\t%d\t%d\t.\t%s\t.\tGene \"%s\" ; Note \"%s\"\n",
            $organism, $start[$i], $end[$i], $strand[$i], $gene[$i], $product[$i];

        # The remaining three lines differ only in the feature/attribute tag
        # and in the identifier that goes with it.
        for my $feature ([ 'ORF',       $synonym[$i] ],
                         [ 'SwissProt', $spid[$i] ],
                         [ 'COG',       $cog[$i] ]) {
            my ($tag, $id) = @{$feature};
            printf {$out} "%s\tdoodle\t%s\t%d\t%d\t.\t%s\t.\t%s \"%s\" ; Note \"%s\\; %s\"\n",
                $organism, $tag, $start[$i], $end[$i], $strand[$i],
                $tag, $id, $gene[$i], $product[$i];
        }
    }

    # Buffered write errors only surface at close, so check it.
    close($out) or die "can't close output $out_name: $!";
    return;
}
## Normal end
# Leave with status 0 once the top-level statements above have run.
exit 0;
## Usage display
#
# Prints the command-line synopsis to STDERR.
#   $p - program name to show in the synopsis.
# The input file is given with -i (no positional argument is read), and the
# result is always written to doodle.gff in the current directory.
sub usage {
    my $p = shift;
    print STDERR <<"USAGE";
usage: $p -i <file> -l <size> -o <organism>
options:
  -h             Usage display.
  -i <file>      Input file.
  -l <size>      Genome size in bp.
  -o <organism>  Organism.
Output is written to doodle.gff in the current directory.
USAGE
    return;
}