#!/usr/local/bin/perl

# doodle2gff
# this script converts .ptt.doodle files to GFF format
# 
# Authors: Xuemei Tang <txmei@hotmail.com>
#          Leonardo Marino-Ramirez <marino@tofu.tamu.edu>
# 
# Please cite the authors in any work or product based on this material.
#
use strict;
use Getopt::Std;
use File::Basename;

use vars qw($file @start @end @strand @length @pid @gene @synonym @spid 
	    @cog @product $size $infile $length $organism
	    $opt_i $opt_l $opt_o $opt_h
	    );

## Check command line
my $prog = basename($0);

# -h is a plain flag; -i, -l and -o each take a value.  getopts() lets us
# say so.  With getopt('hilo') every switch, -h included, consumed the next
# word as its argument, so a trailing "-h" was ignored and a leading "-h"
# swallowed the switch that followed it.
if (!getopts('hi:l:o:')) {
    # unknown switch or missing value: getopts has already warned
    usage($prog);
    exit(1);
}

if ($opt_h) {
    usage($prog);
    exit(0);
}

# The input file and the organism name are both mandatory.  Each one is
# tested on its own merits so that a missing -i is reported as a usage
# error instead of surfacing later as a failed open().
if (!defined $opt_i || $opt_i eq '' || !defined $opt_o || $opt_o eq '') {
    usage($prog);
    exit(1);
}
$infile   = $opt_i;
$organism = $opt_o;

# Genome size: kept optional as before, but say so when it is missing,
# because the chromosome line is then written with an end coordinate of 0.
if (defined $opt_l && $opt_l ne '') {
    $length = $opt_l;
}
else {
    warn "$prog: no genome size given (-l); chromosome line will end at 0\n";
}

# read the records from the input file into the global arrays
get_input(); 

# write doodle.gff from those arrays
get_output();

# get input file function
#
# Reads the tab-separated file named by the global $infile.  The first ten
# columns of every non-blank line are appended, in this order, to the global
# arrays @pid, @gene, @start, @end, @strand, @length, @synonym, @spid,
# @product and @cog; a column missing from a short line is stored as ''.
# Afterwards:
#   - all spaces are removed from each @strand entry,
#   - every ";" in each @product entry is replaced by ",",
#   - empty @spid entries become "-",
#   - the global $size is set to the number of records read.
# Dies if no input file name is set or the file cannot be opened.
sub get_input{
   die "can't open input: no input file given\n"
       unless defined $infile && $infile ne '';

   # three-argument open: the file name can never be taken for a mode
   open(my $in, '<', $infile) || die "can't open input: $!";

   # destination arrays, in input column order
   my @columns = (\@pid, \@gene, \@start, \@end, \@strand,
                  \@length, \@synonym, \@spid, \@product, \@cog);

   # start from empty arrays so a second call cannot leave stale records
   @$_ = () for @columns;

   # read the line by the end of the file
   while (my $line = <$in>)
   {
       # Strip the line terminator (Unix or DOS) before splitting, so it
       # cannot end up inside the last field of a short record.
       $line =~ s/\r?\n\z//;

       # a blank line is not a record
       next unless $line =~ /\S/;

       # split the line on tabs; the -1 keeps trailing empty columns
       my @field = split(/\t/, $line, -1);
       for my $c (0 .. $#columns) {
           push @{ $columns[$c] }, defined $field[$c] ? $field[$c] : '';
       }
   }
   # close the file
   close($in);

   # get the number of the element for the array location
   $size = @pid;

   for my $j (0 .. $size - 1)
   {
       # get rid of every " "
       $strand[$j] =~ tr/ //d;
       # replace every ";" with ","
       $product[$j] =~ tr/;/,/;
       # tidy up null spids
       $spid[$j] = "-" if $spid[$j] eq "";
   }
}

# get output file function
#
# Writes doodle.gff in the current directory from the globals $organism,
# $length, $size and the per-record arrays.  The file holds one
# "chromosome / Component" line spanning 1..$length, followed, for each of
# the $size records, by four "doodle" lines sharing the record's start, end
# and strand: gene (tagged with @gene), ORF (@synonym), SwissProt (@spid)
# and COG (@cog).
# Dies if the file cannot be opened or cannot be closed cleanly.
sub get_output{
  my $outfile = "doodle.gff";

  # write the result to the output file
  open(my $out, '>', $outfile) || die "can't open output $outfile: $!";

  # $organism is always handed to printf as an argument and never pasted
  # into the format string, so a "%" in the organism name is written
  # literally instead of being read as a conversion.
  printf {$out} "%s\tchromosome\tComponent\t1\t%d\t.\t.\t.\tSequence \"%s\"\n",
      $organism, $length, $organism;

  # feature type (also used as the attribute tag) and the array that
  # supplies the tag's value
  my @feature = (
      [ 'ORF',       \@synonym ],
      [ 'SwissProt', \@spid    ],
      [ 'COG',       \@cog     ],
  );

  for my $i (0 .. $size - 1)
  {
     printf {$out} "%s\tdoodle\tgene\t%d\t%d\t.\t%s\t.\tGene \"%s\" ; Note \"%s\"\n",
         $organism, $start[$i], $end[$i], $strand[$i], $gene[$i], $product[$i];

     for my $f (@feature)
     {
        my ($type, $ids) = @$f;
        # the "\\;" puts a literal backslash-semicolon between gene and product
        printf {$out} "%s\tdoodle\t%s\t%d\t%d\t.\t%s\t.\t%s \"%s\" ; Note \"%s\\; %s\"\n",
            $organism, $type, $start[$i], $end[$i], $strand[$i],
            $type, $ids->[$i], $gene[$i], $product[$i];
     }
  }

  # buffered write errors only show up here
  close($out) || die "can't close output $outfile: $!";
}

## Normal end
# Reached once the get_input() and get_output() calls above have returned;
# the sub definitions in between are only compiled, not run, on the way here.
exit(0);

## Usage display
#
# Prints the command-line synopsis to STDERR.
#   $p - program name to show in the synopsis
# Does not exit; the caller decides the exit status.
sub usage {
  my $p = shift;
  # The synopsis lists the switches the script actually reads: it takes no
  # positional file argument, and -i, -l and -o each expect a value.
  print STDERR <<USAGE;
usage: $p -i <file> -l <size> -o <organism>

options:
    -h              Usage display.
    -i <file>       Input file (tab-separated .ptt.doodle).
    -l <size>       Genome size in bp.
    -o <organism>   Organism.

The result is written to doodle.gff in the current directory.
USAGE
  return;
}