#!/usr/local/bin/perl

# coils2pos.pl
# Reads every *.ccp file (normally *.aa.ccp) in the directory given with -d
# (originally /home/httpd/html/pub/ccp), takes the GenBank id from the file
# name, finds the positions of the lower-case letters in the file's data,
# and appends the id and positions to the output file.
# 
# Authors: Xuemei Tang <txmei@hotmail.com>
#          Leonardo Marino-Ramirez <marino@tofu.tamu.edu>
# 
# Please cite the authors in any work or product based on this material.
#

use strict;
use Getopt::Std;
use File::Basename;

use vars qw($opt_d $file $filename $size $length $count $se @ccp_files @f 
	    @start @end $opt_h $dirname
	    );

## Check command line
my $prog = basename($0);

# getopts (not getopt): -h is a plain flag and only -d takes a value.
# getopt('hd') would make -h consume the following argument as its value.
my $opts_ok = getopts('hd:');

if ($opt_h) {
    # Help was requested explicitly.
    usage($prog);
    exit();
}
elsif ($opts_ok && defined $opt_d && length $opt_d) {
    # Test for definedness/length so a directory literally named "0" works.
    $dirname = "$opt_d";
}
else {
    # Unknown switch or missing -d <directory>: report it as a failure.
    usage($prog);
    exit(1);
}

# Collect the names of all *.ccp files in the working directory.
# Stop with a message if the directory cannot be read, rather than
# silently producing no output.
opendir(my $dh, $dirname) or die "can't open directory $dirname: $!";

# readdir returns entries in no particular order; sort them so the order of
# the records written later is repeatable from run to run.
push(@ccp_files, sort grep { /\.ccp$/ } readdir($dh));

closedir($dh);

# Handle each *.ccp file that was found.  $filename is the package variable
# that the three subroutines below read, so it is used as the loop variable.
for $filename (@ccp_files) {
    get_input();     # load the file's lines
    position();      # locate the lower-case letters
    get_output();    # append the positions to the output file
}

# get input file function
#
# Reads "$dirname/$filename", discards its first line, and stores the first
# whitespace-separated field of every remaining line in the global @f (one
# entry per line; a blank line yields an undefined entry).  The number of
# entries is left in the global $size.
# Dies if the file cannot be opened.
sub get_input {
    my $path = "$dirname/$filename";

    # Start from an empty list so that $size counts only this file's lines,
    # not entries left over from a longer file read earlier.
    @f = ();

    # open the file (three-argument form: the name is never parsed for a mode)
    open(my $in, '<', $path) or die "can't open input $path: $!";

    # skip the first line
    my $header = <$in>;

    # read the lines up to the end of the file
    while (my $line = <$in>) {
        # split on white space (leading white space is ignored);
        # only the first field is kept
        my ($first) = split(" ", $line);
        push(@f, $first);
    }

    # close the file
    close($in);

    # get the number of elements in the array f
    $size = @f;
}

# get the position
#
# Joins the first $size entries of the global @f into one string ($se),
# blanking each entry as it is consumed, and records every maximal run of
# lower-case letters [a-z] found in that string:
#   $start[$i], $end[$i]   1-based, inclusive bounds of run $i
#   $count                 number of runs found
#   $length                length of the joined string
# @start and @end are emptied first, so they hold this string's runs only.
sub position {
    # connect the string
    $se = "";
    for my $j (0 .. $size - 1) {
        $se .= $f[$j] if defined $f[$j];
        # clear the array
        $f[$j] = "";
    }

    # get the length of string
    $length = length($se);

    # drop positions left over from a previously processed file
    @start = ();
    @end   = ();

    # Every match is one maximal lower-case run, so a run that touches the
    # first or the last character of the string still gets both bounds.
    while ($se =~ /[a-z]+/g) {
        push(@start, $-[0] + 1);    # $-[0] is the 0-based offset of the match
        push(@end,   $+[0]);        # $+[0] is one past the match = 1-based last char
    }

    $count = @start;
}

# get output file function
#
# Appends one line per recorded region to the file "output" in the current
# directory, formatted as "<id> \t <start> \t <end>".  <id> is $filename with
# a trailing ".aa.ccp" removed; note that $filename itself is modified.
# Reads the globals $count, @start and @end.
# Dies if "output" cannot be opened or if closing it reports an error.
sub get_output {
    # append and write the file
    open(my $out, '>>', 'output') or die "can't open output: $!";

    # debug: printf {$out} ("%s \t %s \t %d \t %d\n", $filename, $se, $length, $size);

    # Dots are escaped and the pattern is anchored so that only a literal
    # ".aa.ccp" suffix is stripped, never a look-alike inside the name.
    $filename =~ s/\.aa\.ccp\z//;

    for my $i (0 .. $count - 1) {
        printf {$out} ("%s \t %d \t %d\n", $filename, $start[$i], $end[$i]);
    }

    # buffered write errors only surface here
    close($out) or die "can't close output: $!";
}

## Normal end
# Every file found has been processed; leave with a success status.
exit(0);

## Usage display
#
# Prints the command-line synopsis to STDERR.
#   $p - program name to show in the synopsis
# Does not exit; the caller decides what happens next.
sub usage {
  my $p = shift;
  # The synopsis shows what the script actually reads: a required
  # -d <directory> and an optional -h; there is no positional argument.
  print STDERR <<"USAGE";
usage: $p [-h] -d <directory>

options:
    -h               Usage display.
    -d <directory>   Directory containing ccp files (*.ccp).
USAGE
}