#! /usr/local/bin/perl -w

# Copyright @ 2004 - 2014 The Institute for Genomic Research (TIGR).
# All rights reserved.
# 
# This software is provided "AS IS".  TIGR makes no warranties, express or
# implied, including no representation or warranty with respect to the
# performance of the software and derivatives or their safety,
# effectiveness, or commercial viability.  TIGR does not warrant the
# merchantability or fitness of the software and derivatives for any
# particular purpose, or that they may be exploited without infringing the
# copyrights, patent rights or property rights of others.
# 


use strict;
use TIGR::FASTAiterator;
use TIGR::Foundation;
use TIGR::FASTArecord;
use Statistics::Descriptive;
use Getopt::Long;
use POSIX;
$| = 1;

my $version = qq ~
Program:       BSR.pl v 1.0
Author:        Jacques Ravel, David Rasko, Garry Myers 
URL:           http://wwww.microbialgenomics.org/BSR/
Creation Date: 2004/10/04
Last Revision: October 13h, 2004
Revision:      4.0
~;


### DEFINE VARIABLES
my $tf = new TIGR::Foundation;
my ($coords_q1, $coords_ref, $coords_q2);
my %hashQ1 = ();
my %hashQ2 = ();
my %hashR = ();

my $query; #### .pep file for query
my $query2; #### .pep file for query2

my @filedata = ();
my $reference;   ### .pep file for reference
my ($fh, $record);
my ($gnuout, $gnu_gp,$gnu_gp1,$gnu_gp2,$gnu_ps,$gnu_ps1,$gnu_ps2,$gnu_fig,$gnu_fig1,$gnu_fig2, $gnudat, $gnuout1, $ggobi1, $ggobidat, $ggobi2, $ggobi3, $xgg2, $xgg3);
my ($score, $scoreQ1, $scoreQ2);
my ($xrange, $yrange1, $yrange2);
my ($r1, $r2);
my ($orf,$orfQ1,$orfQ2);
my $tab;
my @tab;
my @temp2;

my $color;
my @dat1;
my @dat2;
my @dat3;
my @gg2;
my @gg3;

my ($colorQ1, $colorGQ1, $colorGQ2, $colorQ2, $rx, $qy1, $qy2, $valR, $valQ1, $valQ2);
my (@tempR, @tempQ1, @tempQ2) = ();
my (@gp1, @gp2, @gp3, @gp4, @gp5, @gp6, @gp7, @gp8, @gp9) = (); 
my ($pt, $anno1, $anno2, $output2, $output3, $output4, $output5, $output6, $output);
my @stats1;
my @stats2;

my $help;
my $vers;

#---------------------------------------------------------------------------
# Set up command line arguments
#---------------------------------------------------------------------------

Getopt::Long::config("no_ignore_case");

my $result = GetOptions ("h|help"     => \$help,
			 "V|version" =>  \$vers,
			 "R=s"        => \$reference,
			 "Q2=s"       => \$query2,
			 "Q1=s"        => \$query);

if ($result == 0) {
    die ("Command line parsing failed\n")
    }

if ($help) {
    exec("perldoc $0");
}

if ($vers) {
    print $version;
    exit(0);
}

if (! defined $reference) {
    print STDERR "You must enter a reference file\n";
    exit(0);
}
if (! defined $query) {
    print STDERR "You must specify a query file name\n";
    exit(0);
}

if (! defined $query2) {
    print STDERR "You must specify a second query file name\n";
    exit(0);
}

if (!defined $tf){
    die ("Bad foundation\n");
}

my $start = time;

#############################################################################
###     SET-UP FOR OUTPUT AND COORDS FILE
#############################################################################

## grab the name of the input file without the extension
my ($ref_l) = ($reference =~ /^(.*)\..*$/);
my ($que1_l) = ($query =~ /^(.*)\..*$/);
my ($que2_l) = ($query2 =~ /^(.*)\..*$/);

## create output directory

my $dir = $ref_l ."_".$que1_l."_".$que2_l.".dir";

unless (-e $dir) {
mkdir ($dir) || die "Cannot create $dir: $!";
}

## create output file name for tab delimited, db2circle and xgraph

$output =  $ref_l ."_".$que1_l."_".$que2_l.".txt";
$output2 = $ref_l ."_only".".group";
$output3 = $ref_l ."_".$que1_l."_".$que2_l.".group";
$output4 = $ref_l ."_".$que2_l."_only.group";
$output5 = $ref_l ."_".$que1_l."_only.group";

$coords_q1 = $que1_l.".coords";
$coords_ref = $ref_l.".coords";
$coords_q2 = $que2_l.".coords";

### GNUPLOT OUTPUT

$gnuout1 = $ref_l . "_".$que1_l."_".$que2_l. ".plot"; ## all data in columns 
open (GNU, ">$dir\/$gnuout1");

#$gnuout = $ref_l ."_".$que1_l."_".$que2_l.".plot";

$gnu_gp = $que1_l . "_" .$que2_l .".gp"; ## gnuplot files BSR
open (GNUG, ">$dir\/$gnu_gp");
$gnu_ps =  $que1_l . "_" .$que2_l .".ps";
$gnu_fig =  $que1_l . "_" .$que2_l .".fig";

$gnu_gp1 = $ref_l ."_" . $que1_l . ".gp"; ## gnuplot files SYNTENY
open (GNUG1, ">$dir\/$gnu_gp1");
$gnu_ps1 = $ref_l ."_" . $que1_l . ".ps";
$gnu_fig1 = $ref_l ."_" . $que1_l . ".fig";

$gnu_gp2 = $ref_l . "_" .$que2_l .".gp"; ## gnuplot files SYNTENY
open (GNUG2, ">$dir\/$gnu_gp2");
$gnu_ps2 = $ref_l . "_" .$que2_l .".ps";
$gnu_fig2 = $ref_l . "_" .$que2_l .".fig";

#### GGOBI OUTPUT

$ggobi1 = $que1_l . "_" .$que2_l .".xml"; ## ggobi files BSR
open (GGOBI, ">$dir\/$ggobi1");

$ggobi2 = $ref_l . "_" . $que1_l . ".xml";
open (GGOBI1, ">$dir\/$ggobi2");

$ggobi3 = $ref_l . "_" . $que2_l . ".xml";
open (GGOBI2, ">$dir\/$ggobi3");


#### make bin directory with .awk files

&print_awk($dir);


#################################################################
### GET SIZE RANGE ####
#################################################################

my $ref_range = get_range($coords_ref);

my $que1_range = get_range($coords_q1);

my $que2_range = get_range($coords_q2);

### Graphical output for gnuplot: Height and length of boxes
## for Synteny plot
my $plot_size1 = ($ref_range / 1000000) * 2000;
my $plot_size2 = ($ref_range / 1000000) * 3000;

## for B3W plot
my $plot_size3 = ($ref_range / 1000000) * (2/ 1000);
my $plot_size4 = ($ref_range / 1000000) * (3/ 1000);

##################################################################
###           Print GNUPLOT control files                     ####
##################################################################
print GNUG qq~set terminal X11 
set nokey
set boxwidth 4
set grid lt 0 lw 0.1
set xlabel \"$que1_l\"
set ylabel \"$que2_l\"
set xrange [0:1]
set yrange [0:1]
set palette defined ( 1 \"red\", 2 \"dark-blue\", 3 \"orange\", 4 \"dark-green\" )
set title "$que1_l and $que2_l vs $ref_l"
unset colorbox
set pm3d map
splot '<awk -f bin/Q1Q2.awk $gnuout1 $plot_size3 $plot_size4'
pause -1 \"Hit return to continue\"
set terminal postscript color
set output "$gnu_ps"
set nokey
set boxwidth 4
set grid lt 0 lw 0.1
set xlabel \"$que1_l\"
set ylabel \"$que2_l\"
set xrange [0:1]
set yrange [1:0]
set palette defined ( 1 \"red\", 2 \"dark-blue\", 3 \"orange\", 4 \"dark-green\" )
set title "$que1_l and $que2_l vs $ref_l"
unset colorbox
set pm3d map
splot '<awk -f bin/Q1Q2.awk $gnuout1 $plot_size3 $plot_size4'
set terminal fig color landscape big
set output "$gnu_fig"
set nokey
set boxwidth 4
set grid lt 0 lw 0.1
set xlabel \"$que1_l\"
set ylabel \"$que2_l\"
set xrange [0:1]
set yrange [1:0]
set palette defined ( 1 \"red\", 2 \"dark-blue\", 3 \"orange\", 4 \"dark-green\" )
set title "$que1_l and $que2_l vs $ref_l"
unset colorbox
set pm3d map
splot '<awk -f bin/Q1Q2.awk $gnuout1 $plot_size3 $plot_size4'
~;
print GNUG1 qq~
set terminal X11 
set nokey
set boxwidth 4
set grid lt 0 lw 0.1
set format x "%.f"
set format y "%.f"
set xlabel \"$ref_l\"
set ylabel \"$que1_l\"
set xrange [0:$ref_range]
set yrange [0:$que1_range]
set title "$ref_l vs $que1_l synteny plot"
set cbrange [0:1]
set cblabel "less similarity to more similarity"
set pm3d map
splot '<awk -f bin/RQ1.awk $gnuout1 $plot_size1 $plot_size2'
pause -1 \"Hit return to continue\"
set terminal postscript color
set output "$gnu_ps1"
set nokey
set boxwidth 4
set grid lt 0 lw 0.1
set format x "%.f"
set format y "%.f"
set xlabel \"$ref_l\"
set ylabel \"$que1_l\"
set xrange [0:$ref_range]
set yrange [$que1_range:0]
set title "$ref_l vs $que1_l synteny plot"
set cbrange [0:1]
set cblabel "less similarity to more similarity"
set pm3d map
splot '<awk -f bin/RQ1.awk $gnuout1 $plot_size1 $plot_size2'
set terminal fig color landscape big
set output "$gnu_fig1"
set nokey
set boxwidth 4
set grid lt 0 lw 0.1
set format x "%.f"
set format y "%.f"
set xlabel \"$ref_l\"
set ylabel \"$que1_l\"
set xrange [0:$ref_range]
set yrange [$que1_range:0]
set title "$ref_l vs $que1_l synteny plot"
set cbrange [0:1]
set cblabel "less similarity to more similarity"
set pm3d map
splot '<awk -f bin/RQ1.awk $gnuout1 $plot_size1 $plot_size2'
~;
print GNUG2 qq~
set terminal X11 
set nokey
set boxwidth 4
set grid lt 0 lw 0.1
set format x "%.f"
set format y "%.f"
set xlabel \"$ref_l\"
set ylabel \"$que2_l\"
set xrange [0:$ref_range]
set yrange [0:$que2_range]
set title "$ref_l vs $que2_l synteny plot"
set cbrange [0:1]
set cblabel "less similarity to more similarity"
set pm3d map
splot '<awk -f bin/RQ2.awk $gnuout1 $plot_size1 $plot_size2'
pause -1 \"Hit return to continue\"
set terminal postscript color
set output "$gnu_ps2"
set nokey
set boxwidth 4
set grid lt 0 lw 0.1
set format x "%.f"
set format y "%.f"
set xlabel \"$ref_l\"
set ylabel \"$que2_l\"
set xrange [0:$ref_range]
set yrange [$que2_range:0]
set title "$ref_l vs $que2_l synteny plot"
set cbrange [0:1]
set cblabel "less similarity to more similarity"
set pm3d map
splot '<awk -f bin/RQ2.awk $gnuout1 $plot_size1 $plot_size2'
set terminal fig color landscape big
set output "$gnu_fig2"
set nokey
set boxwidth 4
set grid lt 0 lw 0.1
set format x "%.f"
set format y "%.f"
set xlabel \"$ref_l\"
set ylabel \"$que2_l\"
set xrange [0:$ref_range]
set yrange [$que2_range:0]
set title "$ref_l vs $que2_l synteny plot"
set cbrange [0:1]
set cblabel "less similarity to more similarity"
set pm3d map
splot '<awk -f bin/RQ2.awk $gnuout1 $plot_size1 $plot_size2'
~;

##################################################################
###           Print GNUPLOT control files                     ####
##################################################################

## get the total of entry in the reference file
open (COUNT, "grep -c \">\" $reference |");
my @ctp = <COUNT>;
chomp $ctp[0];

print GGOBI qq~<?xml version="1.0"?>
<!DOCTYPE ggobidata SYSTEM "ggobi.dtd">

<ggobidata>
<brush color=\"2\" glyph=\"fc 3\" />
<data name=\"$que1_l and $que2_l vs $ref_l\">
<description>
Blast3way data for $que1_l and $que2_l when compared to $ref_l
</description>
<variables count=\"4\">
  <realvariable name=\"$que1_l\" nickname=\"g1\">
  <description>Genome query 1
  </description>
  </realvariable>
  <realvariable name=\"$que2_l\" nickname=\"g2\" />
  <description>Genome query 2
  </description>
  <realvariable name=\"End5\" nickname=\"coords\">
  <description>Reference Genome coordinate
  </description>
  </realvariable>
  <categoricalvariable name=\"CATEGORIE\">
    <levels count=\"1\">
      <level value=\"1\"> GENOME PEP </level>
    </levels>
  </categoricalvariable>
</variables>
<records count=\"$ctp[0]\" color=\"2\" glyph=\"fc 0\" missingValue=\"NA\">
~;

print GGOBI1 qq~<?xml version="1.0"?>
<!DOCTYPE ggobidata SYSTEM "ggobi.dtd">

<ggobidata>
<activeColorScheme name="Spectral 10"/>
<brush color=\"9\" glyph=\"fc 3\" />
<data name=\"Synteny plot for $que1_l and $ref_l\">
<description>
Synteny Plot for $que1_l and $ref_l 
</description>
<variables count=\"3\">
  <realvariable name=\"$ref_l\" nickname=\"ref\">
  <description>Genome reference
  </description>
  </realvariable>
  <realvariable name=\"$que1_l\" nickname=\"query\" />
  <description>Genome query 
  </description>
  <categoricalvariable name=\"CATEGORIE\">
    <levels count=\"1\">
      <level value=\"1\"> GENOME PEP </level>
    </levels>
  </categoricalvariable>
</variables>
~;

print GGOBI2 qq~<?xml version="1.0"?>
<!DOCTYPE ggobidata SYSTEM "ggobi.dtd">

<ggobidata>
<activeColorScheme name="Spectral 10"/>
<brush color=\"9\" glyph=\"fc 3\" />
<data name=\"Synteny plot for $que2_l and $ref_l\">
<description>
Synteny Plot for $que2_l and $ref_l 
</description>
<variables count=\"3\">
  <realvariable name=\"$ref_l\" nickname=\"ref\">
  <description>Genome reference
  </description>
  </realvariable>
  <realvariable name=\"$que2_l\" nickname=\"query\" />
  <description>Genome query 
  </description>
  <categoricalvariable name=\"CATEGORIE\">
    <levels count=\"1\">
      <level value=\"1\"> GENOME PEP </level>
    </levels>
  </categoricalvariable>
</variables>
~;

#############################################################################
###     MAKE HASH OF COORDS FILES
#############################################################################


###### THE REFERENCE FILE

%hashR = make_hash($coords_ref);

###### QUERY 1

%hashQ1 = make_hash($coords_q1);

###### QUERY2 

%hashQ2 = make_hash($coords_q2);


#############################################################################
#############################################################################
###     3WAY BLAST
#############################################################################
#############################################################################



my @errors;

my $fr = new TIGR::FASTAiterator($tf, \@errors, $reference);

if (!defined $fr){
    die ("Bad reader\n");
}

### GETTING THE BLAST SCORE FROM BLAST

my $y = 0;
my $mt = 0;

my $gg1_ct = 0;
my $gg2_ct = 0;

print "\nPerforming Blast Score Ratio Analysis...\n\n";

while ($fr->hasNext){

    ## COUNTER DISPLAY
    $mt ++;
    if ($mt%100 == 0) {
	print $mt . "..";
    }
    if ($mt%1000 == 0) {
	print "\n";
    }
    
    ##################################
    my $rec = $fr->next();
    my $head = $rec->getHeader();
    my $body = $rec->getData();
    $record =  "$head\n". print_sequence2($body, 60);
    $y++;
    
    ## Make a temp file of the protein sequence to blast
    
    open(TEMPN, ">temp.fasta");
    print TEMPN $record;
    
    ############################
    ### GET THE BEST_HIT VALUES#
    ############################

    ($score, $orf) = blast_log($reference);
    ($scoreQ1, $orfQ1) = blast_log($query);
    ($scoreQ2, $orfQ2) = blast_log($query2);
    
    ###################################
    ## CALCULATE THE BLAST SCORE RATIO#
    ###################################
    
    ## if both gene have no blast hit (NBH), then ratio should be 0 and not 1

    if ($orfQ1 eq "NBH") {
	$r1 = 0.0001;
    }else {
	$r1 = $scoreQ1/$score;
    }
    if ($orfQ2 eq "NBH") {
	$r2 = 0.0001;
    }else {
	$r2 = $scoreQ2/$score;
    }
    
    push(@stats1, $r1);
    push(@stats2, $r2);

    my $value = $hashR{$orf};
    @temp2 = split("\t", $value);
    chomp $temp2[2];

    ### GNUPLOT OUTPUT 
    $gnudat = make_3way2($r1, $r2); ##first three column gnuplot (blast3way)
    chomp $gnudat;
    
    $valR = $hashR{$orf};
    @tempR = split("\t", $valR);
    $rx = $tempR[0];
    
    ### GGOBI OUTPUT
    $ggobidat = make_3wayggobi($r1, $r2, , $rx, $temp2[2]);
    print GGOBI $ggobidat;


    ### BTAB OUTPUT
    $tab = "$orf\t$score\t$orfQ1\t$scoreQ1\t$r1\t$orfQ2\t$scoreQ2\t$r2\t$temp2[2]\n";
    push(@tab, $tab);
    
    ### FOR OUTPUT IN GNUPLOT COLOR
        
    
    
    if ($orfQ1 eq "NBH") {
	$gnudat .= "\t \t \t ";
	$xgg2 = "";

    }else {

	$valQ1 = $hashQ1{$orfQ1};
	@tempQ1 = split("\t", $valQ1);
	$qy1 = $tempQ1[0];
	chomp $tempQ1[2];
	$gnudat .= "\t$rx\t$qy1\t$r1"; 
	$xgg2 = make_xgg($r1, $temp2[2], $tempQ1[2], $rx, $qy1);
	$gg1_ct++;

    }
    push(@gg2, $xgg2);

    if ($orfQ2 eq "NBH") {
	$gnudat .= "\t \t \t \n";
    }else{

	$valQ2 = $hashQ2{$orfQ2};
	
	@tempQ2 = split("\t", $valQ2);
	$qy2 = $tempQ2[0];
	$gnudat .= "\t$rx\t$qy2\t$r2\n"; 
	$xgg3 = make_xgg($r2, $temp2[2], $tempQ2[2], $rx, $qy2);
	$gg2_ct++;
    }
    push(@gg3, $xgg3);

    print GNU $gnudat;
    

    #############################################################################
    ### OUTPUT FOR THREE WAY GRAPH GROUPS
    #############################################################################
    
    $pt = make_parse($r1, $r2);
    
     if ($pt == 1) {
 	push(@gp1, $tab);
     }elsif ($pt == 2) {
 	push(@gp2, $tab);
     }elsif ($pt == 3) {
 	push(@gp3, $tab);
     }elsif ($pt == 4) {
 	push(@gp4, $tab);
     }
}

#############################################################################
###          PRINT OUTPUT FILES
#############################################################################

print GGOBI1 "<records count=\"$gg1_ct\" color=\"2\" glyph=\"fc 0\" missingValue=\"NA\">";
print GGOBI2 "<records count=\"$gg2_ct\" color=\"2\" glyph=\"fc 0\" missingValue=\"NA\">";


foreach my $line (@gg2) {
    print GGOBI1 $line;
}

foreach my $line (@gg3) {
    print GGOBI2 $line;
}

print GGOBI qq~<\/records>
<\/data>
<\/ggobidata>
~;

print GGOBI1 qq~<\/records>
<\/data>
<\/ggobidata>
~;

print GGOBI2 qq~<\/records>
<\/data>
<\/ggobidata>
~;

open(OUT, ">$dir\/$output");
foreach my $line (@tab) {
    print OUT $line;
}

close (OUT);


### PRINT OUTPUT FOR GROUPING

open (GP2, ">$dir\/$output2");
open (GP3, ">$dir\/$output3");
open (GP4, ">$dir\/$output4");
open (GP5, ">$dir\/$output5");

print GP2 "\n\nGROUP1: ONLY FOUND ON $ref_l - BOTH $que1_l and $que2_l < 0.4\n";
foreach my $line (@gp1) {
    print GP2 $line;
}
print GP3 "\n\nGROUP2: FOUND in  $ref_l and $que1_l and $que2_l - BOTH $que1_l and $que2_l  > 0.4\n";
foreach my $line (@gp2) {
    print GP3 $line;
}
print GP4 "\n\nGROUP3: FOUND ONLY  in $ref_l and $que2_l  -  $que2_l > 0.4 and $que1_l < 0.4 \n";
foreach my $line (@gp3) {
    print GP4 $line;
}
print GP5 "\n\nGROUP4: FOUND ONLY  in $ref_l and $que1_l  -  $que1_l > 0.4 and $que2_l < 0.4 \n";
foreach my $line (@gp4) {
    print GP5 $line;
}


close (GP2);
close (GP3);
close (GP4);
close (GP5);
close (GNUG);
close (GNUG1);
close (GNUG2);
close (GGOBI);
close (GGOBI1);
close (GGOBI2);


### TRANSFORM FILE FOR GNUPLOT

chdir ($dir)  || die "Cannot cd to $dir: $!";

#get_circle ($gnuout1, $gnuout);

my $end = time;

my $last = $end - $start;

my $time = int ($last / 60) ." minutes " .  $last%60;
print "\n\nElapsed time $time seconds\n\n\n";

chdir ("../") || die "Cannot cd to ../";


######################## STATISTICS ANALYSIS ########################


 my $stat1 = Statistics::Descriptive::Full->new();
 my $stat2 = Statistics::Descriptive::Full->new();
 $stat1->add_data(@stats1); 
 $stat2->add_data(@stats2);
 my $median1 = $stat1->median();
 my $median2 = $stat2->median();
 my $variance1 = $stat1->variance();
 my $variance2 = $stat2->variance();
 my $std1 = $stat1->standard_deviation();
 my $std2 = $stat2->standard_deviation();
 my $mean1 = $stat1->mean();
 my $mean2 = $stat2->mean();
 my $count1 = $stat1->count();
 my $count2 = $stat2->count();

my @mad1;
my @mad2;
my $absmed1;
my $absmed2;

foreach my $med1 (@stats1) {
    chomp $med1;
    $absmed1 = abs($med1 - $median1);
    push (@mad1, $absmed1);
}

foreach my $med2 (@stats2) {
    chomp $med2;
    $absmed2 = abs($med2 - $median2);
    push (@mad2, $absmed2);
}

my $statm1 = Statistics::Descriptive::Full->new();
my $statm2 = Statistics::Descriptive::Full->new();
$statm1->add_data(@mad1); 
$statm2->add_data(@mad2);
my $medabs1 = $statm1->median() * 1.4826;
my $medabs2 = $statm2->median() * 1.4826;

$medabs1 = sprintf ("%.4f", $medabs1);
$medabs2 = sprintf ("%.4f", $medabs2);
$median1 = sprintf ("%.4f", $median1);
$median2 = sprintf ("%.4f", $median2);
$variance1 = sprintf ("%.4f",$variance1);
$variance2 = sprintf ("%.4f",$variance2);
$std1 = sprintf ("%.4f",$std1);
$std2 = sprintf ("%.4f",$std2);
$mean1 = sprintf ("%.4f",$mean1);
$mean2 = sprintf ("%.4f",$mean2);
$count1 = sprintf ("%.f",$count1);
$count2 = sprintf ("%.f",$count2);

my $a1 = "PROTEIN ANALYZED";
my $t = "MEDIAN";
my $g = "MEAN";
my $c = "STD DEVIATION";
my $at = "VARIANCE";
my $cg = "MED ABS DEVIATION";
my $spce = "-";
format STDOUT_TOP =
              STATISTICAL ANALYSIS OF THE OVERALL BSR DATA
        @||||||||||||||||  @||||||||||||||||| @|||||||||||||||||
        $spce,             $que1_l,           $que2_l,
        --------------------------------------------------------
.


format STDOUT =
        @||||||||||||||||  @||||||||||||||||| @|||||||||||||||||
        $a1,                $count1,           $count2,     
        --------------------------------------------------------
        @||||||||||||||||  @||||||||||||||||| @||||||||||||||||| 
        $t,                $median1,          $median2,  
        --------------------------------------------------------
        @||||||||||||||||  @||||||||||||||||| @|||||||||||||||||
        $g,                $mean1,            $mean2,    
        --------------------------------------------------------
        @||||||||||||||||  @||||||||||||||||| @|||||||||||||||||
        $c,                $std1,             $std2,    
        --------------------------------------------------------
        @||||||||||||||||  @||||||||||||||||| @|||||||||||||||||
        $at,               $variance1,        $variance2,    
        --------------------------------------------------------
        @||||||||||||||||  @||||||||||||||||| @||||||||||||||||| 
        $cg,               $medabs1,          $medabs2,     
        --------------------------------------------------------
.

    write;



exit;

##################################################################
###  SUBROUTINE
##################################################################

# A Subroutine to Read Files into array
sub get_file_data {

    my($filename) = @_;

    use strict;
    use warnings;

    # Initialize variables
    my @filedata = (  );

    unless( open(GET_FILE_DATA, $filename) ) {
        print STDERR "Cannot open file \"$filename\"\n\n";
        exit;
    }

    @filedata = <GET_FILE_DATA>;

    close GET_FILE_DATA;

    return @filedata;
}

sub print_sequence2 {

    my($sequence, $length) = @_;

    use strict;
    
    my $seq2;

    # Print sequence in lines of $length
    for ( my $pos = 0 ; $pos < length($sequence) ; $pos += $length ) {
        $seq2 .= substr($sequence, $pos, $length) . "\n";
    }

    return $seq2;

}



sub make_hash {

    my ($coords) = @_;
    my @coord;
    my $orf;
    my $end5;
    my $end3;
    my $annotation;
    my @field;
    my %hash = ();
  

#### READ THE COORDS FILE INTO A ARRAY

    @coord = get_file_data($coords);

#### CREAT A HASH OF THE COORDS FILE
#### ORF# = KEYS 
#### VALUES = END3\tEND5 

    foreach my $line (@coord) {

	chomp $line;

	## SPLIT EACH FIELD

	@field = split("::", $line);

	$orf = $field[0];
	
	$end5 = $field[1];
	
	$end3 = $field[2];
	
	if (! defined $field[3]) {
	    $field[3] = " ";
	}
	
	$annotation = $field[3];
	chomp $annotation;

	### CREATE THE HASH

	$hash{$orf} = "$end5\t$end3\t$annotation";
#	print "THIS: $orf\t$end5\t$end3\t$annotation\n";
	
    }
    
    return(%hash);

}

sub get_range {

    my ($coords) = @_;
    my @coord;
    my @field;
    my $end5;
    my @range=();
    my $hrange;

    @coord = get_file_data($coords);

    foreach my $line (@coord) {

	chomp $line;

	## SPLIT EACH FIELD

	@field = split("::", $line);

	$end5 = $field[1];

	push (@range, $end5);

    }

    my @srange = sort {$a <=> $b} @range;

    $hrange = pop(@srange);
    $hrange += 5000;

    return ($hrange);
}


sub blast_log {


    my ($query) = @_;
    my @btab;
    my @btab1;
    my @temp;
    my $score;
    my $orf;
   
    system("blastall -F F -p blastp -d $query -i temp.fasta -m 8 > temp.btab");
	
    ## Open the btab output

    @btab1 = get_file_data("temp.btab");

    if (not defined $btab1[0]) {
	
	$score = 1;
	$orf = "NBH";

	return($score, $orf);
	    
    }

 #   print "$btab1[0]\n";

    ## split the btab
	
    @temp = split("\t", $btab1[0]);

    chomp $temp[11];

   
   

    $orf = $temp[1];
    $score = $temp[11];
    
    return ($score, $orf);
    
}

sub make_3way2 {

    my ($score1, $score2) = @_;
    my $gnuline;
    my $label;


if ( $score1 > 0.4 && $score2 > 0.4){
    $label = "1";
}
elsif ( $score1 > 0.4 && $score2 < 0.4){
    $label = "2";
}
elsif ( $score1 < 0.4 && $score2 < 0.4){
    $label = "3";
}
elsif ( $score1 < 0.4 && $score2 > 0.4){
    $label = "4";
}

        $gnuline = "$score1\t$score2\t$label\n";

    return($gnuline);

}

sub make_3wayggobi {

    my ($score1, $score2, $coords, $annot1) = @_;
    my $ggobiline;
    my $label;

if ( $score1 > 0.4 && $score2 > 0.4){
    $label = "1";
}
elsif ( $score1 > 0.4 && $score2 < 0.4){
    $label = "2";
}
elsif ( $score1 < 0.4 && $score2 < 0.4){
    $label = "0";
}
elsif ( $score1 < 0.4 && $score2 > 0.4){
    $label = "3";
}

        $ggobiline = "\<record label=\"$annot1\" color=\"$label\"\>\n$score1\t$score2\t$coords\n\<\/record\>\n";

    return($ggobiline);

}

sub make_parse {

    my ($s1, $s2) = @_;
    my $pt;

    if ($s1 < 0.4 && $s2 < 0.4) {
	$pt = 1;
    }elsif ( $s1 > 0.4 && $s2 > 0.4) {
	$pt = 2;
    }elsif ( $s1 < 0.4 && $s2 > 0.4) {
	$pt = 3;
    }elsif ($s1 > 0.4 && $s2 < 0.4) {
	$pt = 4;
    }

    return ($pt);
}


sub make_xgg {


    my ($score1,$annot1,$annot2, $rx, $qx) = @_;
    my $color;
    
### COLOR CODING: PARSE THE HIT SCORE
	
    if ($score1 >= 0.95) {
	
	$color = "0"; #red
	
    }elsif ($score1 >= 0.90 && $score1 < 0.95) {
	
	$color = "1"; #blue
	
    }elsif ($score1 >= 0.80 && $score1 < 0.90) {
	
	$color = "2"; #green
	
    }elsif ($score1 >= 0.70 && $score1 < 0.80) {
	
	$color = "3"; ##orange
	
    }elsif ($score1 >= 0.60 && $score1 < 0.70) {
	
	$color = "4"; ## cyan
 	
    }elsif ($score1 >= 0.50 && $score1 < 0.60) {
	
	$color = "5";  ## blue
	
    }elsif ($score1 >= 0.40 && $score1 < 0.50) {
	
	$color = "6";  ## blue
	
    }elsif ($score1 >= 0.30 && $score1 < 0.40) {
	
	$color = "7";  ## blue
	
    }elsif ($score1 >= 0.20 && $score1 < 0.30) {
	
	$color = "8";  ## blue
	
    }elsif ($score1 >= 0 && $score1 < 0.20) {
	
	$color = "9";  ## blue
	
    }
   
    my $ggsynteny = "\<record id=\"$annot2\" label=\"$annot1 ($score1)\" color=\"$color\"\>\n$rx\t$qx\n\<\/record\>\n";
    return($ggsynteny);
}


sub print_awk {

    my $outdir = shift;

    chdir ($outdir) || die "Cannot cd to $outdir: $!";

    my $dir = "bin";

    unless (-e $dir) {
	mkdir ($dir) || die "Cannot mkdir $dir: $!";
    }
    
    chdir ($dir) || die "Cannot cd to $dir: $!";

    open (A1, ">Q1Q2.awk");

    print A1 qq~
# Script for transforming points (x y z) into "rectangular surface".
# Can be used to draw colour points in gnuplot with the pm3d enhancement

BEGIN {
if (ARGC<4) {
  print "Syntax:  awk -f colorpts.awk file dx dy" >"/dev/stderr"
  print "\\nWhere 2*dx, 2*dy defines the point size (enlargement) in x, y" >"/dev/stderr"
  print "Gnuplot usage: set pm3d map; splot '<awk -f colorpts.awk pts.dat 0.01 0.01'" >"/dev/stderr"
  exit
}
dx = ARGV[2]
dy = ARGV[3]
ARGC = 2
}

# skip blank lines
NF==0 { next }

# main
{
x=\$1; y=\$2
print x-dx "\\t" y-dy "\\t" \$3
print x-dx "\\t" y+dy "\\t" \$3
printf "\\n" # blank line (new scan)
print x+dx "\\t" y-dy "\\t" \$3
print x+dx "\\t" y+dy "\\t" \$3
printf "\\n\\n" # two blank lines (new surface)
}

~;

close (A1);


open (A2, ">RQ1.awk");

    print A2 qq~# Script for transforming points (x y z) into "rectangular surface".
# Can be used to draw colour points in gnuplot with the pm3d enhancement

BEGIN {
if (ARGC<4) {
  print "Syntax:  awk -f colorpts.awk file dx dy" >"/dev/stderr"
  print "\\nWhere 2*dx, 2*dy defines the point size (enlargement) in x, y" >"/dev/stderr"
  print "Gnuplot usage: set pm3d map; splot '<awk -f colorpts.awk pts.dat 0.01 0.01'" >"/dev/stderr"
  exit
}
dx = ARGV[2]
dy = ARGV[3]
ARGC = 2
}

# skip blank lines
NF==0 { next }

# main
{
x=\$4; y=\$5
print x-dx "\\t" y-dy "\\t" \$6
print x-dx "\\t" y+dy "\\t" \$6
printf "\\n" # blank line (new scan)
print x+dx "\\t" y-dy "\\t" \$6
print x+dx "\\t" y+dy "\\t" \$6
printf "\\n\\n" # two blank lines (new surface)
}

~;


close (A2);



open (A3, ">RQ2.awk");

    print A3 qq~# Script for transforming points (x y z) into "rectangular surface".
# Can be used to draw colour points in gnuplot with the pm3d enhancement

BEGIN {
if (ARGC<4) {
  print "Syntax:  awk -f colorpts.awk file dx dy" >"/dev/stderr"
  print "\\nWhere 2*dx, 2*dy defines the point size (enlargement) in x, y" >"/dev/stderr"
  print "Gnuplot usage: set pm3d map; splot '<awk -f colorpts.awk pts.dat 0.01 0.01'" >"/dev/stderr"
  exit
}
dx = ARGV[2]
dy = ARGV[3]
ARGC = 2
}

# skip blank lines
NF==0 { next }

# main
{
x=\$7; y=\$8
print x-dx "\\t" y-dy "\\t" \$9
print x-dx "\\t" y+dy "\\t" \$9
printf "\\n" # blank line (new scan)
print x+dx "\\t" y-dy "\t" \$9
print x+dx "\\t" y+dy "\\t" \$9
printf "\\n\\n" # two blank lines (new surface)
}


~;

close (A3);


    chdir ("../") || die "Cannot cd to ../";

    chdir ("../") || die "Cannot cd to ../";
}


#---------------------------------------------------------------------------
# Versioning subroutine
#---------------------------------------------------------------------------

sub version {
    my ($date) = "\$Date$_";
    my ($rvsn) = "\$Revision$_";
    my ($rcsd) = "\$Id$_";

    $date =~ s/(.*: +)(.*?)(\s*$)/$2/g;
    $rvsn =~ s/(.*: +)(.*?)(\s*$)/$2/g;
    $rcsd =~ s/(.*: +)(.*?)(\s*$)/$2/g;

    print <<EOF;

Program:       blast3way.pl v$rvsn

Author:        Jacques Ravel 
URL:           http://www.microbialgenomics.org/BSR
Creation Date: 2004/10/01
Last Revision: $date GMT
Revision:      $rcsd

EOF
    exit;
}

#---------------------------------------------------------------------------
# POD info
#---------------------------------------------------------------------------
##
## Use "perldoc blast3way.pl" to read the man page below.
#
__END__

=head1 NAME

B<BSR.pl> - Three genome comparison visualization tool


=head1 SYNOPSIS 

B<BSR.pl> 
S<[ B<-R reference.pep -Q query1.pep -Q2 query2.pep> ]> 


=head1 DESCRIPTION

B<BSR.pl> compares a reference proteome and two query proteome. It Blasts each reference protein against itself, record the BLAST score. Blast each query protein against the reference and record the BLAST score. The BLAST SCORE RATIO is calculated by dividing each Query BLAST score by the reference BLAST Score. BLAST SCORE RATIOS are plotted as (x,y) coordinates. Synteny plots are generated by plotted the position of the hit in the second genome, and coloring the point with according to the BLAST SCORE RATIOS. The script make use of gnuplot and ggobi to visualize the output.


=head1 QUICK START

The usage is as follows:

B<BSR.pl> 
S<[ B<-R reference.pep -Q query1.pep -Q2 query2.pep> ]> 

=head1 STANDARD


B<-h --help>  Prints this information.

B<-v --version>  Prints version information.

B<-R > Reference genome peptide file in fasta format

B<-Q > Query genome 1 peptide file in fasta format

B<-Q2 > Query genome 2 peptide file in fasta format


=head1 EXAMPLE


B<BSR.pl -R> caviae.pep B<-Q> muridarum.pep B<-Q2> pneumoniae.pep


=head1 OUTPUTS

Multiple output are generated:

All output files are written into a directory named:

B<reference_query1_query2.txt>


B<A. Text files> 

B<1. reference_query1_query2.txt>

ORF00003        469.5   ORFB01531       449.1   0.956549520766773       14579ORF2383_1439332_1438649_RZC00098   468.0   0.996805111821086       hypothetical protein
ORF00004        414.5   ORFB01530       375.2   0.90518697225573        14579ORF2382_1437867_1438478_RZC07567   408.7   0.986007237635706       peptidase, M23/M37 family
ORF00005        376.3   ORFB01528       344.4   0.915227212330587       14579ORF2381_1437201_1437779_RZC05647   372.5   0.98990167419612        ATP:cob(I)alamin adenosyltransferase, putative
ORF00006        1137.5  ORFB01527       1096.6  0.964043956043956       14579ORF2380_1435289_1437061_RZC00162   1117.4  0.98232967032967        sensor histidine kinase ResE

B<2. reference_query1_query2.gp>

The different ORFs are separated in categories based on their normalized score. 

B<B. Visualization with gnuplot> 

B<1.BSR Analysis>
 
B<gnuplot query1_qery2.gp>

a postscript file is also created with the name query1_query2.ps

B<2.Synteny plot>

B<gnuplot reference_query1.gp>

or

B<gnuplot reference_query2.gp>

As for the BSR Analysis plot a postscript file is also generated with the extension .ps

B<C. Visualization with ggobi>

B<1. BSR Analysis>

B<ggobi query1_qery2.xml>

B<1. Synteny Analysis>

B<ggobi reference_query1.xml>

B<ggobi reference_query2.xml>


To visualize the BSR plot use the B<ViewMode> menu the PROJECTION MODE B<XYplot> and to view the annotation upon mouse-over of the points use INTERACTION MODE B<identity>. 
Please see the ggobi manual for more details. ggobi is available under the GNU license at B<www.ggobi.org>.


=head1 DEPENDENCIES-REQUIREMENTS


The following files have to be available to the script: 
For each of the three genomes: genome_name.pep and genome_name.coords
The coordinate files should have the same name as the .pep file with the extension .coords
They should be located in the same directory as the .pep files.

Each file has to be blastable with blastall. To do so run the following command on each .pep file

formatdb -i reference.pep

The coordinate file looks like:

feat_name::end5::end3::com_name

ORF00003::1422316::1421633::hypothetical protein
ORF00004::1420851::1421462::peptidase, M23/M37 family
ORF00005::1420185::1420763::ATP:cob(I)alamin adenosyltransferase
ORF00006::1418272::1420044::sensor histidine kinase ResE
ORF00007::1417556::1418269::DNA-binding response regulator ResD
ORF00008::1416096::1417250::resC protein
ORF00009::1414456::1416078::resB protein

To generate this file you can run the following scripts:

XXXXXXX.pl 


REQUIREMENTS: 

B<blastall>
B<gnuplot>
B<ggobi>

Libraries:
TIGR Foundation


=head1 SEE ALSO

BSR.pl    


=head1 AUTHOR AND COPYRIGHT

Jacques Ravel <jravel.org>
David Rasko <drasko.org>
Garry Myers <gmyers.org>

=head1 VERSION

Current Revision:  $Revision$
Last Modification: $Date$

=pod SCRIPT CATEGORIES
UNIX/System_administration

=head1 EXTRAS

None

=pod OSNAMES
Any
	
  
