#!/usr/bin/env perl
#pk

##########################################################################################
#	  This file is part of proteinortho.
#	  (C) 2009 Marcus Lechner
# 
#	  proteinortho is free software; you can redistribute it and/or modify
#	  it under the terms of the GNU General Public License as published
#	  by the Free Software Foundation; either version 2, or (at your
#	  option) any later version.
#
#	  proteinortho is distributed in the hope that it will be useful, but
#	  WITHOUT ANY WARRANTY; without even the implied warranty of
#	  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#	  General Public License for more details.
#
#	  You should have received a copy of the GNU General Public License
#	  along with proteinortho; see the file COPYING.  If not, write to the
#	  Free Software Foundation, Inc., 59 Temple Place - Suite 330,
#	  Boston, MA 02111-1307, USA.	
##########################################################################################

##########################################################################################
# About
##########################################################################################
# 
# @author Paul Klemm
# @email klemmp@staff.uni-marburg.de
# @company Bioinformatics, University of Leipzig
# @version 1
# @date 11-12-2019
#
##########################################################################################

use POSIX;

# Help text; printed for -h/--help and whenever the command line is invalid.
# Built line by line so that the tabs and trailing blanks are visible.
my $usage = join("\n",
	"",
	"proteinortho_summary.pl        produces a summary on species level.",
	" ",
	"SYNOPSIS",
	" ",
	"proteinortho_summary.pl (options) GRAPH (GRAPH2)",
	"",
	"\tGRAPH\tPath to the *.proteinortho-graph or *.blast-graph file generated by proteinortho. ",
	"\tGRAPH2\t(optional) If you provide a blast-graph AND a proteinortho-graph, the difference is calculated (GRAPH - GRAPH2)",
	"",
	"\tNote: The *.proteinortho.tsv file does not work here (use the proteinortho-graph file)",
	"",
	"\tOPTIONS",
	"",
	"\t\t-format,-f\tenables the table formatting instead of the plain csv output.",
	"",
	"",
);

# Command line handling.
#
# Options may be given with one or two leading dashes. The first two
# arguments that are not options are taken as GRAPH and GRAPH2; any further
# positional argument is silently ignored.
my $graphfilenameA="";
my $graphfilenameB="";
my $notableformat=1; # 1 = plain csv output (default), 0 = formatted table (-format)
my $help=0;          # declared explicitly; it used to be an implicit package global

foreach my $arg (@ARGV){
	if($arg =~ m/^--?(help|h)$/){$help=1;}
	elsif($arg =~ m/^--?(format|f)$/){$notableformat=0;}
	elsif($arg =~ m/^-.+/){ print $usage; print STDERR "ERROR: invalid option ".$arg."!\n\n";exit(1);}
	elsif($graphfilenameA eq ""){$graphfilenameA = $arg;}
	elsif($graphfilenameB eq ""){$graphfilenameB = $arg;}
}

# -h/--help wins over every other check and is not an error.
if ($help){
    print $usage;
    exit(0);
}

# Collect all problems of the invocation before giving up.
my $fail="";
if ($graphfilenameA eq ""){
    $fail.="ERROR: GRAPH not provided!\n";
}
if($fail ne ""){
	print $usage.$fail;
	exit(1);
}
# Table layout state shared with processLine().

# Maximal width of one table row: the terminal width as reported by
# `tput cols`, or 160 if that is unavailable, not a number, or below 10.
our $maxNumOfCharsInOneLine=`tput cols`;
chomp($maxNumOfCharsInOneLine);
if($maxNumOfCharsInOneLine !~ m/^\d+$/ || $maxNumOfCharsInOneLine<10){$maxNumOfCharsInOneLine=160;}
our $split_delim="[:\t]";   # pattern at which processLine() splits a line into cells
our @spl_header;            # formatted cells of the most recent header row
our @spl;                   # cells of the row currently being processed
our $last_isHeaderLine=0;   # true if the previously printed row was a header row
our $isHeaderLine=1;        # true if the next row starts a new table
our $noheader=0;            # only ever reset in the visible code, never read

# $species_matrix{A}{B}      number of edges between the species A and B
#                            (stored symmetrically)
# $species_matrix_pow2{A}{B} accumulator for the 2-path matrix, set to 0 for
#                            every pair that has at least one edge
my %species_matrix;
my %species_matrix_pow2;
# Species pair of the section that is currently being read.
my $currentSpeciesA;
my $currentSpeciesB;

# Read GRAPH. Recognised lines:
#   "# A<tab>B"             starts the section of the species pair A/B
#   6 or 8 tab-separated    one edge of the current species pair
#   fields, no leading "#"
#   "#" line with 4 fields  ignored
# Everything else aborts the program.
open(my $FH,"<",$graphfilenameA) || die "ERROR: cannot open '$graphfilenameA': $!\n";
while(my $line=<$FH>){
	chomp($line);
	# Blank lines carry no information; they used to end up in the
	# "wrong format" branch below and aborted the whole run.
	if($line =~ m/^\s*$/ || $line eq " *"){next;}
	# column descriptions at the top of the graph files
	if($line eq "# file_a\tfile_b" || $line =~ m/^# a\tb/){next;}
	my @arr=split("\t",$line);
	my $isComment=(substr($line,0,1) eq "#");
	if($isComment && scalar @arr == 2){
		$currentSpeciesA=$arr[0];
		$currentSpeciesB=$arr[1];
		$currentSpeciesA=~s/^# ?//g;
	}elsif(!$isComment && (scalar @arr == 6 || scalar @arr == 8) ){
		if(!exists $species_matrix{$currentSpeciesA}{$currentSpeciesB}){
			$species_matrix{$currentSpeciesA}{$currentSpeciesB} = 1;
			$species_matrix{$currentSpeciesB}{$currentSpeciesA} = 1;
			$species_matrix_pow2{$currentSpeciesA}{$currentSpeciesB} = 0;
			$species_matrix_pow2{$currentSpeciesB}{$currentSpeciesA} = 0;
		}else{
			$species_matrix{$currentSpeciesA}{$currentSpeciesB} ++;
			$species_matrix{$currentSpeciesB}{$currentSpeciesA} ++;
		}
	}elsif( !($isComment && scalar @arr == 4) ){
		die "[STDERR] Error: wrong format...'$line' Please make sure you only provide *.blast-graph or *.proteinortho-graph files as input...\n";
	}
}
close($FH);

# Optional second graph: every edge of GRAPH2 is subtracted from the counts
# collected from GRAPH, so that the matrix holds the difference GRAPH - GRAPH2.
if($graphfilenameB ne ""){
	open(my $FH,"<",$graphfilenameB) || die "ERROR: cannot open '$graphfilenameB': $!\n";
	while(my $line=<$FH>){
		chomp($line);
		# Skip blank lines. The test has to come after chomp: before it a
		# line always contains its newline, so blank lines were counted as
		# edges.
		if($line =~ m/^\s*$/){next;}
		my @arr=split("\t",$line);
		if(substr($line,0,1) eq "#"){
			# "# A<tab>B" starts the section of a species pair; all other
			# comment lines are ignored
			if(scalar @arr == 2){
				$currentSpeciesA=$arr[0];
				$currentSpeciesB=$arr[1];
				$currentSpeciesA=~s/^# ?//g;
			}
			next;
		}
		# A pair that does not occur in GRAPH starts at 0, so that n edges
		# found only in GRAPH2 result in -n (it used to start at +1).
		if(!exists $species_matrix{$currentSpeciesA}{$currentSpeciesB}){
			$species_matrix{$currentSpeciesA}{$currentSpeciesB} = 0;
			$species_matrix{$currentSpeciesB}{$currentSpeciesA} = 0;
			$species_matrix_pow2{$currentSpeciesA}{$currentSpeciesB} = 0;
			$species_matrix_pow2{$currentSpeciesB}{$currentSpeciesA} = 0;
		}
		$species_matrix{$currentSpeciesA}{$currentSpeciesB} --;
		$species_matrix{$currentSpeciesB}{$currentSpeciesA} --;
	}
	close($FH);
}

# All species names in the order used for the rows and columns of every table.
my @keys=sort keys %species_matrix;

# start a new table
$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();

print STDERR "\n";
my $ret= "# The adjacency matrix, the number of edges between 2 species\n";
processLine($ret);

# With more than 10 species the formatted table gets too wide for full names:
# columns are then labelled with the index "(i)" and the rows with "name(i)".
my $adjacencyUsesIndex=(scalar @keys>10 && !$notableformat);

# Header row. The cells are joined instead of terminating each with a tab, so
# that the csv output has no empty trailing column.
my @adjacencyHeader=("# file");
for(my $i = 0 ; $i < scalar @keys; $i++){
	push(@adjacencyHeader, $adjacencyUsesIndex ? "($i)" : $keys[$i]);
}
processLine(join("\t",@adjacencyHeader)."\n");

for(my $i = 0 ; $i < scalar @keys; $i++){
	my @cells=( $adjacencyUsesIndex ? $keys[$i]."($i)" : $keys[$i] );
	for(my $j = 0 ; $j < scalar @keys; $j++){
		# edges within one species are not counted; the later tables rely on
		# this zero diagonal
		if($i==$j){$species_matrix{$keys[$i]}{$keys[$j]}=0;}
		my $edges=$species_matrix{$keys[$i]}{$keys[$j]};
		# Species pairs without any edge have no entry. Print 0 for them: an
		# empty cell at the end of a row is dropped by split() in
		# processLine(), which then takes the shorter row for a new header.
		push(@cells, defined $edges ? $edges : 0);
	}
	$ret=join("\t",@cells)."\n";
	processLine($ret);
}

# start a new table that uses half of the terminal width
$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();
$maxNumOfCharsInOneLine=`tput cols`;
chomp($maxNumOfCharsInOneLine);$maxNumOfCharsInOneLine/=2;
if($maxNumOfCharsInOneLine<10){$maxNumOfCharsInOneLine=160;}

# For every species: the sum of its row of the adjacency matrix divided by the
# number of species.
print STDERR "\n";
$ret= "# file\taverage number of edges\n";
processLine($ret);
for(my $i = 0 ; $i < scalar @keys; $i++){
	my $sum=0;
	for(my $j = 0 ; $j < scalar @keys; $j++){
		# pairs without an edge have no entry and count as 0
		$sum+=($species_matrix{$keys[$i]}{$keys[$j]} || 0);
	}
	# No tab after the average: the old code tested a $j that was no longer
	# in scope (hence undefined) and appended a stray trailing tab to each row.
	$ret= $keys[$i]."\t".($sum/scalar(@keys))."\n";
	processLine($ret);
}

# start a new table that uses the full terminal width
$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();
$maxNumOfCharsInOneLine=`tput cols`;
chomp($maxNumOfCharsInOneLine);
if($maxNumOfCharsInOneLine<10){$maxNumOfCharsInOneLine=160;}

print STDERR "\n";
$ret= "# The 2-path matrix, the number of paths between 2 species of length 2\n";
processLine($ret);

# Same labelling rule as for the adjacency matrix: with more than 10 species
# the formatted table uses "(i)" for the columns and "name(i)" for the rows.
my $twoPathUsesIndex=(scalar @keys>10 && !$notableformat);

# Header row, joined so that the csv output has no empty trailing column.
my @twoPathHeader=("# file");
for(my $i = 0 ; $i < scalar @keys; $i++){
	push(@twoPathHeader, $twoPathUsesIndex ? "($i)" : $keys[$i]);
}
processLine(join("\t",@twoPathHeader)."\n");

# Entry (i,j) is the sum over k of edges(i,k)*edges(k,j), i.e. the square of
# the adjacency matrix.
for(my $i = 0 ; $i < scalar @keys; $i++){
	my @cells=( $twoPathUsesIndex ? $keys[$i]."($i)" : $keys[$i] );
	for(my $j = 0 ; $j < scalar @keys; $j++){
		# Compute only the upper triangle (j >= i) and mirror it. The old
		# guard "$i<$i+1" was always true, so each off-diagonal entry was
		# accumulated a second time when its mirrored cell was visited: the
		# lower triangle was printed doubled and the stored values were twice
		# as large as they should be.
		if($j >= $i){
			my $paths=0;
			for(my $k = 0 ; $k < scalar @keys; $k++){
				# pairs without an edge have no entry and count as 0
				$paths+=($species_matrix{$keys[$i]}{$keys[$k]} || 0)*($species_matrix{$keys[$k]}{$keys[$j]} || 0);
			}
			$species_matrix_pow2{$keys[$i]}{$keys[$j]}=$paths;
			$species_matrix_pow2{$keys[$j]}{$keys[$i]}=$paths;
		}
		push(@cells, $species_matrix_pow2{$keys[$i]}{$keys[$j]});
	}
	$ret=join("\t",@cells)."\n";
	processLine($ret);
}

# start a new table that uses half of the terminal width
$noheader=0;$last_isHeaderLine=0;$isHeaderLine=1;@spl_header=();@spl=();
$maxNumOfCharsInOneLine=`tput cols`;
chomp($maxNumOfCharsInOneLine);$maxNumOfCharsInOneLine/=2;
if($maxNumOfCharsInOneLine<10){$maxNumOfCharsInOneLine=160;}

# For every species: the sum of its row of the 2-path matrix divided by the
# number of species.
print STDERR "\n";
processLine("# file\taverage number of 2-paths\n");
for(my $i = 0 ; $i < scalar @keys; $i++){

	my $sum=0;
	for(my $j = 0 ; $j < scalar @keys; $j++){
		$sum+=($species_matrix_pow2{$keys[$i]}{$keys[$j]} || 0);
	}
	# The index "(i)" is only appended where the matrices use index labels
	# (formatted table with more than 10 species). It used to be appended
	# always, so the species names of the csv output differed from those of
	# the other tables.
	my $label=$keys[$i];
	if(scalar @keys>10 && !$notableformat){$label.="($i)";}
	processLine($label."\t".($sum/scalar(@keys))."\n");
}



# _padCellByOne(CELL)
#
# Returns CELL extended by one blank. The blank is put in front if CELL has
# fewer leading than trailing blanks and appended otherwise, so that repeated
# calls centre the content.
sub _padCellByOne{
	my $cell=shift;
	my $leftPad=0;
	my $rightPad=0;
	if($cell=~m/( +)$/){$rightPad=length $1}
	if($cell=~m/^( +)/){$leftPad=length $1}
	return $leftPad < $rightPad ? " ".$cell : $cell." ";
}

# processLine(LINE)
#
# Prints LINE to STDOUT, either unchanged (plain csv mode, $notableformat == 1)
# or as one row of a text table.
#
# In table mode LINE is split at $split_delim. Lines with fewer than two cells
# are printed unchanged. A row is a header row if $isHeaderLine is set or if
# its number of cells differs from the current header. Header cells are
# shortened and padded until the row is $maxNumOfCharsInOneLine characters wide
# (including the "|" separators) and are framed by "---+---" rulers; their
# widths are kept in @spl_header and applied to all following rows.
#
# Reads and updates the file-level variables @spl, @spl_header, $isHeaderLine,
# $last_isHeaderLine and $maxNumOfCharsInOneLine; all working variables are
# lexical and $_ is left untouched.
#
# Returns 1.
sub processLine{
	my $line=shift;
	chomp($line);

	if($notableformat == 1){print "$line\n";return 1;}

	if(length($line)<1){return 1;}

	@spl=split($split_delim,$line);

	# lines without a delimiter (e.g. captions) are no table rows
	if(scalar @spl <2){print "$line\n";return 1;}

	# a different number of columns starts a new table
	if(scalar @spl_header > 0 && scalar @spl != scalar @spl_header){$isHeaderLine=1;}

	# remove the comment marker of header lines
	$spl[0]=~s/^# ?//;

	# Unshortened cells, used to restore characters while shortening. The copy
	# is taken after the comment marker has been removed; otherwise a shortened
	# first header cell would get its "# " back.
	my @spl_backup=@spl;

	if(scalar(@spl)*2-1>$maxNumOfCharsInOneLine){
		$maxNumOfCharsInOneLine=2*scalar(@spl)-1;
		print STDERR "Corrected minimum table width: -w=$maxNumOfCharsInOneLine such that at least 1 character per column is displayed.\n";
	}

	my $sumOfCharsLine=length(join("",@spl));

	if($isHeaderLine){ # is a header row

		# too wide: take one character off all widest cells until the row fits
		while(($sumOfCharsLine + scalar(@spl)-1) > $maxNumOfCharsInOneLine){
			my $max_l=0;
			my @max_l_is;
			for my $i (0 .. $#spl){
				my $len=length $spl[$i];
				if($max_l < $len){$max_l=$len;@max_l_is=($i);}
				elsif($max_l == $len){push(@max_l_is,$i);}
			}
			for my $idx (@max_l_is){
				if(length($spl[$idx]) > 8 && substr($spl[$idx],-3) ne "..." ){
					# mark the cut with "...", one character shorter in total
					$spl[$idx]=substr($spl[$idx],0,length($spl[$idx])-3-1)."...";
				}
				else{
					# short or already marked cell: plain prefix of the original
					$spl[$idx]=substr($spl_backup[$idx],0,length($spl[$idx])-1);
				}
			}
			$sumOfCharsLine=length(join("",@spl));
		}

		# too narrow: add one blank to the first narrowest cell until the row
		# fills the available width
		while(($sumOfCharsLine + scalar(@spl)-1) < $maxNumOfCharsInOneLine ){
			my $min_l=$maxNumOfCharsInOneLine*10;
			my @min_l_is;
			for my $i (0 .. $#spl){
				if($min_l > length $spl[$i]){$min_l=length $spl[$i];@min_l_is=($i);}
			}
			for my $idx (@min_l_is){
				$spl[$idx]=_padCellByOne($spl[$idx]);
			}
			$sumOfCharsLine=length(join("",@spl));
		}

		@spl_header=@spl;

	}else{ # is not a header row -> bring every cell to the width of its header cell

		while(scalar @spl > scalar @spl_header){pop @spl;}

		for my $i (0 .. $#spl){
			my $width=length $spl_header[$i];
			while(length($spl[$i]) < $width){ # add pads
				$spl[$i]=_padCellByOne($spl[$i]);
			}
			while(length($spl[$i]) > $width){ # trim
				if(length($spl[$i]) > 5 && substr($spl[$i],-3) ne "..." ){
					$spl[$i]=substr($spl[$i],0,length($spl[$i])-3-1)."...";
				}
				else{
					# "#" marks a value that was cut without room for "..."
					$spl[$i]=substr($spl_backup[$i],0,length($spl[$i])-2)."#";
				}
			}
		}
	}

	# ruler with "+" at the column separators and "-" everywhere else
	my $row=join("|",@spl);
	my $ruler=$row;
	$ruler=~s/\|/+/g;
	$ruler=~s/[^+]/-/g;

	# a header row is framed by rulers; the upper one is left out if the
	# previous row was a header as well (it already printed its lower ruler)
	if($isHeaderLine && !$last_isHeaderLine ){print "$ruler\n";}
	print "$row\n";
	if($isHeaderLine ){print "$ruler\n";}

	$last_isHeaderLine=$isHeaderLine;
	$isHeaderLine=0;

	return 1;
}

