#!/usr/bin/perl -w

use strict;

#PAH uses HGVS nomenclature except 3' end, where numbers just continue
#CHECK compound entries to see if need type to be complex (1)
my $build = 'hg17';
my $genomeVarFile = 'gvPah.txt';
my $posFile = "gvPosPah.$build.txt";
my $linkFile = 'gvLinkPah.txt'; 
my $attrFile = 'gvAttrPah.txt'; 
my $badFile = 'unparsedPah.txt';
my $script = "../parseHgvsName2 U49897.$build.psl U49897.1";
my $faFile = 'U49897.cds.fa';
my $genomicFile = 'AF404777.fa';
my $genomicOffset = 27778; #nts before ATG, where PAH starts 1

my $fh;
my $fh2;
my $fh3;
my $fh4;
my $bfh;
open($fh, ">", $genomeVarFile) or die "Couldn't open output file, $!\n";
open($fh2, ">", $posFile) or die "Couldn't open output file2, $!\n";
open($fh3, ">", $attrFile) or die "Couldn't open output file3, $!\n";
open($fh4, ">", $linkFile) or die "Couldn't open output file4, $!\n";
open($bfh, ">", $badFile) or die "Couldn't open output fileb, $!\n";
my $fcnt = 0;
my $bcnt = 0;
my $verCnt = 0;
my %pkey; #IDs are primary key in gv table
my %intron = ( #numbers from PAHdb website
#  '1'=>'61-4232',
   #codingNt=>firstIntronNt(genomic)
   60=>61,
   61=>4232,
#  '2'=>'4341-22214',
   168=>4341,
   169=>22214,
#  '3'=>'22399-39585',
   352=>22399,
   353=>39585,
#  '4'=>'39675-50549',
   441=>39675,
   442=>50549,
#  '5'=>'50618-61889',
   509=>50618,
   510=>61889,
#  '6'=>'62087-64271',
   706=>62087,
   707=>64271,
#  '7'=>'64408-65465',
   842=>64408,
   843=>65465,
#  '8'=>'65536-70272',
   912=>65536,
   913=>70272,
#  '9'=>'70330-72792',
   969=>70330,
   970=>72792,
#  '10'=>'72889-73444',
   1065=>72889,
   1066=>73444,
#  '11'=>'73579-76708',
   1199=>73579,
   1200=>76708,
#  '12'=>'76825-78005',
   1315=>76825,
   1316=>78005,
);


print "Writing files $genomeVarFile, $posFile, $attrFile and $badFile\n";
#genomeVar format
#chrom start stop name mutId srcId baseChangeType location accuracy
my $srcId = 'PAHdb';
my $accuracy = 1; #allow inexact deletions
my $strand;

my %names;  #parse data from hgvs style name
my $r = 0;
while (<>) {
   chomp;
   my $line = $_;
   if (!defined $line or $line eq '' or $line =~ /^\s*#/) { next; } 
   #had 2 different file formats, comment/uncomment depending on format
   #if ($line =~ /\*\d+\*/) { $r = 1; next; }
   $r++;
   #if ($r == 3) { 
      #$line =~ s/^\s+//;
      parseDataLine($line);
   #}   
}

close $fh or die "Couldn't close outfile, $!\n";
close $fh2 or die "Couldn't close outfile2, $!\n";
close $fh3 or die "Couldn't close outfile3, $!\n";
close $fh4 or die "Couldn't close outfile4, $!\n";
close $bfh or die "Couldn't close outfileb, $!\n";

print "found numbers for $fcnt entries\n",
      "verified sequence for $verCnt entries\n",
      "found $bcnt entries that couldn't parse\n";
exit;

#this calls the script that parses by hgvs name
sub parse_hgvs_name {
   my $name = shift;
   my $chr;
   my $st;
   my $end;
   my $type;
   my $fh2;
   my $n = $name; #escape for shell
   #remove uncertaintities for dels
   if ($n =~ /c\.(\d+)[+-]\?_(\d+)[+-]\?del/) {
      $accuracy = 0;
      $n = "c.${1}_${2}del";
   }else {
      $accuracy = 1;
   }
   $n =~ s/'/\\'/;
   open ($fh2, "$script '$n' 2>&1 |") 
      or die "Couldn't run parseHgvsName2, $!\n";
   while (<$fh2>) {
      chomp;
      if (/ERROR/) { $chr = $_; next; }
      my $t; #extra throw out
      if (!$chr) {
         ($chr, $st, $end, $t, $type, $strand) = split(/\t/);
      }
   }
   close $fh2 or die "Couldn't finish parseHgvsName run for $name, $!\n";
   $st--; #switch to ucsc numbers
   return ($chr, $st, $end, $name, $type);
}
####End 

#this guesses the location based on the name
sub loc_from_name {
   my $name = shift @_;
   my $loc = 'unknown';
   if ($name =~ /c\.(\-?\d+[+-]?\d*)/) {
      my $pos = $1;
      if ($accuracy == 0) {
         $loc = 'not within known transcription unit';
      }elsif ($pos =~ /\d+[+-]\d+/) { 
         $loc = 'intron';
      }elsif ($pos < -472) { 
         $loc = 'not within known transcription unit';
      }elsif ($pos < 0) {
         $loc = '5\' UTR';
      }elsif ($pos <= 1359) {
         $loc = 'exon';
      }elsif ($pos <= 2202) {
         $loc = '3\' UTR';
      }else {
         $loc = 'not within known transcription unit';
      }
   }
   return $loc;
}
####End 

sub loc_from_region {
   my $reg = shift @_;
   my $loc;
   if (!$reg) { return; }
   if ($reg =~ /^E\d+$/) {
      $loc = 'exon';
   }elsif ($reg =~ /^I\d+$/) {
      $loc = 'intron';
   }elsif ($reg =~ /5' UTR/) {
      $loc = '5\' UTR';
   }elsif ($reg =~ /3' UTR/) {
      $loc = '3\' UTR';
   }
   return $loc;
}
####End 

#this parses a data line with fields
#Mutation ID, Systematic Name, Mutation Name, Other Name, Region, Mutation Type
sub parseDataLine {
   my $line = shift @_;
   my @f = split(/\t/, $line);
   #shift around file format 2 to match first part of file format 1
   shift @f; #remove count
   shift @f; #remove reference ID
   splice(@f, 1, 1); #remove Nucleotide No.
   splice(@f, 2, 1); #remove Amino Acid
   #end of shifting list around
   my @chr;
   my $loc;
   $f[1] =~ s/^\s+//;
   $f[1] =~ s/\s+$//;
   if ($f[1] !~ /;/) { 
      #check sequence based on hgvs style name
      my $ch = sequenceCheck($f[1]);
      if (!$ch) {
         print $bfh $line, "\n";
         $bcnt++;
         return;
      }elsif ($ch == 2) { $verCnt++; }
      @chr = parse_hgvs_name($f[1]);
      if (@chr && $chr[0] !~ /ERROR/) {
         $loc = loc_from_region($f[4]);
         if (exists $pkey{"PAH_$f[1]"}) {
            print "WARNING duplicate ID PAH_$f[1]\n";
         }else {
            $pkey{"PAH_$f[1]"} = 1;
         }
         if (!$loc) { $loc = loc_from_name($f[1]); }
         print $fh2 "$chr[0]\t$chr[1]\t$chr[2]\tPAH_$f[1]\t$strand\tU49897.1(PAH):$chr[3]\n";
         print $fh "PAH_$f[1]\tPAH $chr[3]\t$srcId\t$chr[4]\t$loc\t$accuracy\n";
         #print aliases to a file also
         $f[2] =~ s/^\s+//;
         $f[2] =~ s/\s+$//;
         $f[3] =~ s/^\s+//;
         $f[3] =~ s/\s+$//;
         $f[0] =~ s/^\s+//;
         $f[0] =~ s/\s+$//;
         print $fh3 "PAH_$f[1]\tcommonName\t$f[2]\n";
         print $fh3 "PAH_$f[1]\talias\t$f[3]\n";
         print $fh4 "PAH_$f[1]\tsrcLink\tPAHdb\t$f[0]\t\n";
         if ($f[5] && $f[5] ne '') {
            $f[5] =~ s/^\s+//;
            $f[5] =~ s/\s+$//;
            print $fh3 "PAH_$f[1]\tmutType\t$f[5]\n";
            #status based on type? check intersect with conserved later?
            if ($f[5] =~ /missense|nonsense|splice/i or
                ($f[5] =~ /deletion|insertion/i && $chr[4] eq 'exon')) {
                print $fh3 "PAH_$f[1]\tdisease\tlikely to be phenotype-associated\n";
            }
         }
         if ($f[5] && $f[5] =~ /missense|nonsense/i) {
            my $t = $f[2];
            $t =~ s/^p.//;
            print $fh3 "PAH_$f[1]\tprotEffect\t$t\n";
         }
         #spID P00439 is PAH gene (omimID 261600)
         #provide link to OMIM gene
         print $fh4 "PAH_$f[1]\tgeneVarsDis\tomimTitle2\t261600\t\n";
         $fcnt++;
      }else {
         print $bfh $line, "\n";
         $bcnt++;
      }
   }else { #compound
      my @t = split(/;/, $f[1]);
      my $do_alias = 0;
      #only print once for gv file
      $loc = loc_from_region($f[4]);
      if (!$loc) { $loc = loc_from_name($f[1]); }
      if (exists $pkey{"PAH_$f[1]"}) {
         print "WARNING duplicate ID PAH_$f[1]\n";
      }else {
         $pkey{"PAH_$f[1]"} = 1;
      }
      print $fh "PAH_$f[1]\t$f[1]\t$srcId\tcomplex\t$loc\t1\n";
      foreach my $t (@t) {
         $t =~ s/\[//; #remove brackets if used
         $t =~ s/\]//;
         #check sequence based on hgvs style name
         my $ch = sequenceCheck($t);
         if (!$ch) {
            print $bfh $line, "\n";
            $bcnt++;
            return;
         }elsif ($ch == 2) { $verCnt++; }
         @chr = parse_hgvs_name($t);
         if (!$accuracy) {
            #this will not match what was recorded above
            print STDERR "Warning accuracy set to 0 for compound mutation ($f[1])\n";
         }
         if (@chr && $chr[0] !~ /ERROR/) {
            print $fh2 "$chr[0]\t$chr[1]\t$chr[2]\tPAH_$f[1]\t$strand\tU49897.1(PAH):c.[..$t..]\n";
            #print aliases to a file also
            $do_alias = 1;
         }else {
            print $bfh $line, "\n";
            #$do_alias = 0;
            $bcnt++;
            last;
         }
      }
      #only print aliases once
      if ($do_alias) {
         $fcnt++;
         if (!$f[2] or !$f[3] or !$f[0]) { die "ERROR in $f[1]\n"; }
         $f[2] =~ s/^\s+//;
         $f[2] =~ s/\s+$//;
         $f[3] =~ s/^\s+//;
         $f[3] =~ s/\s+$//;
         $f[0] =~ s/^\s+//;
         $f[0] =~ s/\s+$//;
         print $fh3 "PAH_$f[1]\tcommonName\t$f[2]\n";
         print $fh3 "PAH_$f[1]\talias\t$f[3]\n";
         print $fh4 "PAH_$f[1]\tsrcLink\tPAHdb\t$f[0]\t\n";
         if ($f[5] && $f[5] ne '') {
            $f[5] =~ s/^\s+//;
            $f[5] =~ s/\s+$//;
            print $fh3 "PAH_$f[1]\tmutType\t$f[5]\n";
            #status based on type? check intersect with conserved later?
            if ($f[5] =~ /missense|nonsense|splice/i or
                ($f[5] =~ /deletion|insertion/i && $chr[4] eq 'exon')) {
                print $fh3 "PAH_$f[1]\tdisease\tlikely to be phenotype-associated\n";
            }
         }
         if ($f[5] && $f[5] =~ /missense|nonsense/) {
            print $fh3 "PAH_$f[1]\tprotEffect\t$f[2]\n";
         }
         #spID P00439 is PAH gene (omimID 261600)
         #provide link to OMIM gene
         print $fh4 "PAH_$f[1]\tgeneVarsDis\tomimTitle2\t261600\t\n";
      }
   }
}
####End 

sub sequenceCheck {
   my $name = shift @_;
   my $verified = 0;
   my $bad;
   my $file = $faFile;
   if ($name =~ /(\d+[+-]\d+_*\d*[+-]*\d*)/) { #intron
      #change to genomic so can check
      my $pos = $1;
      my $st;
      my $end;
      if ($pos =~ /(\d+[+-]\d+)_(\d+[+-]\d+)/) { 
         $st = $1;
         $end = $2;
      }else { $st = $pos; }
      $st =~ /(\d+)([+-]\d+)/;
      my $x = $1;
      my $off = $2;
      if ($off > 0) {
         $st = $genomicOffset + $intron{$x} + $off - 1;
      }else {
         $st = $genomicOffset + $intron{$x} + $off + 1;
      }
      if ($end) {
         $end =~ /(\d+)([+-]\d+)/;
         my $x = $1;
         my $off = $2;
         if ($off > 0) {
            $end = $genomicOffset + $intron{$x} + $off - 1;
         }else {
            $end = $genomicOffset + $intron{$x} + $off + 1;
         }
         $name =~ s/c\.$pos/g\.${st}_$end/;
      }else {
         $pos =~ s/\+/\\+/; #why do I need this here and not above?
         $name =~ s/c\.$pos/g\.$st/g;
      }
      $file = $genomicFile;
   }elsif ($name =~ /c\.(-\d+_*-*\d*)/) { #utr
      my $pos = $1;
      my $st = $pos;
      my $end;
      if ($pos =~ /(-\d+)_(-*\d+)/) {
         $end = $2;
         $st = $1;
      }
      $st = $st + $genomicOffset + 1;
      if ($end) { 
         $end = $end + $genomicOffset + 1; 
         $name =~ s/c\.$pos/g\.${st}_$end/;
      }else {
         $name =~ s/c\.$pos/g\.$st/;
      }
      $file = $genomicFile;
   }
   if ($name =~ /'/) { return undef; } #illegal char can't check
   open(CFH, "../sequenceCheck $file '$name' 2>&1 |")
      or die "Couldn't run sequenceCheck, $!\n";
   my $test;
   while (<CFH>) {
      chomp;
      if (/Sequence doesn't match/) { $bad = $_; }
      elsif (/Sequence matched$/) { $verified++; }
      else { $test = $_; }
   }
   close(CFH)
      or die "Couldn't finish sequenceCheck with ../sequenceCheck $faFile '$name', $!:exit status $?\n";
   if ($bad) {
      print $bfh "#$bad\n";
      return 0;
   }elsif ($verified) {
      return 2;
   }else {
      #print $fh "#TESTING not checked sequence for $name got $test\n";
      return 1;
   }
}
####End

#how to do links:
#by id (per author?)
#http://www.pahdb.mcgill.ca/cgi-bin/pahdb/pahdbsearch.cgi?Field=id_mut&OrderedField=nucl_no&Value=554&SortType=Asc&Search=Mutation&Go=1&ToShowFrom=0&F2S1=id_mut&F2S2=nucl_no&F2S3=syst_name&F2S5=amino_acid&F2S6=mut_name&F2S7=other_name&F2S17=Mutation.comment