#!/usr/bin/perl -w

################################################
# written by Belinda Giardine
################################################

#expects caller to break apart mutations with multiple changes
#numbering expected to be coding (c.*)

#gvPos format
#we only have: chrom start stop name baseChangeType 

use strict;

my $pslFile = shift @ARGV;
my $gbAcc = shift @ARGV;
my $name = shift @ARGV;
my $pslLine = '';
if (@ARGV) { $pslLine = shift @ARGV; $pslLine = "'$pslLine'"; }
my $nameCopy = $name;
if (!$name or !$pslFile or !$gbAcc) {
   print "USAGE: parseHgvsName2 file.psl GenBankAcc 'hgvs_name' ['pslLine']\n";
   exit;
}

my $converter = 'convert_coors2';
my $strand;

$name =~ s/^\w+://;#remove gene or refseq if present
$name =~ s/^c\.//; #remove if present
if ($name =~ /^\s*'*p\./) { 
   if ($name =~ /^\s*'*p\.\w(\d+)\w'*\s*$/) {
      my $pos = $1;
      my @chr = getProtCoors($pos, $gbAcc);
      if ($chr[0] =~ /ERROR/) { print $chr[0]; exit; } 
      print "$chr[0]\t$chr[1]\t$chr[2]\t$name\tsubstitution\t$chr[3]\n";
   }elsif ($name =~ /^\s*'*p\.[A-Z][a-z]{2}(\d+)[A-Z][a-z]{2}'*\s*$/ or
           $name =~ /^\s*'*p\.[A-Z][a-z]{2}(\d+)X'*\s*$/) {
      my $pos = $1;
      my @chr = getProtCoors($pos, $gbAcc);
      if ($chr[0] =~ /ERROR/) { print $chr[0]; exit; }
      print "$chr[0]\t$chr[1]\t$chr[2]\t$name\tsubstitution\t$chr[3]\n";
   }elsif ($name =~ /^\s*p\.\w(\d+)fsX*\d*\s*$/) {
      #an unspecified frameshift insertion or deletion, just mark first codon
      my $pos = $1;
      my @chr = getProtCoors($pos, $gbAcc);
      if ($chr[0] =~ /ERROR/) { print $chr[0]; exit; }
      print "$chr[0]\t$chr[1]\t$chr[2]\t$name\tsubstitution\t$chr[3]\n";
   }elsif ($name =~ /^\s*p\.\w(\d+)del\s*$/) {
      #deletion of codon/amino acid
      my $pos = $1;
      my @chr = getProtCoors($pos, $gbAcc);
      if ($chr[0] =~ /ERROR/) { print $chr[0]; exit; }
      print "$chr[0]\t$chr[1]\t$chr[2]\t$name\tdeletion\t$chr[3]\n";
   }elsif ($name =~ /^\s*p\.\w(\d+)_\w(\d+)del\s*$/) {
      #deletion of multiple codons/amino acids
      my $pos = $1;
      my $pos2 = $2;
      my @chr = getProtCoors($pos, $gbAcc);
      if ($chr[0] =~ /ERROR/) { print $chr[0]; exit; }
      my @chr2 = getProtCoors($pos2, $gbAcc);
      if ($chr2[0] =~ /ERROR/) { print $chr2[0]; exit; }
      if ($chr[1] > $chr2[1]) {
         print "$chr[0]\t$chr2[1]\t$chr[2]\t$name\tdeletion\t$chr[3]\n";
      }else {
         print "$chr[0]\t$chr[1]\t$chr2[2]\t$name\tdeletion\t$chr[3]\n";
      }
   }else {
      print "ERROR sorry didn't recognize protein based name $name.\n";
   }
}elsif ($name =~ /delins/) {
   if ($name =~ /([-*]?\d+[+*-]?\d*_?[-*]?\d*[+*-]?\d*)\s*delins(.*)/) {
      my $pos = $1;
      my $ins = $2;
      my $pos2 = $pos;
      if ($pos =~ /([-*]?\d+[+*-]?\d*)_([*-]?\d+[+*-]?\d*)/) { $pos = $1; $pos2 = $2; }
      #convert to chrom and print 
      my($chr, $cst) = convert_pos($pos);
      my $cend;
      if ($pos eq $pos2 && $cst) { $cend = $cst; }
      if ($chr =~ /in a gap/) { 
         ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr); 
      }
      if ($chr =~ /ERROR/) { print $chr; exit; }
      if (!$cend) {
         ($chr, $cend) = convert_pos($pos2);
         if ($chr =~ /in a gap/) {
            ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr);
         }
         if ($chr =~ /ERROR/) { print $chr; exit; }
      }
      if ($cend < $cst) { #-strand need to switch order
         my $t = $cend;
         $cend = $cst;
         $cst = $t;
      }
      print "$chr\t$cst\t$cend\t$nameCopy\tcomplex\t$strand\n";
   }
}elsif ($name =~ /del|inv/) {
   #need to limit front to prevent 'ex11-ex12del' as coming through as '12del'
   if ($name =~ /^[cg]?\.?([-*]?\d+[+*-]?\d*(_[-*]?\d+)?[+*-]?\d*)\s*(del|inv)([ACTG]*\d*)/) {
      my $pos = $1;
      my $t = $3;
      my $del = $4;
      my $pos2 = $pos;
      if ($pos =~ /([-*]?\d+[+*-]?\d*)_([-*]?\d+[+*-]?\d*)/) { $pos = $1; $pos2 = $2; }
      elsif ($name =~ /([-*]?\d+[+*-]?\d*)del(\d+)$/) { $pos2 += ($2 - 1); }
      my $len;
      if ($del =~ /\D/) { $len = length $del; }
      elsif ($del) { $len = $del; }
      
      #allow different lengths, may be unclear which deleted
      #if ($len && $pos !~ /\d+[-+*]\d+/ && $pos2 !~ /\d+[-+*]\d+/ && $pos2 - $pos != $len - 1) {
         #print "ERROR lengths of deletion are not equal\n";
         #exit;
      #}
      #convert to chrom and print in gvPos format
      my($chr, $cst) = convert_pos($pos);
      my $cend;
      if ($chr =~ /in a gap/) { 
         ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr); 
      }
      if ($chr =~ /ERROR/) { print $chr; exit; }
      if (!$cend) {
         ($chr, $cend) = convert_pos($pos2);
         if ($chr =~ /in a gap/) {
            ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr);
         }
         if ($chr =~ /ERROR/) { print $chr; exit; }
      }
      if ($cend < $cst) { #-strand need to switch order
         my $t = $cend;
         $cend = $cst;
         $cst = $t;
      }
      my $type = 'deletion';
      if ($nameCopy =~ /del[ACTG0-9]+\s{0,1}ins[0-9ACTG]+/) { $type = 'complex'; }
      if ($t eq 'inv') { $type = 'complex'; } #inversion
      print "$chr\t$cst\t$cend\t$nameCopy\t$type\t$strand\n";
   }elsif ($name =~ /^[cg]?\.?([-*]?\d+[+*-]?(\d+|\?)?_?[-*]?\d*[+*-]?(\d+|\?)?)\s*del([ACTG]*\d*)/) {
      my $pos = $1;
      my $del = $4;
      my $pos2 = $pos;
      if ($pos =~ /([-*]?\d+[+*-]?\d*\??)_([-*]?\d+[+*-]?\d*\??)/) { $pos = $1; $pos2 = $2; }
      else { print "ERROR couldn't map fuzzy name $name"; exit; }
      $pos =~ s/\?/1/; #replace unknown with first nt
      $pos2 =~ s/\?/1/;
      my $len;
      if ($del =~ /\D/) { $len = length $del; }
      elsif ($del) { $len = $del; }

      #convert to chrom and print in gvPos format
      my($chr, $cst) = convert_pos($pos);
      my $cend;
      if ($chr =~ /ERROR/) { print $chr; exit; }
      ($chr, $cend) = convert_pos($pos2);
      if ($chr =~ /ERROR/) { print $chr; exit; }
      if ($cend < $cst) { #-strand need to switch order
         my $t = $cend;
         $cend = $cst;
         $cst = $t;
      }
      my $type = 'deletion';
      if ($nameCopy =~ /del[ACTG0-9]+\s{0,1}ins[0-9ACTG]+/) { $type = 'complex'; }
      print "$chr\t$cst\t$cend\t$nameCopy\t$type\t$strand\n";
   }elsif ($name =~ /^[cg]?\.?\(\?_([-*]?\d+[+*-]?\d*)\)\-?\??_\(([-*]?\d+[+*-]?\d*)_\?\)\+?\??\s*(del|dup|inv)/ ||
           $name =~ /^[cg]?\.?([-*]?\d+[+*-]?\d*)_\(([-*]?\d+[+*-]?\d*)_\?\)\s*(del|
dup|inv)/ ||
           $name =~ /^[cg]?\.?\(\?_([-*]?\d+[+*-]?\d*)\)_([-*]?\d+[+*-]?\d*)\s*(del|dup|inv)/ ||
           $name =~ /^[cg]?\.?([-*]?\d+[+*-]\?)_\(([-*]?\d+[+*-]?\d*)_\?\)\+?\??\s*(del|dup|inv)/ ||
           $name =~ /^[cg]?\.?\(\?_([-*]?\d+[+*-]?\d*)\)_([-*]?\d+[+*-]\?)\s*(del|dup|inv)/ ||
           $name =~ /^[cg]?\.?\(\?_([-*]?\d+[+*-]?\d*)\)_\((\*\d+)_\?\)(del|dup|inv)/ ||
           $name =~ /[cg]?\.?([-*]?\d+[+*-]\?)_\(([-*]?\d+[+*-]?\d*)_[-*]?\d+[+*-]?\d*\)\+\?(del|dup|inv)/) {
      #have minimum size, max is unknown
      my $pos = $1;
      my $t = $3;
      my $pos2 = $2;
      if ($pos =~ /\?/) { $pos =~ s/\?/1/; }
      if ($pos2 =~ /\?/) { $pos2 =~ s/\?/1/; }
      #convert to chrom and print in gvPos format
      my($chr, $cst) = convert_pos($pos);
      my $cend;
      if ($chr =~ /ERROR/) { print $chr; exit; }
      ($chr, $cend) = convert_pos($pos2);
      if ($chr =~ /ERROR/) { print $chr; exit; }
      if ($cend < $cst) { #-strand need to switch order
         my $t = $cend;
         $cend = $cst;
         $cst = $t;
      }
      my $type = 'deletion';
      if ($t eq 'del' && $nameCopy =~ /del[ACTG0-9]+\s{0,1}ins[0-9ACTG]+/) { $type = 'complex'; }
      elsif ($t eq 'dup') { $type = 'duplication'; }
      elsif ($t eq 'inv') { $type = 'complex'; }
      print "$chr\t$cst\t$cend\t$nameCopy\t$type\t$strand\n";
   }elsif ($name =~ /([A-Z0-9._]+):(g\.)?(\d+)_(o\.)?([A-Z0-9._]+):(g\.)?(\d+)del/) {
      #large deletion across reference sequences (assume genomic)
      my $ref = $1;
      my $pos = $3;
      my $ref2 = $5;
      my $pos2 = $7;
      $gbAcc = $ref;
      $pslLine = '';
      my($chr, $cst) = convert_pos($pos);
      my $cend;
      if ($chr =~ /ERROR/) { print $chr; exit; }
      my $off = getOffset($gbAcc);
      $cst += $off;
      $gbAcc = $ref2;
      ($chr, $cend) = convert_pos($pos2);
      if ($chr =~ /ERROR/) { print $chr; exit; }
      $off = getOffset($gbAcc);
      $cend += $off;
      if ($cend < $cst) { #-strand need to switch order
         my $t = $cend;
         $cend = $cst;
         $cst = $t;
      }
      my $type = 'deletion';
      print "$chr\t$cst\t$cend\t$nameCopy\t$type\t$strand\n";
   }else {
      print "ERROR bad deletion format for $nameCopy\n";
   }
}elsif ($name =~ /ins/) {
   if ($name =~ /([-*]?\d+[+*-]?\d*_?[-*]?\d*[+*-]?\d*)\s*ins[ACTGactg]*\d*/) {
      my $pos = $1;
      my $pos2 = $pos;
      if ($pos =~ /([-*]?\d+[+*-]?\d*)_([-*]?\d+[+*-]?\d*)/) { $pos = $1; $pos2 = $2;}
      #convert to chrom and print in gvPos format
      my($chr, $cst) = convert_pos($pos);
      my $cend;
      if ($chr =~ /in a gap/) {
         ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr);
      }
      if ($chr =~ /ERROR/) { print $chr; exit; }
      if (!$cend) {
         ($chr, $cend) = convert_pos($pos2);
         if ($chr =~ /in a gap/) {
            ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr);
         }
         if ($chr =~ /ERROR/) { print $chr; exit; }
      }
      if ($cend < $cst) { #-strand need to switch order
         my $t = $cend;
         $cend = $cst;
         $cst = $t;
      }
      print "$chr\t$cst\t$cend\t$nameCopy\tinsertion\t$strand\n";
   }else {
      print "ERROR bad insertion format for $nameCopy\n";
   }
}elsif ($name =~ /dup/) {
   if ($name =~ /([-*]?\d+[+*-]?\d*_?[-*]?\d*[+*-]?\d*)dup[ACTGactg]*\d*/) {
      my $pos = $1;
      my $pos2 = $pos;
      if ($pos =~ /([-*]?\d+[+*-]?\d*)_([-*]?\d+[+*-]?\d*)/) { $pos = $1; $pos2 = $2;}
      #convert to chrom and print in gvPos format
      my($chr, $cst) = convert_pos($pos);
      my $cend;
      if ($chr =~ /in a gap/) {
         ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr);
      }
      if ($chr =~ /ERROR/) { print $chr; exit; }
      if (!$cend) {
         ($chr, $cend) = convert_pos($pos2);
         if ($chr =~ /in a gap/) {
            ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr);
         }
         if ($chr =~ /ERROR/) { print $chr; exit; }
      }
      if ($cend < $cst) { #-strand need to switch order
         my $t = $cend;
         $cend = $cst;
         $cst = $t;
      }
      print "$chr\t$cst\t$cend\t$nameCopy\tduplication\t$strand\n";
   }elsif ($name =~ /([-*]?\d+[+*-]?\d*)dup[ACTGactg]*\d*/) {
      my $pos = $1;
      my($chr, $cst) = convert_pos($pos);
      my $cend;
      if ($chr =~ /in a gap/) {
         ($chr, $cst, $cend) = redo_gap($pos, $pos, $chr);
      }
      if ($chr =~ /ERROR/) { print $chr; exit; }
      if (!$cend) { $cend = $cst; }
      print "$chr\t$cst\t$cend\t$nameCopy\tduplication\t$strand\n";
   }elsif ($name =~ /\(\?_([-*]?\d+)\)_\(([-*]?\d+)_\?\)dup/) {
      my $pos = $1;
      my $pos2 = $2;
      #convert to chrom and print in gvPos format
      my($chr, $cst) = convert_pos($pos);
      my $cend;
      if ($chr =~ /in a gap/) {
         ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr);
      }
      if ($chr =~ /ERROR/) { print $chr; exit; }
      if (!$cend) {
         ($chr, $cend) = convert_pos($pos2);
         if ($chr =~ /in a gap/) {
            ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr);
         }
         if ($chr =~ /ERROR/) { print $chr; exit; }
      }
      if ($cend < $cst) { #-strand need to switch order
         my $t = $cend;
         $cend = $cst;
         $cst = $t;
      }
      #this should be marked as estimated coordinates by caller
      print "$chr\t$cst\t$cend\t$nameCopy\tduplication\t$strand\n";
   }else {
      print "ERROR bad duplication format for $nameCopy\n";
   }
}elsif ($name =~ /inv/) {
   if ($name =~ /([-*]?\d+[+*-]?\d*_?[-*]?\d*[+*-]?\d*)inv\d*/) {
      my $pos = $1;
      my $pos2 = $pos;
      if ($pos =~ /([-*]?\d+[+*-]?\d*)_([-*]?\d+[+*-]?\d*)/) { $pos = $1; $pos2 = $2;}
      #convert to chrom and print in gvPos format
      my($chr, $cst) = convert_pos($pos);
      my $cend;
      if ($chr =~ /in a gap/) {
         ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr);
      }
      if ($chr =~ /ERROR/) { print $chr; exit; }
      if (!$cend) {
         ($chr, $cend) = convert_pos($pos2);
         if ($chr =~ /in a gap/) {
            ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr);
         }
         if ($chr =~ /ERROR/) { print $chr; exit; }
      }
      if ($cend < $cst) { #-strand need to switch order
         my $t = $cend;
         $cend = $cst;
         $cst = $t;
      }
      #should we add an inversion type?
      print "$chr\t$cst\t$cend\t$nameCopy\tcomplex\t$strand\n";
   }else {
      print "ERROR bad inversion format for $nameCopy\n";
   }
}elsif ($name =~ /\>/) {
   if ($name =~ /([-*]?\d+[+*-]?\d*)\s*[ACTGactg]\>[ACTGactg]/) {
      my $pos = $1;
      my ($chr, $cst) = convert_pos($pos);
      my $cend;
      if ($chr =~ /in a gap/) {
         ($chr, $cst, $cend) = redo_gap($pos, $pos, $chr);
      }
      if (!$chr or $chr =~ /ERROR/) { print $chr; exit; }
      if (!$cend) { $cend = $cst; }
      if (!$cst) { print "ERROR missing start for $nameCopy and $chr\n"; exit; }
      print "$chr\t$cst\t$cend\t$nameCopy\tsubstitution\t$strand\n";
   }else {
      print "ERROR bad substitution format [$name]\n";
   }
}elsif ($name =~ /([-*]?\d+[+*-]?\d*)\s*([ACTG]+)\(\d+(_\d+)?\)/ or
        $name =~ /([-*]?\d+[+*-]?\d*)\s*([ACTG]+)\[\d+(_\d+)?\]/) {
   #repeating nts
   my $pos = $1;
   my $nts = $2;
   my $len = length $nts;
   my $pos2;
   if ($pos =~ /(\d+)([+-]\d+)/) {
      my $t = $2 + $len - 1;
      if ($t > 0) { $t = "+$t"; }
      $pos2 = "$1$t";
   }elsif ($pos =~ /(\d+)\*(\d+)/) {
      my $t = $2 + $len - 1;
      $pos2 = "$1*$t";
   }else {
      $pos2 = $pos + $len - 1;
   }
   #convert to chrom and print in gvPos format
   my($chr, $cst) = convert_pos($pos);
   my $cend;
   if ($chr =~ /in a gap/) {
      ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr);
   }
   if ($chr =~ /ERROR/) { print $chr; exit; }
   if (!$cend) {
      ($chr, $cend) = convert_pos($pos2);
      if ($chr =~ /in a gap/) {
         ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr);
      }
      if ($chr =~ /ERROR/) { print $chr; exit; }
   }
   if ($cend < $cst) { #-strand need to switch order
      my $t = $cend;
      $cend = $cst;
      $cst = $t;
   }
   print "$chr\t$cst\t$cend\t$nameCopy\tcomplex\t$strand\n";
}else {
   print "ERROR didn't recognize format of $nameCopy\n";
}

exit;

#this converts a coding position to a chrom position
sub convert_pos {
   my $pos = shift @_;
   my $chr;
   my $cnum;
   my $fh;
   open($fh, "$converter $pslFile $gbAcc '$pos' $pslLine 2>&1 $pslLine |")
      or die "ERROR Couldn't run convert_coors2, $!\n";
   while (<$fh>) {
      if (/ERROR/) { $chr = $_; last; }
      /is mapped to (\w+\d*) (\d+) (\+|\-)/;
      $cnum = $2;
      $chr = $1;
      $strand = $3;
   }
   if (!$chr) { close $fh; return("ERROR couldn't convert '$pos'", undef); }
   close $fh or die "ERROR Couldn't finish convert_coors run with $converter $pslFile $gbAcc $pos $pslLine, $!\n";
   if ($chr =~ /ERROR/ && $pos !~ /\d+[\+\*-]\d+/ && $chr =~ /end of gene at (\d+)/) {
      my $end = $1;
      my $diff = $pos - $end;
      open($fh, "$converter $pslFile $gbAcc $end+$diff $pslLine 2>&1 |")
      or die "ERROR Couldn't run convert_coors2, $!\n";
      while (<$fh>) {
         if (/ERROR/) { $chr = $_; last; }
         /is mapped to (\w+\d*) (\d+) (\+|\-)/;
         $cnum = $2;
         $chr = $1;
         $strand = $3;
      }
      close $fh or die "ERROR Couldn't finish convert_coors run with $converter $pslFile $gbAcc $end+$diff $pslLine, $!\n";
   }
   return($chr, $cnum);
}
####End 

#this redoes mutations in gaps as if an insertion in the chrom coors
sub redo_gap {
   my $pos = shift;
   my $pos2 = shift;
   my $chr = shift;
   my $cst;
   my $cend;
   while ($chr =~ /in a gap/) {
      $pos--;
      if ($pos <= 0) { last; }
      ($chr, $cst) = convert_pos($pos);
   }
   if ($chr =~ /ERROR/) { return ($chr, $cst, $cend); }
   ($chr, $cend) = convert_pos($pos2);
   while ($chr =~  /in a gap/) {
      $pos2++;
      ($chr, $cend) = convert_pos($pos2);
   }
   $nameCopy .= ' #insertion in reference#';
   if ($cst > $cend) { #minus strand switch order
      my $t = $cst;
      $cst = $cend;
      $cend = $t;
   }
   if ($cend - $cst > 10) { $chr = 'ERROR did not map to chrom sequence'; }
   return ($chr, $cst, $cend);
}
####End

sub getProtCoors {
    my $pos = shift;
    my $acc = shift;
    my @chr;
    my $fh;
    my $command = "convert_prot_coors3 $pslFile $acc $pos 2>&1 $pslLine |";
    open ($fh, $command)
      or die "ERROR Couldn't run convert_prot_coors3 $pslFile $acc $pos, $!\n";
    while (<$fh>) {
        chomp;
        if (/ERROR/) { $chr[0] = $_; last; }
        @chr = split(/\t/);
    }
    close $fh or die "Couldn't finish convert_prot_coors3 $pslFile $acc $pos $pslLine, $!\n";
    return @chr;
}
####End

sub getOffset {
   my $acc = shift;
   open(FH, '<', $pslFile) or die "ERROR couldn't open gene decription file, $pslFile, $!\n";
   my @f;
   my $off = 0;
   while (<FH>) {
      chomp;
      if (!defined or $_ eq '') { next; }
      if (/\s*#/) { next; } #comment
      @f = split(/\s+/);
      if (scalar @f < 21) { die "ERROR bad psl file format $_\n"; }
      elsif (scalar @f == 22) { shift @f; } #remove bin, not in downloads
      if ($f[9] eq $acc) { last; }
      elsif ($f[9] =~ /^$acc:(\d+)$/) { $off = $1; last; } #have seq with offset
   }
   close FH or die "ERROR reading psl file, $pslFile $!\n";
   if ($f[9] !~ /$acc/) { die "ERROR failed to find $acc in psl file\n"; }
   return $off;
}
####End
