#!/usr/bin/perl -w use strict; #PAH uses HGVS nomenclature except 3' end, where numbers just continue #CHECK compound entries to see if need type to be complex (1) my $build = 'hg17'; my $genomeVarFile = 'gvPah.txt'; my $posFile = "gvPosPah.$build.txt"; my $linkFile = 'gvLinkPah.txt'; my $attrFile = 'gvAttrPah.txt'; my $badFile = 'unparsedPah.txt'; my $script = "../parseHgvsName2 U49897.$build.psl U49897.1"; my $faFile = 'U49897.cds.fa'; my $genomicFile = 'AF404777.fa'; my $genomicOffset = 27778; #nts before ATG, where PAH starts 1 my $fh; my $fh2; my $fh3; my $fh4; my $bfh; open($fh, ">", $genomeVarFile) or die "Couldn't open output file, $!\n"; open($fh2, ">", $posFile) or die "Couldn't open output file2, $!\n"; open($fh3, ">", $attrFile) or die "Couldn't open output file3, $!\n"; open($fh4, ">", $linkFile) or die "Couldn't open output file4, $!\n"; open($bfh, ">", $badFile) or die "Couldn't open output fileb, $!\n"; my $fcnt = 0; my $bcnt = 0; my $verCnt = 0; my %pkey; #IDs are primary key in gv table my %intron = ( #numbers from PAHdb website # '1'=>'61-4232', #codingNt=>firstIntronNt(genomic) 60=>61, 61=>4232, # '2'=>'4341-22214', 168=>4341, 169=>22214, # '3'=>'22399-39585', 352=>22399, 353=>39585, # '4'=>'39675-50549', 441=>39675, 442=>50549, # '5'=>'50618-61889', 509=>50618, 510=>61889, # '6'=>'62087-64271', 706=>62087, 707=>64271, # '7'=>'64408-65465', 842=>64408, 843=>65465, # '8'=>'65536-70272', 912=>65536, 913=>70272, # '9'=>'70330-72792', 969=>70330, 970=>72792, # '10'=>'72889-73444', 1065=>72889, 1066=>73444, # '11'=>'73579-76708', 1199=>73579, 1200=>76708, # '12'=>'76825-78005', 1315=>76825, 1316=>78005, ); print "Writing files $genomeVarFile, $posFile, $attrFile and $badFile\n"; #genomeVar format #chrom start stop name mutId srcId baseChangeType location accuracy my $srcId = 'PAHdb'; my $accuracy = 1; #allow inexact deletions my $strand; my %names; #parse data from hgvs style name my $r = 0; while (<>) { chomp; my $line = $_; if (!defined $line or $line eq '' or $line =~ /^\s*#/) { next; } #had 2 different file formats, comment/uncomment depending on format #if ($line =~ /\*\d+\*/) { $r = 1; next; } $r++; #if ($r == 3) { #$line =~ s/^\s+//; parseDataLine($line); #} } close $fh or die "Couldn't close outfile, $!\n"; close $fh2 or die "Couldn't close outfile2, $!\n"; close $fh3 or die "Couldn't close outfile3, $!\n"; close $fh4 or die "Couldn't close outfile4, $!\n"; close $bfh or die "Couldn't close outfileb, $!\n"; print "found numbers for $fcnt entries\n", "verified sequence for $verCnt entries\n", "found $bcnt entries that couldn't parse\n"; exit; #this calls the script that parses by hgvs name sub parse_hgvs_name { my $name = shift; my $chr; my $st; my $end; my $type; my $fh2; my $n = $name; #escape for shell #remove uncertaintities for dels if ($n =~ /c\.(\d+)[+-]\?_(\d+)[+-]\?del/) { $accuracy = 0; $n = "c.${1}_${2}del"; }else { $accuracy = 1; } $n =~ s/'/\\'/; open ($fh2, "$script '$n' 2>&1 |") or die "Couldn't run parseHgvsName2, $!\n"; while (<$fh2>) { chomp; if (/ERROR/) { $chr = $_; next; } my $t; #extra throw out if (!$chr) { ($chr, $st, $end, $t, $type, $strand) = split(/\t/); } } close $fh2 or die "Couldn't finish parseHgvsName run for $name, $!\n"; $st--; #switch to ucsc numbers return ($chr, $st, $end, $name, $type); } ####End #this guesses the location based on the name sub loc_from_name { my $name = shift @_; my $loc = 'unknown'; if ($name =~ /c\.(\-?\d+[+-]?\d*)/) { my $pos = $1; if ($accuracy == 0) { $loc = 'not within known transcription unit'; }elsif ($pos =~ /\d+[+-]\d+/) { $loc = 'intron'; }elsif ($pos < -472) { $loc = 'not within known transcription unit'; }elsif ($pos < 0) { $loc = '5\' UTR'; }elsif ($pos <= 1359) { $loc = 'exon'; }elsif ($pos <= 2202) { $loc = '3\' UTR'; }else { $loc = 'not within known transcription unit'; } } return $loc; } ####End sub loc_from_region { my $reg = shift @_; my $loc; if (!$reg) { return; } if ($reg =~ /^E\d+$/) { $loc = 'exon'; }elsif ($reg =~ /^I\d+$/) { $loc = 'intron'; }elsif ($reg =~ /5' UTR/) { $loc = '5\' UTR'; }elsif ($reg =~ /3' UTR/) { $loc = '3\' UTR'; } return $loc; } ####End #this parses a data line with fields #Mutation ID, Systematic Name, Mutation Name, Other Name, Region, Mutation Type sub parseDataLine { my $line = shift @_; my @f = split(/\t/, $line); #shift around file format 2 to match first part of file format 1 shift @f; #remove count shift @f; #remove reference ID splice(@f, 1, 1); #remove Nucleotide No. splice(@f, 2, 1); #remove Amino Acid #end of shifting list around my @chr; my $loc; $f[1] =~ s/^\s+//; $f[1] =~ s/\s+$//; if ($f[1] !~ /;/) { #check sequence based on hgvs style name my $ch = sequenceCheck($f[1]); if (!$ch) { print $bfh $line, "\n"; $bcnt++; return; }elsif ($ch == 2) { $verCnt++; } @chr = parse_hgvs_name($f[1]); if (@chr && $chr[0] !~ /ERROR/) { $loc = loc_from_region($f[4]); if (exists $pkey{"PAH_$f[1]"}) { print "WARNING duplicate ID PAH_$f[1]\n"; }else { $pkey{"PAH_$f[1]"} = 1; } if (!$loc) { $loc = loc_from_name($f[1]); } print $fh2 "$chr[0]\t$chr[1]\t$chr[2]\tPAH_$f[1]\t$strand\tU49897.1(PAH):$chr[3]\n"; print $fh "PAH_$f[1]\tPAH $chr[3]\t$srcId\t$chr[4]\t$loc\t$accuracy\n"; #print aliases to a file also $f[2] =~ s/^\s+//; $f[2] =~ s/\s+$//; $f[3] =~ s/^\s+//; $f[3] =~ s/\s+$//; $f[0] =~ s/^\s+//; $f[0] =~ s/\s+$//; print $fh3 "PAH_$f[1]\tcommonName\t$f[2]\n"; print $fh3 "PAH_$f[1]\talias\t$f[3]\n"; print $fh4 "PAH_$f[1]\tsrcLink\tPAHdb\t$f[0]\t\n"; if ($f[5] && $f[5] ne '') { $f[5] =~ s/^\s+//; $f[5] =~ s/\s+$//; print $fh3 "PAH_$f[1]\tmutType\t$f[5]\n"; #status based on type? check intersect with conserved later? if ($f[5] =~ /missense|nonsense|splice/i or ($f[5] =~ /deletion|insertion/i && $chr[4] eq 'exon')) { print $fh3 "PAH_$f[1]\tdisease\tlikely to be phenotype-associated\n"; } } if ($f[5] && $f[5] =~ /missense|nonsense/i) { my $t = $f[2]; $t =~ s/^p.//; print $fh3 "PAH_$f[1]\tprotEffect\t$t\n"; } #spID P00439 is PAH gene (omimID 261600) #provide link to OMIM gene print $fh4 "PAH_$f[1]\tgeneVarsDis\tomimTitle2\t261600\t\n"; $fcnt++; }else { print $bfh $line, "\n"; $bcnt++; } }else { #compound my @t = split(/;/, $f[1]); my $do_alias = 0; #only print once for gv file $loc = loc_from_region($f[4]); if (!$loc) { $loc = loc_from_name($f[1]); } if (exists $pkey{"PAH_$f[1]"}) { print "WARNING duplicate ID PAH_$f[1]\n"; }else { $pkey{"PAH_$f[1]"} = 1; } print $fh "PAH_$f[1]\t$f[1]\t$srcId\tcomplex\t$loc\t1\n"; foreach my $t (@t) { $t =~ s/\[//; #remove brackets if used $t =~ s/\]//; #check sequence based on hgvs style name my $ch = sequenceCheck($t); if (!$ch) { print $bfh $line, "\n"; $bcnt++; return; }elsif ($ch == 2) { $verCnt++; } @chr = parse_hgvs_name($t); if (!$accuracy) { #this will not match what was recorded above print STDERR "Warning accuracy set to 0 for compound mutation ($f[1])\n"; } if (@chr && $chr[0] !~ /ERROR/) { print $fh2 "$chr[0]\t$chr[1]\t$chr[2]\tPAH_$f[1]\t$strand\tU49897.1(PAH):c.[..$t..]\n"; #print aliases to a file also $do_alias = 1; }else { print $bfh $line, "\n"; #$do_alias = 0; $bcnt++; last; } } #only print aliases once if ($do_alias) { $fcnt++; if (!$f[2] or !$f[3] or !$f[0]) { die "ERROR in $f[1]\n"; } $f[2] =~ s/^\s+//; $f[2] =~ s/\s+$//; $f[3] =~ s/^\s+//; $f[3] =~ s/\s+$//; $f[0] =~ s/^\s+//; $f[0] =~ s/\s+$//; print $fh3 "PAH_$f[1]\tcommonName\t$f[2]\n"; print $fh3 "PAH_$f[1]\talias\t$f[3]\n"; print $fh4 "PAH_$f[1]\tsrcLink\tPAHdb\t$f[0]\t\n"; if ($f[5] && $f[5] ne '') { $f[5] =~ s/^\s+//; $f[5] =~ s/\s+$//; print $fh3 "PAH_$f[1]\tmutType\t$f[5]\n"; #status based on type? check intersect with conserved later? if ($f[5] =~ /missense|nonsense|splice/i or ($f[5] =~ /deletion|insertion/i && $chr[4] eq 'exon')) { print $fh3 "PAH_$f[1]\tdisease\tlikely to be phenotype-associated\n"; } } if ($f[5] && $f[5] =~ /missense|nonsense/) { print $fh3 "PAH_$f[1]\tprotEffect\t$f[2]\n"; } #spID P00439 is PAH gene (omimID 261600) #provide link to OMIM gene print $fh4 "PAH_$f[1]\tgeneVarsDis\tomimTitle2\t261600\t\n"; } } } ####End sub sequenceCheck { my $name = shift @_; my $verified = 0; my $bad; my $file = $faFile; if ($name =~ /(\d+[+-]\d+_*\d*[+-]*\d*)/) { #intron #change to genomic so can check my $pos = $1; my $st; my $end; if ($pos =~ /(\d+[+-]\d+)_(\d+[+-]\d+)/) { $st = $1; $end = $2; }else { $st = $pos; } $st =~ /(\d+)([+-]\d+)/; my $x = $1; my $off = $2; if ($off > 0) { $st = $genomicOffset + $intron{$x} + $off - 1; }else { $st = $genomicOffset + $intron{$x} + $off + 1; } if ($end) { $end =~ /(\d+)([+-]\d+)/; my $x = $1; my $off = $2; if ($off > 0) { $end = $genomicOffset + $intron{$x} + $off - 1; }else { $end = $genomicOffset + $intron{$x} + $off + 1; } $name =~ s/c\.$pos/g\.${st}_$end/; }else { $pos =~ s/\+/\\+/; #why do I need this here and not above? $name =~ s/c\.$pos/g\.$st/g; } $file = $genomicFile; }elsif ($name =~ /c\.(-\d+_*-*\d*)/) { #utr my $pos = $1; my $st = $pos; my $end; if ($pos =~ /(-\d+)_(-*\d+)/) { $end = $2; $st = $1; } $st = $st + $genomicOffset + 1; if ($end) { $end = $end + $genomicOffset + 1; $name =~ s/c\.$pos/g\.${st}_$end/; }else { $name =~ s/c\.$pos/g\.$st/; } $file = $genomicFile; } if ($name =~ /'/) { return undef; } #illegal char can't check open(CFH, "../sequenceCheck $file '$name' 2>&1 |") or die "Couldn't run sequenceCheck, $!\n"; my $test; while () { chomp; if (/Sequence doesn't match/) { $bad = $_; } elsif (/Sequence matched$/) { $verified++; } else { $test = $_; } } close(CFH) or die "Couldn't finish sequenceCheck with ../sequenceCheck $faFile '$name', $!:exit status $?\n"; if ($bad) { print $bfh "#$bad\n"; return 0; }elsif ($verified) { return 2; }else { #print $fh "#TESTING not checked sequence for $name got $test\n"; return 1; } } ####End #how to do links: #by id (per author?) #http://www.pahdb.mcgill.ca/cgi-bin/pahdb/pahdbsearch.cgi?Field=id_mut&OrderedField=nucl_no&Value=554&SortType=Asc&Search=Mutation&Go=1&ToShowFrom=0&F2S1=id_mut&F2S2=nucl_no&F2S3=syst_name&F2S5=amino_acid&F2S6=mut_name&F2S7=other_name&F2S17=Mutation.comment