diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java index f6707ea9b..f67eeb9a7 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java @@ -8,6 +8,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.ReadBackedPileup; @@ -37,141 +38,134 @@ public class CallHLAWalker extends LocusWalker>{ int[] HLAstoppos; int numHLAlleles = 0; double[][] Prob; String[][] Alleles; + int j1, k1, j2, k2; Hashtable Scores = new Hashtable(); - //HLAreads.add("Italian Riviera"); + boolean DatabaseLoaded = false; + boolean PrintedOutput = false; public Pair reduceInit() { - try{ - out.printf("Reading HLA database ..."); - FileInputStream fstream = new FileInputStream(HLAdatabaseFile); - // Get the object of DataInputStream - DataInputStream in = new DataInputStream(fstream); - BufferedReader br = new BufferedReader(new InputStreamReader(in)); - String strLine; String [] s = null; - //Read File Line By Line - while ((strLine = br.readLine()) != null) { - s = strLine.split("\\t"); - if (s.length>=10){ - HLAreads.add(CigarFormatted(s[5],s[9])); - HLAcigars.add(s[5]); - HLAnames.add(s[0]); - HLApositions.add(s[3]); - //System.out.println (s[0] + "\t" + s[3]); + //Load sequences corresponding to HLA alleles from sam file + if (!DatabaseLoaded){ + try{ + out.printf("Reading HLA database ..."); + FileInputStream fstream = new FileInputStream(HLAdatabaseFile); + DataInputStream in = new DataInputStream(fstream); + BufferedReader br = new BufferedReader(new InputStreamReader(in)); + String strLine; String [] s = null; + //Read File Line By Line + while ((strLine = br.readLine()) != null) { + s = strLine.split("\\t"); + if (s.length>=10){ + //Parse the reads with cigar parser + HLAreads.add(CigarFormatted(s[5],s[9])); + HLAcigars.add(s[5]); + HLAnames.add(s[0]); + HLApositions.add(s[3]); + } } - } - in.close(); - int n = HLApositions.size(); - numHLAlleles = n; - HLAstartpos = new int[n]; HLAstoppos = new int[n]; - Prob = new double[n][n]; - for (int i = 0; i < n; i++){ - HLAstartpos[i]=Integer.parseInt(HLApositions.get(i)); - HLAstoppos[i]=HLAstartpos[i]+HLAreads.get(i).length()-1; - for (int j = 0; j (0l,0l); } private String CigarFormatted(String cigar, String read){ - String formattedRead = ""; - char c; - String count; + // returns a cigar-formatted sequence (removes insertions, inserts 'D' to where deletions occur + String formattedRead = ""; char c; String count; int cigarPlaceholder = 0; int subcigarLength = 0; int readPlaceholder = 0; int subreadLength = 0; - //out.printf("%s\n",cigar); + + //reads cigar string for (int i = 0; i < cigar.length(); i++){ c = cigar.charAt(i); if (c == 'M'){ + //If reach M for match/mismatch, get number immediately preceeding 'M' and tack on that many characters to sequence subcigarLength = i-cigarPlaceholder; count = cigar.substring(cigarPlaceholder, i); - //read.substring(readPlaceholder, subreadLength) - subreadLength = Integer.parseInt(count); formattedRead = formattedRead + read.substring(readPlaceholder, readPlaceholder+subreadLength); - //out.printf("%sM,%s,%s\n", count,readPlaceholder,readPlaceholder+subreadLength); + //increment placeholders cigarPlaceholder = i+1; - readPlaceholder = readPlaceholder + subreadLength; - + readPlaceholder = readPlaceholder + subreadLength; } else if (c == 'I'){ + //***NOTE: To be modified later if needed (insertions removed here)*** + + //If reaches I for insertion, get number before 'I' and skip that many characters in sequence count = cigar.substring(cigarPlaceholder, i); subreadLength = Integer.parseInt(count); - //formattedRead = formattedRead + read.substring(readPlaceholder, subreadLength); - //out.printf("%sI,%s,%s\n", count,readPlaceholder,readPlaceholder+subreadLength); + + //increment placeholders without adding inserted bases to sequence (effectively removes insertion). cigarPlaceholder = i+1; - readPlaceholder = readPlaceholder + subreadLength; - + readPlaceholder = readPlaceholder + subreadLength; } else if (c == 'D'){ + //If reaches D for deletion, insert 'D' into sequence as placeholder count = cigar.substring(cigarPlaceholder, i); subreadLength = Integer.parseInt(count); + + //Add one 'D' for each deleted base String deletion = ""; for (int j = 1; j <= subreadLength; j++){ deletion = deletion + "D"; } - //out.printf("%sD,%s,%s\n", count,readPlaceholder,readPlaceholder+subreadLength); + + //update placeholders formattedRead = formattedRead + deletion; cigarPlaceholder = i+1; - //readPlaceholder = readPlaceholder + subreadLength; - } } - //out.printf("%s\n",formattedRead); - return formattedRead; } + private static int unsignedByteToInt(byte b) { + //converts base quality from byte to int (not really needed) + return (int) b & 0xFF; + } + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - //Create new context with mapping quality >= 20 List reads = context.getReads(); List offsets = context.getOffsets(); GenomeLoc Gloc = context.getLocation(); - //List newreads = Arrays.asList(); - //List newoffsets = Arrays.asList(); - //remove reads with mapping quality < 20 - - //for (int i = 0; i < reads.size(); i++){ - //out.printf("i=%s,s=%s",i,reads.size()); - //SAMRecord r = reads.get(i); - //if (r.getMappingQuality() < 20){ - //out.printf(",%s removed",i); - //reads.remove(i); - //offsets.remove(i); - //i--; - //} - //out.printf("\n"); - //} - - //reset context to include only high quality reads - context = new AlignmentContext(Gloc,reads,offsets); - - //Create pileup of reads at this locus using filtered context + //Create pileup of reads at this locus ReadBackedPileup pileup = new ReadBackedPileup(ref.getBase(), context); - String bases = pileup.getBases(); - long loc = context.getPosition(); if( context.getReads().size() > 0 ) { @@ -183,104 +177,105 @@ public class CallHLAWalker extends LocusWalker>{ String c1 = ""; String c2 = ""; long pos_k = 0; long pos_j = 0; - for (int i = 0;i < context.getReads().size();i++){ - //out.printf("%s%s ",i,bases.charAt(i)); - char base = bases.charAt(i); - if (base == 'A'){numAs++;} - if (base == 'C'){numCs++;} - if (base == 'T'){numTs++;} - if (base == 'G'){numGs++;} - - } + //Debugging purposes: print location, reference base, pileup, and count (before quality filtering) //out.printf("%s\t", context.getLocation()); - //out.printf("ref=%s\t", ref); - //out.printf("%sAs\t%sCs\t%sTs\t%sGs\t",numAs,numCs,numTs,numGs); - int j1 = 80, k1 = 96, j2 = 80, k2 = 114; + //out.printf("ref=%s\t", ref.getBase()); + //out.printf("%s\t",pileup.getBases().toString()); + //out.printf("%s\t",pileup.getBasePileupAsCountsString()); //Calculate posterior probabilities! - // todo -- note this now uses a flat prior -- rather than a reference biased prior NewHotnessGenotypeLikelihoods G = new NewHotnessGenotypeLikelihoods(); - // todo -- note I've removed the Q20 restriction -- the genotyper takes care of this automatically - G.add(pileup); + //I've tried simply adding the entire pileup to G, but quality scores are not checked, and 'N' bases throw an error + //(NewHotnessGenotypeLikelihoods potentially has bug on line 57) + //G.add(pileup); - //Store confidence scores - // Todo -- there's no need for the hash table now -- you can just use getLikeliood and the DiploidGenotype interface + //Check for bad bases and ensure mapping quality myself. This works. + for (int i = 0; i < context.getReads().size(); i++) { + SAMRecord read = context.getReads().get(i); + int offset = context.getOffsets().get(i); + char base = read.getReadString().charAt(offset); + byte qual = read.getBaseQualities()[offset]; + int mapquality = read.getMappingQuality(); + if (mapquality >= 20 && BaseUtils.simpleBaseToBaseIndex(base) != -1) { + if (base == 'A'){numAs++;} + if (base == 'C'){numCs++;} + if (base == 'T'){numTs++;} + if (base == 'G'){numGs++;} + //consider base in likelihood calculations if it looks good and has high mapping score + G.add(base, qual); + } + } + + //Debugging purposes + //out.printf("A[%s]\tC[%s]\tT[%s]\tG[%s]\t",numAs,numCs,numTs,numGs); + + + //Store confidence scores - this is a local hash that we use to get likelihood given a particular genotype Scores = new Hashtable(); for ( DiploidGenotype g : DiploidGenotype.values() ) { Scores.put(g.toString(), G.getLikelihood(g)); } - //Update probabilities for combinations of alleles - //For each HLA allele + //Get likelihood score for homozygous ref: used to normalize likelihoood scores at 0. + String homref = String.valueOf(ref.getBase())+String.valueOf(ref.getBase()); + Double homreflikelihood = Double.parseDouble((String) Scores.get(homref).toString()); + + + //Update likelihood for each combinations of alleles + Double likelihood = 0.0; String r1 = "", r2 = "", s1 = "", s2 = ""; - j1 = 80; k1 = 96; j2 = 80; k2 = 114; for (int j = 0; j < numHLAlleles; j++){ - //out.print(j+","+loc + "," + HLAstartpos[j] + "," + HLAstoppos[j]); - //check if allele overlaps current position + + //check if allele 1 overlaps current position if (loc >= HLAstartpos[j] && loc <= HLAstoppos[j]){ pos_j = loc - HLAstartpos[j]; c1 = Character.toString(Character.toUpperCase(HLAreads.get(j).charAt((int) pos_j))); - if (j == j1){ //C*010201 - r1 = c1; - } - if (j == k1){ //C*070101 - r2 = c1; - } - if(j == j2){ //C*010201 - s1 = c1; - } - if (j == k2){ //C*150201 - s2 = c1; - } + //Extract bases for HLA alleles indicated in reduceInit (for debugging) + if (j == j1) r1 = c1; + if (j == k1) r2 = c1; + if (j == j2) s1 = c1; + if (j == k2) s2 = c1; + //Updade likelihoods for (int k = 0; k < numHLAlleles; k++){ - //out.print(k+","+loc + "," + HLAstartpos[j] + "," + HLAstoppos[j]); + + //check if allele 2 overlaps current position if (loc >= HLAstartpos[k] && loc <= HLAstoppos[k]){ pos_k = loc - HLAstartpos[k]; c2 = Character.toString(Character.toUpperCase(HLAreads.get(k).charAt((int) pos_k))); - //out.printf("j=%s,k=%s,C1=%s,C2=%s,",j,k,c1,c2); + + //updates likelihoods for both permutations of the alleles, normalized to the likelihood for homozygous reference + if (Scores.containsKey(c1 + c2)){ - String base=c1 + c2; - Prob[j][k]= Prob[j][k]+Double.parseDouble((String) Scores.get(base)); - //out.printf("P(%s)=%s\t",base,Prob[j][k]); + + likelihood = Double.parseDouble((String) Scores.get(c1 + c2).toString()); + Prob[j][k]= Prob[j][k]+ likelihood - homreflikelihood; + }else if(Scores.containsKey(c2 + c1)){ - String base=c2 + c1; - Prob[j][k]= Prob[j][k]+Double.parseDouble((String) Scores.get(base)); - //out.printf("P(%s)=%s\t",base,Prob[j][k]); + + likelihood = Double.parseDouble((String) Scores.get(c2 + c1).toString()); + Prob[j][k]= Prob[j][k]+ likelihood - homreflikelihood; + } - } } - } } - //out.printf("%s%s:%5.0f\t%s%s:%5.0f\t",r1,r2,Prob[j1][k1],s1,s2,Prob[j2][k2]); - //out.printf("%s %5.0f\t",base,score); - if ( !suppressPrinting ){ - } - - //out.printf("\t"); - //mGenotypes.get(0).getConfidenceScore(). - //out.printf("RG for first read: %s%n",context.getReads().get(0).getReadName()); + //Debugging: print updated likelihoods for 2 sets of HLA alleles, as well as normalized likelihoods for all 10 genotypes + //out.printf("%s%s:%5.0f\t%s%s:%5.0f\t",r1,r2,Prob[j1][k1],s1,s2,Prob[j2][k2]); + for ( DiploidGenotype g : DiploidGenotype.values() ) { + //out.printf("%s %5.0f\t",g.toString(),likelihood - homreflikelihood); + } + //out.printf("\n"); + } } + return context.getReads().size(); - - } - - private double[] priorsArray(String priorsString) { - String[] pstrs = priorsString.split(","); - double[] pdbls = new double[pstrs.length]; - - for (int i = 0; i < pstrs.length; i++) { - pdbls[i] = Double.valueOf(pstrs[i]); - } - - return pdbls; } public boolean isReduceByInterval() { @@ -294,27 +289,55 @@ public class CallHLAWalker extends LocusWalker>{ } public void onTraversalDone(Pair result) { - double max = 0; int max_i =0; int max_j = 0; - double max_2 = 0; int max_i_2 =0; int max_j_2 = 0; - double max_3 = 0; int max_i_3 =0; int max_j_3 = 0; - out.printf("\n"); - for (int i = 0; i < numHLAlleles; i++){ - for (int j = 0; j < numHLAlleles; j++){ - if (i >= j ){ - out.printf("%s\t%s\t%5.0f\n",HLAnames.get(i),HLAnames.get(j),Prob[i][j]); - } - if (Prob[i][j] > max){ - max_3 = max_2; max_i_3 = max_i_2; max_j_3 = max_j_2; - max_2 = max; max_i_2 = max_i; max_j_2 = max_j; - max = Prob[i][j]; max_i = i; max_j = j; + //Print HLA allele combinations with highest likelihood sums + if (!PrintedOutput){ + double max = 0; int i_max =0; int j_max = 0; + double m2 = 0; int i_max_2 =0; int j_max_2 = 0; + double m3 = 0; int i_max_3 =0; int j_max_3 = 0; + + out.printf("\n"); + + //Find max scores. + for (int i = 0; i < numHLAlleles; i++){ + for (int j = 0; j < numHLAlleles; j++){ + if (i >= j){ + //Print likelihoods for all alleles + out.printf("%s\t%s\t%5.0f",HLAnames.get(i),HLAnames.get(j),Prob[i][j]); + if (Prob[i][j] > max){ + m3 = m2; i_max_3 = i_max_2; j_max_3 = j_max_2; + m2 = max; i_max_2 = i_max; j_max_2 = j_max; + max = Prob[i][j]; i_max = i; j_max = j; + } + } } } + + //Highest likelihoods + for (int i = 0; i < numHLAlleles; i++){ + for (int j = 0; j < numHLAlleles; j++){ + if (i >= j){ + if (Prob[i][j] == max){ + out.printf("Highest likelihood: %5.0f in %s and %s; i=%s, j=%s\n",max,HLAnames.get(i),HLAnames.get(j),i,j); + + } + } + } + } + + //2nd Highest likelihoods + for (int i = 0; i < numHLAlleles; i++){ + for (int j = 0; j < numHLAlleles; j++){ + if (i >= j){ + if (Prob[i][j] == m2){ + out.printf("2nd Highest likelihood: %5.0f in %s and %s; i=%s, j=%s\n",m2,HLAnames.get(i),HLAnames.get(j),i,j); + } + } + } + } + + PrintedOutput = true; + //out.printf("Average depth of coverage is: %.2f in %d total coverage over %d sites\n",((double)result.getFirst() / (double)result.getSecond()), result.getFirst(), result.getSecond()); } - out.printf("\n"); - out.printf("Highest score: %5.0f in %s and %s\n",max,HLAnames.get(max_i),HLAnames.get(max_j)); - out.printf("2nd Highest score: %5.0f in %s and %s\n",max_2,HLAnames.get(max_i_2),HLAnames.get(max_j_2)); - out.printf("3rd Highest score: %5.0f in %s and %s\n",max_3,HLAnames.get(max_i_3),HLAnames.get(max_j_3)); - //out.printf("Average depth of coverage is: %.2f in %d total coverage over %d sites\n",((double)result.getFirst() / (double)result.getSecond()), result.getFirst(), result.getSecond()); } }