Fixed bug: likelihoods were previously being reset after each region/exon.

Added improved comments and documentation.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1386 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
sjia 2009-08-05 18:44:46 +00:00
parent 0d47798721
commit d60d5aa516
1 changed file with 190 additions and 167 deletions

View File

@ -8,6 +8,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.ReadBackedPileup;
@ -37,141 +38,134 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
int[] HLAstoppos;
int numHLAlleles = 0;
double[][] Prob; String[][] Alleles;
int j1, k1, j2, k2;
Hashtable Scores = new Hashtable();
//HLAreads.add("Italian Riviera");
boolean DatabaseLoaded = false;
boolean PrintedOutput = false;
public Pair<Long, Long> reduceInit() {
try{
out.printf("Reading HLA database ...");
FileInputStream fstream = new FileInputStream(HLAdatabaseFile);
// Get the object of DataInputStream
DataInputStream in = new DataInputStream(fstream);
BufferedReader br = new BufferedReader(new InputStreamReader(in));
String strLine; String [] s = null;
//Read File Line By Line
while ((strLine = br.readLine()) != null) {
s = strLine.split("\\t");
if (s.length>=10){
HLAreads.add(CigarFormatted(s[5],s[9]));
HLAcigars.add(s[5]);
HLAnames.add(s[0]);
HLApositions.add(s[3]);
//System.out.println (s[0] + "\t" + s[3]);
//Load sequences corresponding to HLA alleles from sam file
if (!DatabaseLoaded){
try{
out.printf("Reading HLA database ...");
FileInputStream fstream = new FileInputStream(HLAdatabaseFile);
DataInputStream in = new DataInputStream(fstream);
BufferedReader br = new BufferedReader(new InputStreamReader(in));
String strLine; String [] s = null;
//Read File Line By Line
while ((strLine = br.readLine()) != null) {
s = strLine.split("\\t");
if (s.length>=10){
//Parse the reads with cigar parser
HLAreads.add(CigarFormatted(s[5],s[9]));
HLAcigars.add(s[5]);
HLAnames.add(s[0]);
HLApositions.add(s[3]);
}
}
}
in.close();
int n = HLApositions.size();
numHLAlleles = n;
HLAstartpos = new int[n]; HLAstoppos = new int[n];
Prob = new double[n][n];
for (int i = 0; i < n; i++){
HLAstartpos[i]=Integer.parseInt(HLApositions.get(i));
HLAstoppos[i]=HLAstartpos[i]+HLAreads.get(i).length()-1;
for (int j = 0; j <n; j++){
Prob[i][j]=0;
}
//System.out.println (HLAstartpos[i] + " to " + HLAstoppos[i]);
}
out.printf("DONE!\n");
}catch (Exception e){//Catch exception if any
System.err.println("Error: " + e.getMessage());
}
in.close();
int n = HLApositions.size(); numHLAlleles = n;
HLAstartpos = new int[n]; HLAstoppos = new int[n];
Prob = new double[n][n];
//out.printf("%s,%s\n",HLAnames.get(22),HLAnames.get(68));
//String read = HLAreads.get(22);
//String cigar = HLAcigars.get(22);
//out.printf("%s,%s\n",cigar,read);
//HLAreads.set(22, CigarFormatted(cigar,read));
//HLAreads.set(68, CigarFormatted(HLAcigars.get(68),HLAreads.get(68)));
out.printf("Comparing reads to database ...\n");
int j1 = 80, k1 = 96, j2 = 80, k2 = 114;
out.printf("%s,%s\t%s,%s\n",HLAnames.get(j1),HLAnames.get(k1),HLAnames.get(j2),HLAnames.get(k2));
for (int i = 0; i < n; i++){
//Find start and stop positions for each allele
HLAstartpos[i]=Integer.parseInt(HLApositions.get(i));
HLAstoppos[i]=HLAstartpos[i]+HLAreads.get(i).length()-1;
//Initialize matrix of probabilities / likelihoods
for (int j = 0; j <n; j++){
Prob[i][j]=0;
}
//For debugging: get index for specific alleles
if (HLAnames.get(i).equals("HLA_A*310102"))
j1 = i;
if (HLAnames.get(i).equals("HLA_A*320101"))
k1 = i;
if (HLAnames.get(i).equals("HLA_A*330301"))
j2 = i;
if (HLAnames.get(i).equals("HLA_A*0265"))
k2 = i;
}
out.printf("DONE!\n");
}catch (Exception e){//Catch exception if any
System.err.println("Error: " + e.getMessage());
}
DatabaseLoaded = true;
out.printf("Comparing reads to database ...\n");
//For debugging: prints which HLA alleles were indexed before
//out.printf("%s,%s\t%s,%s\n",HLAnames.get(j1),HLAnames.get(k1),HLAnames.get(j2),HLAnames.get(k2));
}
return new Pair<Long,Long>(0l,0l);
}
private String CigarFormatted(String cigar, String read){
String formattedRead = "";
char c;
String count;
// returns a cigar-formatted sequence (removes insertions, inserts 'D' to where deletions occur
String formattedRead = ""; char c; String count;
int cigarPlaceholder = 0; int subcigarLength = 0;
int readPlaceholder = 0; int subreadLength = 0;
//out.printf("%s\n",cigar);
//reads cigar string
for (int i = 0; i < cigar.length(); i++){
c = cigar.charAt(i);
if (c == 'M'){
//If reach M for match/mismatch, get number immediately preceeding 'M' and tack on that many characters to sequence
subcigarLength = i-cigarPlaceholder;
count = cigar.substring(cigarPlaceholder, i);
//read.substring(readPlaceholder, subreadLength)
subreadLength = Integer.parseInt(count);
formattedRead = formattedRead + read.substring(readPlaceholder, readPlaceholder+subreadLength);
//out.printf("%sM,%s,%s\n", count,readPlaceholder,readPlaceholder+subreadLength);
//increment placeholders
cigarPlaceholder = i+1;
readPlaceholder = readPlaceholder + subreadLength;
readPlaceholder = readPlaceholder + subreadLength;
} else if (c == 'I'){
//***NOTE: To be modified later if needed (insertions removed here)***
//If reaches I for insertion, get number before 'I' and skip that many characters in sequence
count = cigar.substring(cigarPlaceholder, i);
subreadLength = Integer.parseInt(count);
//formattedRead = formattedRead + read.substring(readPlaceholder, subreadLength);
//out.printf("%sI,%s,%s\n", count,readPlaceholder,readPlaceholder+subreadLength);
//increment placeholders without adding inserted bases to sequence (effectively removes insertion).
cigarPlaceholder = i+1;
readPlaceholder = readPlaceholder + subreadLength;
readPlaceholder = readPlaceholder + subreadLength;
} else if (c == 'D'){
//If reaches D for deletion, insert 'D' into sequence as placeholder
count = cigar.substring(cigarPlaceholder, i);
subreadLength = Integer.parseInt(count);
//Add one 'D' for each deleted base
String deletion = "";
for (int j = 1; j <= subreadLength; j++){
deletion = deletion + "D";
}
//out.printf("%sD,%s,%s\n", count,readPlaceholder,readPlaceholder+subreadLength);
//update placeholders
formattedRead = formattedRead + deletion;
cigarPlaceholder = i+1;
//readPlaceholder = readPlaceholder + subreadLength;
}
}
//out.printf("%s\n",formattedRead);
return formattedRead;
}
private static int unsignedByteToInt(byte b) {
//converts base quality from byte to int (not really needed)
return (int) b & 0xFF;
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
//Create new context with mapping quality >= 20
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
GenomeLoc Gloc = context.getLocation();
//List<SAMRecord> newreads = Arrays.asList();
//List<Integer> newoffsets = Arrays.asList();
//remove reads with mapping quality < 20
//for (int i = 0; i < reads.size(); i++){
//out.printf("i=%s,s=%s",i,reads.size());
//SAMRecord r = reads.get(i);
//if (r.getMappingQuality() < 20){
//out.printf(",%s removed",i);
//reads.remove(i);
//offsets.remove(i);
//i--;
//}
//out.printf("\n");
//}
//reset context to include only high quality reads
context = new AlignmentContext(Gloc,reads,offsets);
//Create pileup of reads at this locus using filtered context
//Create pileup of reads at this locus
ReadBackedPileup pileup = new ReadBackedPileup(ref.getBase(), context);
String bases = pileup.getBases();
long loc = context.getPosition();
if( context.getReads().size() > 0 ) {
@ -183,104 +177,105 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
String c1 = ""; String c2 = "";
long pos_k = 0; long pos_j = 0;
for (int i = 0;i < context.getReads().size();i++){
//out.printf("%s%s ",i,bases.charAt(i));
char base = bases.charAt(i);
if (base == 'A'){numAs++;}
if (base == 'C'){numCs++;}
if (base == 'T'){numTs++;}
if (base == 'G'){numGs++;}
}
//Debugging purposes: print location, reference base, pileup, and count (before quality filtering)
//out.printf("%s\t", context.getLocation());
//out.printf("ref=%s\t", ref);
//out.printf("%sAs\t%sCs\t%sTs\t%sGs\t",numAs,numCs,numTs,numGs);
int j1 = 80, k1 = 96, j2 = 80, k2 = 114;
//out.printf("ref=%s\t", ref.getBase());
//out.printf("%s\t",pileup.getBases().toString());
//out.printf("%s\t",pileup.getBasePileupAsCountsString());
//Calculate posterior probabilities!
// todo -- note this now uses a flat prior -- rather than a reference biased prior
NewHotnessGenotypeLikelihoods G = new NewHotnessGenotypeLikelihoods();
// todo -- note I've removed the Q20 restriction -- the genotyper takes care of this automatically
G.add(pileup);
//I've tried simply adding the entire pileup to G, but quality scores are not checked, and 'N' bases throw an error
//(NewHotnessGenotypeLikelihoods potentially has bug on line 57)
//G.add(pileup);
//Store confidence scores
// Todo -- there's no need for the hash table now -- you can just use getLikeliood and the DiploidGenotype interface
//Check for bad bases and ensure mapping quality myself. This works.
for (int i = 0; i < context.getReads().size(); i++) {
SAMRecord read = context.getReads().get(i);
int offset = context.getOffsets().get(i);
char base = read.getReadString().charAt(offset);
byte qual = read.getBaseQualities()[offset];
int mapquality = read.getMappingQuality();
if (mapquality >= 20 && BaseUtils.simpleBaseToBaseIndex(base) != -1) {
if (base == 'A'){numAs++;}
if (base == 'C'){numCs++;}
if (base == 'T'){numTs++;}
if (base == 'G'){numGs++;}
//consider base in likelihood calculations if it looks good and has high mapping score
G.add(base, qual);
}
}
//Debugging purposes
//out.printf("A[%s]\tC[%s]\tT[%s]\tG[%s]\t",numAs,numCs,numTs,numGs);
//Store confidence scores - this is a local hash that we use to get likelihood given a particular genotype
Scores = new Hashtable();
for ( DiploidGenotype g : DiploidGenotype.values() ) {
Scores.put(g.toString(), G.getLikelihood(g));
}
//Update probabilities for combinations of alleles
//For each HLA allele
//Get likelihood score for homozygous ref: used to normalize likelihoood scores at 0.
String homref = String.valueOf(ref.getBase())+String.valueOf(ref.getBase());
Double homreflikelihood = Double.parseDouble((String) Scores.get(homref).toString());
//Update likelihood for each combinations of alleles
Double likelihood = 0.0;
String r1 = "", r2 = "", s1 = "", s2 = "";
j1 = 80; k1 = 96; j2 = 80; k2 = 114;
for (int j = 0; j < numHLAlleles; j++){
//out.print(j+","+loc + "," + HLAstartpos[j] + "," + HLAstoppos[j]);
//check if allele overlaps current position
//check if allele 1 overlaps current position
if (loc >= HLAstartpos[j] && loc <= HLAstoppos[j]){
pos_j = loc - HLAstartpos[j];
c1 = Character.toString(Character.toUpperCase(HLAreads.get(j).charAt((int) pos_j)));
if (j == j1){ //C*010201
r1 = c1;
}
if (j == k1){ //C*070101
r2 = c1;
}
if(j == j2){ //C*010201
s1 = c1;
}
if (j == k2){ //C*150201
s2 = c1;
}
//Extract bases for HLA alleles indicated in reduceInit (for debugging)
if (j == j1) r1 = c1;
if (j == k1) r2 = c1;
if (j == j2) s1 = c1;
if (j == k2) s2 = c1;
//Updade likelihoods
for (int k = 0; k < numHLAlleles; k++){
//out.print(k+","+loc + "," + HLAstartpos[j] + "," + HLAstoppos[j]);
//check if allele 2 overlaps current position
if (loc >= HLAstartpos[k] && loc <= HLAstoppos[k]){
pos_k = loc - HLAstartpos[k];
c2 = Character.toString(Character.toUpperCase(HLAreads.get(k).charAt((int) pos_k)));
//out.printf("j=%s,k=%s,C1=%s,C2=%s,",j,k,c1,c2);
//updates likelihoods for both permutations of the alleles, normalized to the likelihood for homozygous reference
if (Scores.containsKey(c1 + c2)){
String base=c1 + c2;
Prob[j][k]= Prob[j][k]+Double.parseDouble((String) Scores.get(base));
//out.printf("P(%s)=%s\t",base,Prob[j][k]);
likelihood = Double.parseDouble((String) Scores.get(c1 + c2).toString());
Prob[j][k]= Prob[j][k]+ likelihood - homreflikelihood;
}else if(Scores.containsKey(c2 + c1)){
String base=c2 + c1;
Prob[j][k]= Prob[j][k]+Double.parseDouble((String) Scores.get(base));
//out.printf("P(%s)=%s\t",base,Prob[j][k]);
likelihood = Double.parseDouble((String) Scores.get(c2 + c1).toString());
Prob[j][k]= Prob[j][k]+ likelihood - homreflikelihood;
}
}
}
}
}
//out.printf("%s%s:%5.0f\t%s%s:%5.0f\t",r1,r2,Prob[j1][k1],s1,s2,Prob[j2][k2]);
//out.printf("%s %5.0f\t",base,score);
if ( !suppressPrinting ){
}
//out.printf("\t");
//mGenotypes.get(0).getConfidenceScore().
//out.printf("RG for first read: %s%n",context.getReads().get(0).getReadName());
//Debugging: print updated likelihoods for 2 sets of HLA alleles, as well as normalized likelihoods for all 10 genotypes
//out.printf("%s%s:%5.0f\t%s%s:%5.0f\t",r1,r2,Prob[j1][k1],s1,s2,Prob[j2][k2]);
for ( DiploidGenotype g : DiploidGenotype.values() ) {
//out.printf("%s %5.0f\t",g.toString(),likelihood - homreflikelihood);
}
//out.printf("\n");
}
}
return context.getReads().size();
}
private double[] priorsArray(String priorsString) {
String[] pstrs = priorsString.split(",");
double[] pdbls = new double[pstrs.length];
for (int i = 0; i < pstrs.length; i++) {
pdbls[i] = Double.valueOf(pstrs[i]);
}
return pdbls;
}
public boolean isReduceByInterval() {
@ -294,27 +289,55 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
}
public void onTraversalDone(Pair<Long, Long> result) {
double max = 0; int max_i =0; int max_j = 0;
double max_2 = 0; int max_i_2 =0; int max_j_2 = 0;
double max_3 = 0; int max_i_3 =0; int max_j_3 = 0;
out.printf("\n");
for (int i = 0; i < numHLAlleles; i++){
for (int j = 0; j < numHLAlleles; j++){
if (i >= j ){
out.printf("%s\t%s\t%5.0f\n",HLAnames.get(i),HLAnames.get(j),Prob[i][j]);
}
if (Prob[i][j] > max){
max_3 = max_2; max_i_3 = max_i_2; max_j_3 = max_j_2;
max_2 = max; max_i_2 = max_i; max_j_2 = max_j;
max = Prob[i][j]; max_i = i; max_j = j;
//Print HLA allele combinations with highest likelihood sums
if (!PrintedOutput){
double max = 0; int i_max =0; int j_max = 0;
double m2 = 0; int i_max_2 =0; int j_max_2 = 0;
double m3 = 0; int i_max_3 =0; int j_max_3 = 0;
out.printf("\n");
//Find max scores.
for (int i = 0; i < numHLAlleles; i++){
for (int j = 0; j < numHLAlleles; j++){
if (i >= j){
//Print likelihoods for all alleles
out.printf("%s\t%s\t%5.0f",HLAnames.get(i),HLAnames.get(j),Prob[i][j]);
if (Prob[i][j] > max){
m3 = m2; i_max_3 = i_max_2; j_max_3 = j_max_2;
m2 = max; i_max_2 = i_max; j_max_2 = j_max;
max = Prob[i][j]; i_max = i; j_max = j;
}
}
}
}
//Highest likelihoods
for (int i = 0; i < numHLAlleles; i++){
for (int j = 0; j < numHLAlleles; j++){
if (i >= j){
if (Prob[i][j] == max){
out.printf("Highest likelihood: %5.0f in %s and %s; i=%s, j=%s\n",max,HLAnames.get(i),HLAnames.get(j),i,j);
}
}
}
}
//2nd Highest likelihoods
for (int i = 0; i < numHLAlleles; i++){
for (int j = 0; j < numHLAlleles; j++){
if (i >= j){
if (Prob[i][j] == m2){
out.printf("2nd Highest likelihood: %5.0f in %s and %s; i=%s, j=%s\n",m2,HLAnames.get(i),HLAnames.get(j),i,j);
}
}
}
}
PrintedOutput = true;
//out.printf("Average depth of coverage is: %.2f in %d total coverage over %d sites\n",((double)result.getFirst() / (double)result.getSecond()), result.getFirst(), result.getSecond());
}
out.printf("\n");
out.printf("Highest score: %5.0f in %s and %s\n",max,HLAnames.get(max_i),HLAnames.get(max_j));
out.printf("2nd Highest score: %5.0f in %s and %s\n",max_2,HLAnames.get(max_i_2),HLAnames.get(max_j_2));
out.printf("3rd Highest score: %5.0f in %s and %s\n",max_3,HLAnames.get(max_i_3),HLAnames.get(max_j_3));
//out.printf("Average depth of coverage is: %.2f in %d total coverage over %d sites\n",((double)result.getFirst() / (double)result.getSecond()), result.getFirst(), result.getSecond());
}
}