HLA caller updated to examine class II loci, updated pointers to dictionary, allele frequencies.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3290 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
sjia 2010-05-03 14:54:52 +00:00
parent 97fdd92e7b
commit 94b51de401
9 changed files with 668 additions and 130 deletions

View File

@ -56,10 +56,10 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
public String ethnicity = "CaucasianUSA";
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAllelesCommon";
Hashtable AlleleFrequencies,UniqueAlleles;
CigarParser formatter = new CigarParser();
@ -95,6 +95,10 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
UniqueAlleles = HLAfreqReader.GetUniqueAlleles();
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
//out.printf("INFO Common alleles:\n");
for (int i = 1; i < UniqueAlleles.size(); i++){
//out.printf("INFO %s\n",UniqueAlleles.values().toArray()[i]);
}
out.printf("INFO Reading HLA dictionary ...");
@ -188,12 +192,18 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
for (int i = 0; i < numreads; i++){
name1 = HLAnames[i].substring(4);
if (UniqueAlleles.containsKey(name1)){
String [] n1 = name1.split("\\*");
// out.printf("1: %s\n",n1[0] + "*" + n1[1].substring(0, 3));
if (UniqueAlleles.containsKey(n1[0] + "*" + n1[1].substring(0, 4))){
//out.printf("1: %s\n",name1);
//frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
//if (frq1 > minfrq){
for (int j = i; j < numreads; j++){
name2 = HLAnames[j].substring(4);
if (UniqueAlleles.containsKey(name2)){
String [] n2 = name2.split("\\*");
// out.printf("2: %s\n",n2[0] + "*" + n2[1].substring(0, 3));
if (UniqueAlleles.containsKey(n2[0] + "*" + n2[1].substring(0, 4))){
// frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
// if (frq2 > minfrq){
if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){
@ -211,8 +221,8 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
//}else{
// if (DEBUG){out.printf("%s has allele frequency%.5f\n",name1,frq1);}
//}
}else{
if (DEBUG){out.printf("%s not found in allele frequency file\n",name1);}
//}else{
// if (DEBUG){out.printf("%s not found in allele frequency file\n",name1);}
}
}
}

View File

@ -55,8 +55,8 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
public int MINALLOWEDMISMATCHES = 7;
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";

View File

@ -67,12 +67,12 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
GATKArgumentCollection args = this.getToolkit().getArguments();
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAllelesCommon";
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
SAMFileReader HLADictionaryReader = new SAMFileReader();
boolean HLAdataLoaded = false;
String[] HLAnames, HLAreads;
@ -216,7 +216,8 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
out.printf("NUM\tAllele1\tAllele2\tPhase\tFrq1\tFrq2\n");
for (int i = 0; i < HLAnames.length; i++){
name1 = HLAnames[i].substring(4);
if (UniqueAlleles.containsKey(name1)){
String [] n1 = name1.split("\\*");
if (UniqueAlleles.containsKey(n1[0] + "*" + n1[1].substring(0, 4))){
if (AlleleFrequencies.containsKey(name1)){
frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
}else{
@ -225,19 +226,19 @@ public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer
//if (frq1 > minfrq){
for (int j = i; j < HLAnames.length; j++){
name2 = HLAnames[j].substring(4);
if (name1.substring(0,1).equals(name2.substring(0,1))){
if (UniqueAlleles.containsKey(name2)){
if (AlleleFrequencies.containsKey(name2)){
frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
}else{
frq2 = .0001;
}
// if (frq2 > minfrq){
likelihood = CalculatePhaseLikelihood(i,j,false);
numCombinations++;
out.printf("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n",numCombinations,name1,name2,likelihood,Math.log10(frq1),Math.log10(frq2));
// }
String [] n2 = name2.split("\\*");
if (n1[0].equals(n2[0]) && UniqueAlleles.containsKey(n2[0] + "*" + n2[1].substring(0, 4))){
if (AlleleFrequencies.containsKey(name2)){
frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
}else{
frq2 = .0001;
}
// if (frq2 > minfrq){
likelihood = CalculatePhaseLikelihood(i,j,false);
numCombinations++;
out.printf("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n",numCombinations,name1,name2,likelihood,Math.log10(frq1),Math.log10(frq2));
// }
}
}
//}

View File

@ -0,0 +1,322 @@
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.commandline.Argument;
import java.util.ArrayList;
import java.util.Hashtable;
/**
* Compares reads to longest read at each exon. Usage: java -jar GenomeAnalysisTK.jar -T ClusterReads -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta [-filter INPUT.filter] | grep -v INFO | sort -k1 > OUTPUT
* @author shermanjia
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class ClusterReadsWalker extends ReadWalker<Integer, Integer> {
    @Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
    public String filterFile = "";
    // NOTE(review): the doc string says "default 7" but the actual default is 5 — confirm which is intended.
    @Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
    public int MINALLOWEDMISMATCHES = 5;

    String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
    String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
    boolean DatabaseLoaded = false;
    boolean DEBUG = false;
    Integer[] HLAstartpos, HLAstoppos, PolymorphicSites, NonPolymorphicSites;
    double[] SingleAlleleFrequencies;
    ArrayList<String> ReadsToDiscard = new ArrayList<String>();
    ArrayList<SAMRecord> AlignedReads = new ArrayList<SAMRecord>();

    // FIX: initialize all three tables. The original declaration
    // (Hashtable MaxNumMatches, MaxConcordance, NumCompared = new Hashtable();)
    // only initialized NumCompared, so when no -filter file was supplied,
    // MaxNumMatches/MaxConcordance stayed null and map() threw an NPE on
    // MaxNumMatches.containsKey(...). When a filter file IS supplied these
    // tables are overwritten in reduceInit(), so that path is unchanged.
    Hashtable MaxNumMatches = new Hashtable(), MaxConcordance = new Hashtable(), NumCompared = new Hashtable();

    double[] nummatched, concordance, numcompared;
    // "Best" (highest-similarity) representative read for each exon of each locus
    // (A/B/C x exon 2/exon 3); every other read at that exon is compared against it.
    SAMRecord A2, A3, B2, B3, C2, C3;
    int MaxMatchesA2 = 0, MaxMatchesA3 = 0, MaxMatchesB2 = 0, MaxMatchesB3 = 0, MaxMatchesC2 = 0, MaxMatchesC3 = 0;
    // Genomic coordinates of exons 2 and 3 at HLA-A, -C and -B.
    // NOTE(review): presumably chr6 positions on build b36 (the usage line references
    // human_b36_both.fasta) — confirm before reusing elsewhere.
    int A2start = 30018513, A2stop = 30018781;
    int A3start = 30019024, A3stop = 30019300;
    int C2start = 31347355, C2stop = 31347623;
    int C3start = 31346829, C3stop = 31347104;
    int B2start = 31432444, B2stop = 31432714;
    int B3start = 31431923, B3stop = 31432198;
    int minstartpos = 0;
    int maxstoppos = 0;
    Hashtable AlleleFrequencies = new Hashtable();
    int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
    CigarParser formatter = new CigarParser();

    /**
     * One-time setup (guarded by DatabaseLoaded): loads the polymorphic/conserved
     * site lists, and — if a -filter file was given — the per-read similarity data
     * (reads to discard, match counts, concordance values) produced by a previous run.
     *
     * @return 0 (the initial reduce value)
     */
    public Integer reduceInit() {
        if (!DatabaseLoaded){
            DatabaseLoaded = true;
            PolymorphicSitesFileReader siteFileReader = new PolymorphicSitesFileReader();
            siteFileReader.ReadFile(PolymorphicSitesFile);
            PolymorphicSites = siteFileReader.GetPolymorphicSites();
            NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites();
            out.printf("INFO  %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
            if (!filterFile.equals("")){
                out.printf("INFO  Reading properties file ... ");
                SimilarityFileReader similarityReader = new SimilarityFileReader();
                similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES);
                ReadsToDiscard = similarityReader.GetReadsToDiscard();
                // Replaces the empty tables declared above with data from the filter file.
                MaxNumMatches = similarityReader.GetNumMatches();
                MaxConcordance = similarityReader.GetConcordance();
                out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size());
                for (int i = 0; i < ReadsToDiscard.size(); i++){
                    out.printf("MISALIGNED\t%s\n", ReadsToDiscard.get(i).toString());
                }
            }
            out.printf("INFO  Comparing reads ...\n");
            if (DEBUG){
                //out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles);
            }
        }
        return 0;
    }

    /**
     * Per-read traversal step: records each non-discarded read, and if similarity
     * data exists for it, promotes it to the "best" representative of whichever
     * exon it overlaps when it beats the current best match count (the A2 slot
     * additionally requires concordance > 0.95).
     *
     * @param ref     reference bases over the read (unused here)
     * @param read    the aligned read
     * @param tracker metadata tracker (unused here)
     * @return 1 (counted by reduce)
     */
    public Integer map(char[] ref, SAMRecord read, ReadMetaDataTracker tracker) {
        //Calculate concordance for this read and all overlapping reads
        if (!ReadsToDiscard.contains(read.getReadName())){
            AlignedReads.add(read);
            int readstart = read.getAlignmentStart();
            int readstop = read.getAlignmentEnd();
            int length = readstop - readstart + 1;
            if (MaxNumMatches.containsKey(read.getReadName())){
                int maxMatches = Integer.parseInt(MaxNumMatches.get(read.getReadName()).toString());
                double concordance = Double.parseDouble((String) MaxConcordance.get(read.getReadName()).toString());
                // Overlap tests: read intersects [Xstart, Xstop]. Each read is
                // assigned to at most one exon (first match in this chain wins).
                if (readstart < A2stop && readstop > A2start){
                    if (maxMatches > MaxMatchesA2 && concordance > 0.95){
                        MaxMatchesA2 = maxMatches;
                        A2 = read;
                    }
                } else if (readstart < A3stop && readstop > A3start){
                    if (maxMatches > MaxMatchesA3){
                        MaxMatchesA3 = maxMatches;
                        A3 = read;
                    }
                } else if (readstart < B2stop && readstop > B2start){
                    if (maxMatches > MaxMatchesB2){
                        MaxMatchesB2 = maxMatches;
                        B2 = read;
                    }
                } else if (readstart < B3stop && readstop > B3start){
                    if (maxMatches > MaxMatchesB3){
                        MaxMatchesB3 = maxMatches;
                        B3 = read;
                    }
                } else if (readstart < C2stop && readstop > C2start){
                    if (maxMatches > MaxMatchesC2){
                        MaxMatchesC2 = maxMatches;
                        C2 = read;
                    }
                } else if (readstart < C3stop && readstop > C3start){
                    if (maxMatches > MaxMatchesC3){
                        MaxMatchesC3 = maxMatches;
                        C3 = read;
                    }
                }
            }else{
                out.printf("Data for %s not found\n",read.getReadName());
            }
        }
        return 1;
    }

    /** Accumulates the number of reads visited. */
    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    /**
     * Two passes over the collected reads, then a summary.
     * Pass 1: compare every read to its exon's representative read, tracking the
     * min/max concordance, max matches and max discordant-base count per exon
     * (only reads with &gt; 40 compared sites count; shorter ones are tallied in Xc).
     * Pass 2: normalize each read's concordance into [0,1] over the exon's observed
     * range and split reads into two clusters at 0.5 (Xa vs Xb), printing one line
     * per read. Finally prints per-exon RATIO_X lines with the minority-cluster
     * fraction (-1 when either cluster is empty).
     *
     * @param numreads total read count from reduce
     */
    public void onTraversalDone(Integer numreads) {
        SAMRecord read; String name, name2; String locus = "";
        // Per-exon tallies: Xa = cluster >= 0.5, Xb = cluster < 0.5, Xc = too short.
        int A2a = 0, A2b = 0, A2c = 0;
        int A3a = 0, A3b = 0, A3c = 0;
        int B2a = 0, B2b = 0, B2c = 0;
        int B3a = 0, B3b = 0, B3c = 0;
        int C2a = 0, C2b = 0, C2c = 0;
        int C3a = 0, C3b = 0, C3c = 0;
        double minA2 = -1, minA3 = -1, minB2 = -1, minB3 = -1, minC2 = -1, minC3 = -1;
        double maxA2 = 0, maxA3 = 0, maxB2 = 0, maxB3 = 0, maxC2 = 0, maxC3 = 0;
        double ratioA2 = 0, ratioA3 = 0, ratioB2 = 0, ratioB3 = 0, ratioC2 = 0, ratioC3 = 0;
        double maxA2l = 0, maxA3l = 0, maxB2l = 0, maxB3l = 0, maxC2l = 0, maxC3l = 0;
        double maxA2d = 0, maxA3d = 0, maxB2d = 0, maxB3d = 0, maxC2d = 0, maxC3d = 0;
        double a2, a3, b2, b3, c2, c3, normalized = 0;
        int readstart, readstop;
        double matches, compared, concordance;
        STATS stats;
        // Pass 1: establish per-exon concordance ranges.
        for (int i = 0; i < AlignedReads.size(); i++){
            read = AlignedReads.get(i);
            readstart = read.getAlignmentStart();
            readstop = read.getAlignmentEnd();
            if (readstart < A2stop && readstop > A2start){
                stats = CalculateConcordance(read, A2, "A2", A2start, A2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
                if (stats.getNumCompared() > 40){if (minA2 < 0 || minA2 > concordance){minA2 = concordance;}; if (concordance > maxA2){maxA2 = concordance;}; if (matches > maxA2l){maxA2l = matches;}; if (compared-matches > maxA2d){maxA2d = compared-matches;}}else{A2c++;}
            } else if (readstart < A3stop && readstop > A3start){
                stats = CalculateConcordance(read, A3, "A3", A3start, A3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
                if (stats.getNumCompared() > 40){if (minA3 < 0 || minA3 > concordance){minA3 = concordance;}; if (concordance > maxA3){maxA3 = concordance;}; if (matches > maxA3l){maxA3l = matches;}; if (compared-matches > maxA3d){maxA3d = compared-matches;}}else{A3c++;}
            } else if (readstart < B2stop && readstop > B2start){
                stats = CalculateConcordance(read, B2, "B2", B2start, B2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
                if (stats.getNumCompared() > 40){if (minB2 < 0 || minB2 > concordance){minB2 = concordance;}; if (concordance > maxB2){maxB2 = concordance;}; if (matches > maxB2l){maxB2l = matches;}; if (compared-matches > maxB2d){maxB2d = compared-matches;}}else{B2c++;}
            } else if (readstart < B3stop && readstop > B3start){
                stats = CalculateConcordance(read, B3, "B3", B3start, B3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
                if (stats.getNumCompared() > 40){if (minB3 < 0 || minB3 > concordance){minB3 = concordance;}; if (concordance > maxB3){maxB3 = concordance;}; if (matches > maxB3l){maxB3l = matches;}; if (compared-matches > maxB3d){maxB3d = compared-matches;}}else{B3c++;}
            } else if (readstart < C2stop && readstop > C2start){
                stats = CalculateConcordance(read, C2, "C2", C2start, C2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
                if (stats.getNumCompared() > 40){if (minC2 < 0 || minC2 > concordance){minC2 = concordance;}; if (concordance > maxC2){maxC2 = concordance;}; if (matches > maxC2l){maxC2l = matches;}; if (compared-matches > maxC2d){maxC2d = compared-matches;}}else{C2c++;}
            } else if (readstart < C3stop && readstop > C3start){
                stats = CalculateConcordance(read, C3, "C3", C3start, C3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared();
                if (stats.getNumCompared() > 40){if (minC3 < 0 || minC3 > concordance){minC3 = concordance;}; if (concordance > maxC3){maxC3 = concordance;}; if (matches > maxC3l){maxC3l = matches;}; if (compared-matches > maxC3d){maxC3d = compared-matches;}}else{C3c++;}
            }
        }
        // Pass 2: cluster each read using the ranges from pass 1. Note this reads
        // the NumCompared/MaxNumMatches/MaxConcordance values written (as Doubles)
        // by CalculateConcordance in pass 1, not the filter-file values.
        for (int i = 0; i < AlignedReads.size(); i++){
            read = AlignedReads.get(i);
            readstart = read.getAlignmentStart();
            readstop = read.getAlignmentEnd();
            name = read.getReadName(); name2 = "";
            if (NumCompared.containsKey(name)){
                compared = Double.parseDouble((String) NumCompared.get(name).toString());
                matches = Double.parseDouble((String) MaxNumMatches.get(name).toString());
                concordance = Double.parseDouble((String) MaxConcordance.get(name).toString());
                // NOTE(review): the threshold tests `matches` but the message below
                // says "compared at %s sites" and prints `matches` — confirm whether
                // `compared` was intended here.
                if (matches > 40){
                    if (readstart < A2stop && readstop > A2start){
                        locus = "A2"; name2 = A2.getReadName();
                        a2 = (concordance - minA2)/(maxA2-minA2); if (a2 >= .5){A2a++;}else{A2b++;}; normalized = a2;
                    } else if (readstart < A3stop && readstop > A3start){
                        locus = "A3"; name2 = A3.getReadName();
                        a3 = (concordance - minA3)/(maxA3-minA3); if (a3 >= .5){A3a++;}else{A3b++;}; normalized = a3;
                    } else if (readstart < B2stop && readstop > B2start){
                        locus = "B2"; name2 = B2.getReadName();
                        b2 = (concordance - minB2)/(maxB2-minB2); if (b2 >= .5){B2a++;}else{B2b++;}; normalized = b2;
                    } else if (readstart < B3stop && readstop > B3start){
                        locus = "B3"; name2 = B3.getReadName();
                        b3 = (concordance - minB3)/(maxB3-minB3); if (b3 >= .5){B3a++;}else{B3b++;}; normalized = b3;
                    } else if (readstart < C2stop && readstop > C2start){
                        locus = "C2"; name2 = C2.getReadName();
                        c2 = (concordance - minC2)/(maxC2-minC2); if (c2 >= .5){C2a++;}else{C2b++;}; normalized = c2;
                    } else if (readstart < C3stop && readstop > C3start){
                        locus = "C3"; name2 = C3.getReadName();
                        c3 = (concordance - minC3)/(maxC3-minC3); if (c3 >= .5){C3a++;}else{C3b++;}; normalized = c3;
                    }
                    out.printf("%s\t%s\t%s\t%.0f\t%.0f\t%.3f\t%.3f\n",locus,name,name2,matches,compared,concordance,normalized);
                }else{
                    out.printf("%s (compared at %s sites) is too short\n",name,matches);
                }
            }else{
                out.printf("%s [%s to %s] not found\n",name,readstart,readstop);
            }
        }
        // Minority-cluster fraction per exon; -1 when only one cluster is populated.
        if (A2a > 0 && A2b > 0){if (A2a > A2b){ratioA2 = (double)A2b/(A2a+A2b);}else{ratioA2 = (double)A2a/(A2a+A2b);}}else{ratioA2 = -1;}
        if (A3a > 0 && A3b > 0){if (A3a > A3b){ratioA3 = (double)A3b/(A3a+A3b);}else{ratioA3 = (double)A3a/(A3a+A3b);}}else{ratioA3 = -1;}
        if (B2a > 0 && B2b > 0){if (B2a > B2b){ratioB2 = (double)B2b/(B2a+B2b);}else{ratioB2 = (double)B2a/(B2a+B2b);}}else{ratioB2 = -1;}
        if (B3a > 0 && B3b > 0){if (B3a > B3b){ratioB3 = (double)B3b/(B3a+B3b);}else{ratioB3 = (double)B3a/(B3a+B3b);}}else{ratioB3 = -1;}
        if (C2a > 0 && C2b > 0){if (C2a > C2b){ratioC2 = (double)C2b/(C2a+C2b);}else{ratioC2 = (double)C2a/(C2a+C2b);}}else{ratioC2 = -1;}
        if (C3a > 0 && C3b > 0){if (C3a > C3b){ratioC3 = (double)C3b/(C3a+C3b);}else{ratioC3 = (double)C3a/(C3a+C3b);}}else{ratioC3 = -1;}
        out.printf("RATIO_A2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioA2,A2a,A2b,A2c,maxA2-minA2,maxA2l,maxA2d);
        out.printf("RATIO_A3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioA3,A3a,A3b,A3c,maxA3-minA3,maxA3l,maxA3d);
        out.printf("RATIO_B2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioB2,B2a,B2b,B2c,maxB2-minB2,maxB2l,maxB2d);
        out.printf("RATIO_B3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioB3,B3a,B3b,B3c,maxB3-minB3,maxB3l,maxB3d);
        out.printf("RATIO_C2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioC2,C2a,C2b,C2c,maxC2-minC2,maxC2l,maxC2d);
        out.printf("RATIO_C3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioC3,C3a,C3b,C3c,maxC3-minC3,maxC3l,maxC3d);
    }

    /** Immutable holder for the concordance statistics of one read-vs-representative comparison. */
    public class STATS {
        protected double concordance = 0.0;
        protected double numcompared = 0;
        protected double nummatches = 0;
        public STATS(double d, double i, double m) {
            concordance = d;
            numcompared = i;
            nummatches = m;
        }
        /** Fraction of compared polymorphic sites where the two reads agree. */
        public double getConcordance() {
            return concordance;
        }
        /** Number of polymorphic sites compared. */
        public double getNumCompared() {
            return numcompared;
        }
        /** Number of compared sites where the bases matched. */
        public double getNumMatches() {
            return nummatches;
        }
    }

    /**
     * Compares read1 against read2 (the exon's representative) at the known
     * polymorphic sites within [start, stop], using CIGAR-expanded sequences.
     * Deleted bases ('D') in read1 are skipped as possible sequencing errors.
     * Side effect: records nummatched/numcompared/concordance for read1 in
     * MaxNumMatches/NumCompared/MaxConcordance (overwriting any filter-file values).
     *
     * @param read1 the read being evaluated
     * @param read2 the representative read for this exon
     * @param locus exon label (used only in the commented-out debug print)
     * @param start exon start position (inclusive)
     * @param stop  exon stop position (inclusive)
     * @return a STATS holder; concordance is NaN when no sites were compared
     */
    private STATS CalculateConcordance(SAMRecord read1, SAMRecord read2, String locus, int start, int stop){
        int start1 = read1.getAlignmentStart(), stop1 = read1.getAlignmentEnd();
        int start2 = read2.getAlignmentStart(), stop2 = read2.getAlignmentEnd();
        int pos;
        double numcompared = 0, nummatched = 0, concordance;
        char c1, c2;
        String s1 = formatter.FormatRead(read1.getCigarString(), read1.getReadString());
        String s2 = formatter.FormatRead(read2.getCigarString(), read2.getReadString());
        //Polymorphic sites: always increment denominator, increment numerator when bases are concordant
        for (int j = 0; j < PolymorphicSites.length; j++){
            pos = PolymorphicSites[j];
            if (pos >= start1 && pos <= stop1 && pos >= start2 && pos <= stop2 && pos >= start && pos <= stop){
                c1 = s1.charAt(pos-start1);
                c2 = s2.charAt(pos-start2);
                if (c1 != 'D'){//allow for deletions (sequencing errors)
                    numcompared++;
                    if (c1 == c2){
                        nummatched++;
                    }
                }
            }
        }
        //Non-polymorphic sites: increment denominator only when bases are discordant
        // (currently disabled; kept for experimentation)
        if (false){
            for (int j = 0; j < NonPolymorphicSites.length; j++){
                pos = NonPolymorphicSites[j];
                if (pos >= start1 && pos <= stop1 && pos >= start2 && pos <= stop2){
                    c1 = s1.charAt(pos-start1);
                    c2 = s2.charAt(pos-start2);
                    if (c1 != c2 && c1 != 'D'){//allow for deletions (sequencing errors)
                        numcompared++;
                    }
                }
            }
        }
        //Update concordance array
        concordance=nummatched/numcompared;
        MaxNumMatches.put(read1.getReadName(), nummatched);
        NumCompared.put(read1.getReadName(), numcompared);
        MaxConcordance.put(read1.getReadName(), concordance);
        //out.printf("%s\t%s\t%s\t%.0f\t%.0f\t%.3f\n",locus,read1.getReadName(),read2.getReadName(),nummatched,numcompared,concordance);
        return new STATS(concordance, numcompared, nummatched);
    }
}

View File

@ -52,6 +52,12 @@ public class CreatePedFileWalker extends ReadWalker<Integer, Integer> {
@Argument(fullName = "DNAcode", shortName = "DNAcode", doc = "Amino acid codes", required = false)
public String dnaCodesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/DNA_CODE.txt";
@Argument(fullName = "PrintDNA", shortName = "PrintDNA", doc = "Print DNA sequences", required = false)
public boolean PrintDNA = false;
@Argument(fullName = "PrintAA", shortName = "PrintAA", doc = "Print Amino Acid sequences", required = false)
public boolean PrintAA = true;
String[] HLAnames, HLAreads, inputFileContents;
Integer[] HLAstartpos, HLAstoppos;
ArrayList<String> HLAnamesAL, HLAreadsAL;
@ -408,23 +414,26 @@ private String PrintAminoAcids(String ID, String alleleName1, String alleleName2
if (true) {
error = error + PrintGenotypes(s[1], HLA_A_1,HLA_A_2, HLA_A_start,HLA_A_end);
error = error + PrintGenotypes(s[1], HLA_C_1,HLA_C_2, HLA_C_start,HLA_C_end);
error = error + PrintGenotypes(s[1], HLA_B_1,HLA_B_2, HLA_B_start,HLA_B_end);
error = error + PrintGenotypes(s[1], HLA_DRB1_1,HLA_DRB1_2, HLA_DRB1_start,HLA_DRB1_end);
error = error + PrintGenotypes(s[1], HLA_DQA1_1,HLA_DQA1_2, HLA_DQA1_start,HLA_DQA1_end);
error = error + PrintGenotypes(s[1], HLA_DQB1_1,HLA_DQB1_2, HLA_DQB1_start,HLA_DQB1_end);
error = error + PrintGenotypes(s[1], HLA_DPA1_1,HLA_DPA1_2, HLA_DPA1_start,HLA_DPA1_end);
error = error + PrintGenotypes(s[1], HLA_DPB1_1,HLA_DPB1_2, HLA_DPB1_start,HLA_DPB1_end);
error = error + PrintAminoAcids(s[1], HLA_A_1,HLA_A_2, A_exons);
error = error + PrintAminoAcids(s[1], HLA_C_1,HLA_C_2, C_exons);
error = error + PrintAminoAcids(s[1], HLA_B_1,HLA_B_2, B_exons);
error = error + PrintAminoAcids(s[1], HLA_DRB1_1,HLA_DRB1_2, DRB1_exons);
error = error + PrintAminoAcids(s[1], HLA_DQA1_1,HLA_DQA1_2, DQA1_exons);
error = error + PrintAminoAcids(s[1], HLA_DQB1_1,HLA_DQB1_2, DQB1_exons);
error = error + PrintAminoAcids(s[1], HLA_DPA1_1,HLA_DPA1_2, DPA1_exons);
error = error + PrintAminoAcids(s[1], HLA_DPB1_1,HLA_DPB1_2, DPB1_exons);
if (PrintDNA){
error = error + PrintGenotypes(s[1], HLA_A_1,HLA_A_2, HLA_A_start,HLA_A_end);
error = error + PrintGenotypes(s[1], HLA_C_1,HLA_C_2, HLA_C_start,HLA_C_end);
error = error + PrintGenotypes(s[1], HLA_B_1,HLA_B_2, HLA_B_start,HLA_B_end);
error = error + PrintGenotypes(s[1], HLA_DRB1_1,HLA_DRB1_2, HLA_DRB1_start,HLA_DRB1_end);
error = error + PrintGenotypes(s[1], HLA_DQA1_1,HLA_DQA1_2, HLA_DQA1_start,HLA_DQA1_end);
error = error + PrintGenotypes(s[1], HLA_DQB1_1,HLA_DQB1_2, HLA_DQB1_start,HLA_DQB1_end);
error = error + PrintGenotypes(s[1], HLA_DPA1_1,HLA_DPA1_2, HLA_DPA1_start,HLA_DPA1_end);
error = error + PrintGenotypes(s[1], HLA_DPB1_1,HLA_DPB1_2, HLA_DPB1_start,HLA_DPB1_end);
}
if (PrintAA){
error = error + PrintAminoAcids(s[1], HLA_A_1,HLA_A_2, A_exons);
error = error + PrintAminoAcids(s[1], HLA_C_1,HLA_C_2, C_exons);
error = error + PrintAminoAcids(s[1], HLA_B_1,HLA_B_2, B_exons);
error = error + PrintAminoAcids(s[1], HLA_DRB1_1,HLA_DRB1_2, DRB1_exons);
error = error + PrintAminoAcids(s[1], HLA_DQA1_1,HLA_DQA1_2, DQA1_exons);
error = error + PrintAminoAcids(s[1], HLA_DQB1_1,HLA_DQB1_2, DQB1_exons);
error = error + PrintAminoAcids(s[1], HLA_DPA1_1,HLA_DPA1_2, DPA1_exons);
error = error + PrintAminoAcids(s[1], HLA_DPB1_1,HLA_DPB1_2, DPB1_exons);
}
out.printf("\n");
out.printf("%s",error);
}
@ -433,23 +442,27 @@ private String PrintAminoAcids(String ID, String alleleName1, String alleleName2
//Prints SNP names for each site
if (true){
PrintSNPS(HLA_A_start,HLA_A_end);
PrintSNPS(HLA_C_start,HLA_C_end);
PrintSNPS(HLA_B_start,HLA_B_end);
PrintSNPS(HLA_DRB1_start,HLA_DRB1_end);
PrintSNPS(HLA_DQA1_start,HLA_DQA1_end);
PrintSNPS(HLA_DQB1_start,HLA_DQB1_end);
PrintSNPS(HLA_DPA1_start,HLA_DPA1_end);
PrintSNPS(HLA_DPB1_start,HLA_DPB1_end);
if (PrintDNA){
PrintSNPS(HLA_A_start,HLA_A_end);
PrintSNPS(HLA_C_start,HLA_C_end);
PrintSNPS(HLA_B_start,HLA_B_end);
PrintSNPS(HLA_DRB1_start,HLA_DRB1_end);
PrintSNPS(HLA_DQA1_start,HLA_DQA1_end);
PrintSNPS(HLA_DQB1_start,HLA_DQB1_end);
PrintSNPS(HLA_DPA1_start,HLA_DPA1_end);
PrintSNPS(HLA_DPB1_start,HLA_DPB1_end);
}
PrintAminoAcidSites(A_exons,"A",true);
PrintAminoAcidSites(C_exons,"C",false);
PrintAminoAcidSites(B_exons,"B",false);
PrintAminoAcidSites(DRB1_exons,"DRB1",false);
PrintAminoAcidSites(DQA1_exons,"DQA1",true);
PrintAminoAcidSites(DQB1_exons,"DQB1",false);
PrintAminoAcidSites(DPA1_exons,"DPA1",false);
PrintAminoAcidSites(DPB1_exons,"DPB1",true);
if (PrintAA){
PrintAminoAcidSites(A_exons,"A",true);
PrintAminoAcidSites(C_exons,"C",false);
PrintAminoAcidSites(B_exons,"B",false);
PrintAminoAcidSites(DRB1_exons,"DRB1",false);
PrintAminoAcidSites(DQA1_exons,"DQA1",true);
PrintAminoAcidSites(DQB1_exons,"DQB1",false);
PrintAminoAcidSites(DPA1_exons,"DPA1",false);
PrintAminoAcidSites(DPB1_exons,"DPB1",true);
}
}
}

View File

@ -43,6 +43,9 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
@Argument(fullName = "findFirst", shortName = "findFirst", doc = "For each read, stop when first HLA allele is found with concordance = 1", required = false)
public boolean findFirst = false;
@Argument(fullName = "DEBUG", shortName = "DEBUG", doc = "Debug walker", required = false)
public boolean debug = false;
@Argument(fullName = "debugAllele", shortName = "debugAllele", doc = "Print match score for allele", required = false)
public String debugAllele = "";
@ -50,16 +53,18 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
public String ethnicity = "Caucasian";
@Argument(fullName = "dictionary", shortName = "dictionary", doc = "bam file of HLA ditionary", required = false)
public String HLAdictionaryFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.sam";
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
public boolean ONLYFREQUENT = false;
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
SAMFileReader HLADictionaryReader = new SAMFileReader();
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile;
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
@ -74,6 +79,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
int numHLAlleles = 0;
int minstartpos = 0;
int maxstoppos = 0;
int numpolymorphicsites = 0, numnonpolymorphicsites = 0, pos =0;
int HLA_A_start = 30018310;
int HLA_A_end = 30021211;
@ -89,7 +95,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
//Load HLA dictionary
out.printf("INFO Loading HLA dictionary ... ");
HLADictionaryReader.ReadFile(HLAdatabaseFile);
HLADictionaryReader.ReadFile(HLAdictionaryFile);
HLAreads = HLADictionaryReader.GetReads();
HLAnames = HLADictionaryReader.GetReadNames();
HLAstartpos = HLADictionaryReader.GetStartPositions();
@ -121,7 +127,8 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
siteFileReader.ReadFile(PolymorphicSitesFile);
PolymorphicSites = siteFileReader.GetPolymorphicSites();
NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites();
numpolymorphicsites = PolymorphicSites.length;
numnonpolymorphicsites = NonPolymorphicSites.length;
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
out.printf("INFO Comparing reads to database ...\n");
@ -133,53 +140,15 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
return 0;
}
private void FindPolymorphicSites(int start, int stop){
boolean initialized, polymorphic, examined;
char c = ' ';
ArrayList<Integer> polymorphicsites = new ArrayList<Integer>();
ArrayList<Integer> nonpolymorphicsites = new ArrayList<Integer>();
//Find polymorphic sites in dictionary
for (int pos = start; pos <= stop; pos++){
initialized = false; polymorphic = false; examined = false;
//look across all alleles at specific position to see if it is polymorphic
for (int i = 0; i < HLAreads.length; i++){
if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){
if (!initialized){
c = HLAreads[i].charAt(pos-HLAstartpos[i]);
initialized = true;
examined = true;
}
if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){
polymorphicsites.add(pos);
out.printf("POLYMORPHIC\t6\t%s\n", pos);
polymorphic = true;
break;
}
}
}
if (!polymorphic && examined){
nonpolymorphicsites.add(pos);
out.printf("CONSERVED\t6\t%s\n", pos);
}
}
PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]);
NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]);
}
private double CalculateConcordance(SAMRecord read){
int readstart = read.getAlignmentStart();
int readstop = read.getAlignmentEnd();
int numpolymorphicsites, numnonpolymorphicsites, pos;
char c1, c2;
double maxConcordance = 0.0, freq = 0.0, minFreq = 0.0;
String s1 = formatter.FormatRead(read.getCigarString(), read.getReadString());
String s2;
int allelestart, allelestop;
numpolymorphicsites = PolymorphicSites.length;
numnonpolymorphicsites = NonPolymorphicSites.length;
if (ONLYFREQUENT){
minFreq = 0.0001;
}
@ -200,7 +169,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
c1 = s1.charAt(pos-readstart);
c2 = s2.charAt(pos-allelestart);
if (c1 != 'D'){//allow for deletions (sequencing errors)
if (c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
numcompared[i]++;
if (c1 == c2){
nummatched[i]++;
@ -214,16 +183,17 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
}
//Non-polymorphic sites: increment denominator only when bases are discordant
for (int j = 0; j < numnonpolymorphicsites; j++){
pos = NonPolymorphicSites[j];
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
c1 = s1.charAt(pos-readstart);
c2 = s2.charAt(pos-allelestart);
if (c1 != c2 && c1 != 'D'){//allow for deletions (sequencing errors)
numcompared[i]++;
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
if (numcompared[i] > 0){
for (int j = 0; j < numnonpolymorphicsites; j++){
pos = NonPolymorphicSites[j];
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
c1 = s1.charAt(pos-readstart);
c2 = s2.charAt(pos-allelestart);
if (c1 != c2 && c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
numcompared[i]++;
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
}
}
}
}
@@ -259,26 +229,32 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
public Integer map(char[] ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
//Calculate concordance for this read and all overlapping reads
double maxConcordance = CalculateConcordance(read);
if (read.getMappingQuality() > 0){
double maxConcordance = CalculateConcordance(read);
String readname = read.getReadName(), allelename = ""; double freq;
//For input bam files that contain HLA alleles, find and print allele frequency
freq = GetAlleleFrequency(readname);
out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd());
if (maxConcordance > 0){
String readname = read.getReadName(), allelename = ""; double freq;
//For input bam files that contain HLA alleles, find and print allele frequency
//freq = GetAlleleFrequency(readname);
out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd());
//Find the maximum frequency of the alleles most concordant with the read
double maxFreq = FindMaxAlleleFrequency(maxConcordance);
//Print concordance statistics between this read and the most similar HLA allele(s)
for (int i = 0; i < HLAreads.length; i++){
if (concordance[i] == maxConcordance && maxConcordance > 0){
freq = GetAlleleFrequency(HLAnames[i]);
if (freq == maxFreq){
out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
//Find the maximum frequency of the alleles most concordant with the read
//double maxFreq = FindMaxAlleleFrequency(maxConcordance);
//Print concordance statistics between this read and the most similar HLA allele(s)
for (int i = 0; i < HLAreads.length; i++){
if (concordance[i] == maxConcordance){
freq = GetAlleleFrequency(HLAnames[i]);
//if (freq == maxFreq){
out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
//}
break;
}
}
out.print("\n");
}
}
out.print("\n");
return 1;
}

View File

@@ -0,0 +1,152 @@
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;

import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.commandline.Argument;
import java.util.ArrayList;
import java.util.Hashtable;

/**
 * Finds polymorphic sites in the HLA dictionary. Usage: java -jar GenomeAnalysisTK.jar -T FindPolymorphicSites -I HLA_DICTIONARY.bam -R /broad/1KG/reference/human_b36_both.fasta -L INPUT.interval -findFirst | grep -v INFO | sort -k1 > OUTPUT
 *
 * Scans every position covered by the HLA dictionary and classifies it as
 * POLYMORPHIC (at least two dictionary alleles disagree there) or CONSERVED
 * (all overlapping alleles agree). Results are written to standard out, one
 * line per position; progress lines are prefixed with "INFO" so the pipeline
 * in the usage string can filter them out with grep.
 *
 * NOTE(review): all real work happens in reduceInit() the first time it runs
 * (guarded by DatabaseLoaded); map() is a no-op. This walker only needs the
 * traversal machinery to start, not the reads themselves.
 *
 * @author shermanjia
 */
@Requires({DataSource.READS, DataSource.REFERENCE})
public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
    @Argument(fullName = "debugRead", shortName = "debugRead", doc = "Print match score for read", required = false)
    public String debugRead = "";
    @Argument(fullName = "findFirst", shortName = "findFirst", doc = "For each read, stop when first HLA allele is found with concordance = 1", required = false)
    public boolean findFirst = false;
    @Argument(fullName = "debugAllele", shortName = "debugAllele", doc = "Print match score for allele", required = false)
    public String debugAllele = "";
    @Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
    public String ethnicity = "Caucasian";
    @Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
    public boolean ONLYFREQUENT = false;

    // Hard-coded input locations: the HLA allele dictionary (SAM) and the
    // per-ethnicity allele-frequency tables. TODO(review): these absolute
    // paths should become @Argument parameters.
    //String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
    String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
    // Project-local SAM reader (HLAcaller package), not the Picard class of the same name.
    SAMFileReader HLADictionaryReader = new SAMFileReader();
    //String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
    String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
    String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
    // Chosen from the two files above in reduceInit() based on -ethnicity.
    String AlleleFrequencyFile;
    String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
    // NOTE(review): declared but never read in this walker.
    String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";

    // One-shot latch so the dictionary/frequency loading in reduceInit() runs only once.
    boolean DatabaseLoaded = false;
    boolean DEBUG = false;

    // Parallel arrays describing the dictionary: allele i has name HLAnames[i],
    // sequence HLAreads[i], and reference span [HLAstartpos[i], HLAstoppos[i]].
    String[] HLAnames, HLAreads;
    Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites;
    // NOTE(review): SingleAlleleFrequencies and the per-allele counters below
    // are allocated but never updated here (map() is empty); they appear to be
    // carried over from FindClosestAlleleWalker.
    double[] SingleAlleleFrequencies;
    double[] nummatched, concordance, numcompared;

    int numHLAlleles = 0;
    // Overall reference span covered by the dictionary; the position scan runs over this range.
    int minstartpos = 0;
    int maxstoppos = 0;

    // b36 coordinates of HLA-A on chromosome 6 (unused in this walker).
    int HLA_A_start = 30018310;
    int HLA_A_end = 30021211;

    // allele name -> frequency, keyed per the selected ethnicity file.
    Hashtable AlleleFrequencies = new Hashtable();
    int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
    CigarParser formatter = new CigarParser();

    /**
     * Lazily loads the HLA dictionary and allele frequencies on first call,
     * then scans the covered range and prints POLYMORPHIC/CONSERVED lines.
     * Subsequent calls are no-ops thanks to the DatabaseLoaded latch.
     */
    public Integer reduceInit() {
        if (!DatabaseLoaded){
            DatabaseLoaded = true;

            //Load HLA dictionary
            out.printf("INFO  Loading HLA dictionary ... ");

            HLADictionaryReader.ReadFile(HLAdatabaseFile);
            HLAreads = HLADictionaryReader.GetReads();
            HLAnames = HLADictionaryReader.GetReadNames();
            HLAstartpos = HLADictionaryReader.GetStartPositions();
            HLAstoppos = HLADictionaryReader.GetStopPositions();
            minstartpos = HLADictionaryReader.GetMinStartPos();
            maxstoppos = HLADictionaryReader.GetMaxStopPos();

            out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);

            nummatched = new double[HLAreads.length];
            concordance = new double[HLAreads.length];
            numcompared = new double[HLAreads.length];

            //Read allele frequencies
            // Only "Black" selects the Black-USA table; any other value
            // (including the default "Caucasian") falls back to the Caucasian table.
            if (ethnicity.equals("Black")){
                AlleleFrequencyFile = BlackAlleleFrequencyFile;
            }else{
                AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
            }
            out.printf("INFO  Reading HLA allele frequencies ... ");
            FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
            HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
            AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
            out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());

            // The actual site classification; prints one line per examined position.
            FindPolymorphicSites(minstartpos,maxstoppos);
            out.printf("INFO  %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
            out.printf("INFO  Comparing reads to database ...\n");
            if (DEBUG){
                //out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles);
            }
        }
        return 0;
    }

    /**
     * Classifies every reference position in [start, stop] that is covered by
     * at least one dictionary allele. A position is polymorphic if any
     * overlapping allele's base differs from the first overlapping allele's
     * base (first mismatch wins; remaining alleles are not inspected).
     * Positions covered by no allele are neither printed nor recorded.
     * Results go to PolymorphicSites / NonPolymorphicSites and to stdout
     * ("POLYMORPHIC\t6\tpos" / "CONSERVED\t6\tpos" — "6" is the chromosome;
     * the classical HLA loci are on chr6).
     */
    private void FindPolymorphicSites(int start, int stop){
        boolean initialized, polymorphic, examined;
        char c = ' ';
        ArrayList<Integer> polymorphicsites = new ArrayList<Integer>();
        ArrayList<Integer> nonpolymorphicsites = new ArrayList<Integer>();

        //Find polymorphic sites in dictionary
        for (int pos = start; pos <= stop; pos++){
            initialized = false; polymorphic = false; examined = false;
            //look across all alleles at specific position to see if it is polymorphic
            for (int i = 0; i < HLAreads.length; i++){
                if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){
                    if (!initialized){
                        // Baseline base taken from the first allele overlapping this position.
                        // NOTE(review): examined mirrors initialized exactly; one flag would do.
                        c = HLAreads[i].charAt(pos-HLAstartpos[i]);
                        initialized = true;
                        examined = true;
                    }
                    if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){
                        polymorphicsites.add(pos);
                        out.printf("POLYMORPHIC\t6\t%s\n", pos);
                        polymorphic = true;
                        break;
                    }
                }
            }
            if (!polymorphic && examined){
                nonpolymorphicsites.add(pos);
                out.printf("CONSERVED\t6\t%s\n", pos);
            }
        }
        PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]);
        NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]);
    }

    /**
     * Intentionally empty: the dictionary scan in reduceInit() does all the
     * work, so each read just contributes 1 to the traversal count.
     */
    public Integer map(char[] ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
        //Calculate concordance for this read and all overlapping reads
        return 1;
    }

    /** Sums the per-read counts from map(). */
    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }
}

View File

@@ -16,8 +16,9 @@ import java.util.Hashtable;
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.sam";
String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.closest";
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
// String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.CLASS1.closest";
String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.CLASS2.closest";
boolean DatabaseLoaded = false;
boolean DEBUG = false;
@@ -40,14 +41,24 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
int HLA_B_end = 31432914;
int HLA_C_start = 31344925;
int HLA_C_end = 31347827;
int HLA_DQA1_start = 32713161;
int HLA_DQA1_end = 32719407;
int HLA_DQB1_start = 32735635;
int HLA_DQB1_end = 32742444;
int HLA_DPA1_start = 33140772;
int HLA_DPA1_end = 33149356;
int HLA_DPB1_start = 33151738;
int HLA_DPB1_end = 33162954;
int HLA_DRB1_start = 32654525;
int HLA_DRB1_end = 32665540;
ArrayList<String> PolymorphicSites = new ArrayList<String>();
Hashtable ClosestAllele = new Hashtable();
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1, iDRBstart = -1, iDRBstop = -1, iDQAstart = -1, iDQAstop = -1, iDQBstart = -1, iDQBstop = -1, iDPAstart = -1, iDPAstop = -1, iDPBstart = -1, iDPBstop = -1;
CigarParser formatter = new CigarParser();
public Integer reduceInit() {
if (!DatabaseLoaded){
try{
@@ -77,6 +88,21 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
}else if (s[0].indexOf("HLA_C") > -1){
if (iCstart < 0){iCstart=i;}
iCstop = i; i++;
}else if (s[0].indexOf("HLA_DRB1") > -1){
if (iDRBstart < 0){iDRBstart=i;}
iDRBstop = i; i++;
}else if (s[0].indexOf("HLA_DQA1") > -1){
if (iDQAstart < 0){iDQAstart=i;}
iDQAstop = i; i++;
}else if (s[0].indexOf("HLA_DQB1") > -1){
if (iDQBstart < 0){iDQBstart=i;}
iDQBstop = i; i++;
}else if (s[0].indexOf("HLA_DPA1") > -1){
if (iDPAstart < 0){iDPAstart=i;}
iDPAstop = i; i++;
}else if (s[0].indexOf("HLA_DPB1") > -1){
if (iDPBstart < 0){iDPBstart=i;}
iDPBstop = i; i++;
}
}
}
@@ -146,10 +172,12 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
int numM = 0, numI = 0, numD = 0;
name = read.getReadName();
String matchedAllele = (String) ClosestAllele.get(name);
//out.printf("%s\t%s\n",name,matchedAllele);
int index = HLAnames.indexOf(matchedAllele);
String matchedRead = HLAreads.get(index);
if (name.indexOf("HLA_A") > -1){
@@ -161,8 +189,24 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
} else if (name.indexOf("HLA_C") > -1){
startimputation = HLA_C_start;
stopimputation = HLA_C_end;
} else if (name.indexOf("HLA_DRB1") > -1){
startimputation = HLA_DRB1_start;
stopimputation = HLA_DRB1_end;
} else if (name.indexOf("HLA_DQA1") > -1){
startimputation = HLA_DQA1_start;
stopimputation = HLA_DQA1_end;
} else if (name.indexOf("HLA_DQB1") > -1){
startimputation = HLA_DQB1_start;
stopimputation = HLA_DQB1_end;
} else if (name.indexOf("HLA_DPA1") > -1){
startimputation = HLA_DPA1_start;
stopimputation = HLA_DPA1_end;
} else if (name.indexOf("HLA_DPB1") > -1){
startimputation = HLA_DPB1_start;
stopimputation = HLA_DPB1_end;
}
//out.printf("DEBUG %s\t%s\t%s\t%s\t%s\n",name,matchedAllele,index,startimputation,stopimputation);
for (int i = startimputation; i <= stopimputation; i++){
//if position is within read
if (i >= readstart && i <= readstop){

View File

@@ -7,12 +7,16 @@ package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import java.io.*;
import java.util.ArrayList;
import java.util.Hashtable;
/**
*
* @author shermanjia
*/
public class SimilarityFileReader {
ArrayList<String> ReadsToDiscard = new ArrayList<String>();
Hashtable Concordance = new Hashtable();
Hashtable NumMatches = new Hashtable();
Hashtable NumMismatches = new Hashtable();
public ArrayList<String> GetReadsToDiscard(){
return ReadsToDiscard;
@@ -22,6 +26,18 @@ public class SimilarityFileReader {
return ReadsToDiscard.toArray(new String[ReadsToDiscard.size()]);
}
public Hashtable GetConcordance(){
return Concordance;
}
public Hashtable GetNumMatches(){
return NumMatches;
}
public Hashtable GetNumMismatches(){
return NumMismatches;
}
public void ReadFile(String filename, int minAllowedMismatches){
try{
FileInputStream fstream = new FileInputStream(filename);
@@ -35,6 +51,10 @@ public class SimilarityFileReader {
if (s.length >= 6){
Double matchFraction = Double.valueOf(s[4]);
int numMismatches = Integer.valueOf(s[6]);
Concordance.put(s[0],matchFraction);
NumMatches.put(s[0], s[5]);
NumMismatches.put(s[0], numMismatches);
if ((matchFraction < 0.9 && numMismatches > 3) || (numMismatches > minAllowedMismatches)){
ReadsToDiscard.add(s[0]);
}