HLA caller updated - now searches all alleles (common and rare), with more efficient read filtering and allele-comparison runs.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3543 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
sjia 2010-06-14 15:14:40 +00:00
parent d51e6c45a7
commit 5704294f9d
9 changed files with 1122 additions and 184 deletions

View File

@ -33,7 +33,9 @@ import org.broadinstitute.sting.commandline.Argument;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.Enumeration;
import java.util.Vector;
import java.util.Collections;
/**
* Calculates likelihood of observing the data given pairs of HLA alleles. NOTE: run CalculateBaseLikelihoods first! Usage: java -jar GenomeAnalysisTK.jar -T CalculateAlleleLikelihoods -I /humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl INPUT.baselikelihoods -eth\
@ -60,17 +62,20 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAllelesCommon";
Hashtable AlleleFrequencies,UniqueAlleles;
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles4Digit";
String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.txt";
String HLA2DigitFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY_2DIGIT.txt";
Hashtable AlleleFrequencies,UniqueAlleles,Alleles2Digit;
CigarParser formatter = new CigarParser();
double[][] baseLikelihoods;
int[] positions;
boolean loaded = false;
String[] HLAnames, HLAreads;
Integer[] HLAstartpos, HLAstoppos;
ArrayList<String> HLAnamesAL, HLAreadsAL;
String[] HLAnames, HLAreads, HLAnames2, HLAreads2;
Integer[] HLAstartpos, HLAstoppos, HLAstartpos2, HLAstoppos2;
ArrayList<String> HLAnamesAL, HLAreadsAL, Loci, AllelesToSearch;
ArrayList<Integer> HLAstartposAL, HLAstopposAL;
public Integer reduceInit() {
@ -86,21 +91,27 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
HLAstartposAL = new ArrayList<Integer>();
HLAstopposAL = new ArrayList<Integer>();
if (!ethnicity.equals("CaucasianUSA")){
AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_" + ethnicity + ".freq";
}
out.printf("INFO Reading HLA allele frequencies ... ");
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
UniqueAlleles = HLAfreqReader.GetUniqueAlleles();
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
out.printf("INFO Reading HLA alleles ... ");
HLAFileReader HLADictionaryReader = new HLAFileReader();
HLADictionaryReader.ReadFile(HLAdatabaseFile);
HLAreads = HLADictionaryReader.GetSequences();
HLAnames = HLADictionaryReader.GetNames();
HLAstartpos = HLADictionaryReader.GetStartPositions();
HLAstoppos = HLADictionaryReader.GetStopPositions();
HLADictionaryReader = new HLAFileReader();
HLADictionaryReader.ReadFile(HLA2DigitFile);
HLAreads2 = HLADictionaryReader.GetSequences();
HLAnames2 = HLADictionaryReader.GetNames();
HLAstartpos2 = HLADictionaryReader.GetStartPositions();
HLAstoppos2 = HLADictionaryReader.GetStopPositions();
out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);
//out.printf("INFO Common alleles:\n");
for (int i = 1; i < UniqueAlleles.size(); i++){
//out.printf("INFO %s\n",UniqueAlleles.values().toArray()[i]);
}
out.printf("INFO Reading HLA dictionary ...");
//out.printf("INFO Reading HLA dictionary ...");
}
@ -108,10 +119,11 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
}
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
HLAnamesAL.add(read.getReadName());
HLAreadsAL.add(formatter.FormatRead(read.getCigarString(), read.getReadString()));
HLAstartposAL.add(read.getAlignmentStart());
HLAstopposAL.add(read.getAlignmentEnd());
//HLAnamesAL.add(read.getReadName());
//HLAreadsAL.add(formatter.FormatRead(read.getCigarString(), read.getReadString()));
//HLAstartposAL.add(read.getAlignmentStart());
//HLAstopposAL.add(read.getAlignmentEnd());
//out.printf("INFO\t%s\t%s\t%s\t%s\n",read.getReadName(),read.getAlignmentStart(),read.getAlignmentEnd(),formatter.FormatRead(read.getCigarString(), read.getReadString()));
return 1;
}
@ -154,11 +166,11 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
}
public void onTraversalDone(Integer numreads) {
out.printf("Done! %s alleles found\n", numreads);
HLAnames = HLAnamesAL.toArray(new String[numreads]);
HLAreads = HLAreadsAL.toArray(new String[numreads]);
HLAstartpos = HLAstartposAL.toArray(new Integer[numreads]);
HLAstoppos = HLAstopposAL.toArray(new Integer[numreads]);
//out.printf("Done! %s alleles found\n", numreads);
//HLAnames = HLAnamesAL.toArray(new String[numreads]);
//HLAreads = HLAreadsAL.toArray(new String[numreads]);
//HLAstartpos = HLAstartposAL.toArray(new Integer[numreads]);
//HLAstoppos = HLAstopposAL.toArray(new Integer[numreads]);
double[][] AlleleLikelihoods = new double[numreads][numreads];
@ -173,6 +185,7 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
int numcombinations = 0;
out.printf("NUM\tAllele1\tAllele2\tSSG\n");
//debugging specific alleles
int index1 = -1, index2 = -1;
if (!debugAlleles.equals("")){
String s[] = debugAlleles.split(",");
@ -185,52 +198,96 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Intege
}
if (index1 > -1 && index2 > -1){
out.printf("INFO: debugging %s\t%s\t%s\t%s\n",s[0],s[1],index1,index2);
double dl = CalculateLikelihood(index1,index2,true);
double dl = CalculateLikelihood(index1,index2,HLAreads2,true);
break;
}
}
}
//Pre-process homozygous combinations to determine top possible alleles (for efficiency)
int numreads2 = HLAnames2.length;
Alleles2Digit = new Hashtable();
Loci = new ArrayList<String>();
double[] AlleleLikelihoods2 = new double[numreads];
for (int i = 0; i < numreads; i++){
name1 = HLAnames[i].substring(4);
String [] n1 = name1.split("\\*");
// out.printf("1: %s\n",n1[0] + "*" + n1[1].substring(0, 3));
if (UniqueAlleles.containsKey(n1[0] + "*" + n1[1].substring(0, 4))){
//out.printf("1: %s\n",name1);
//frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
//if (frq1 > minfrq){
numcombinations++;
AlleleLikelihoods2[i] = CalculateLikelihood(i,i,HLAreads,false);
if (AlleleLikelihoods2[i] < 0){
name2 = n1[0] + "*" + n1[1].substring(0, 2);
if (!Loci.contains(n1[0])){Loci.add(n1[0]);}
if (!Alleles2Digit.containsKey(name2)){
Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
}else if ((Double) Alleles2Digit.get(name2) < AlleleLikelihoods2[i]){
Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
}
}
}
//Sort alleles at 2 digit resolution for each locus
AllelesToSearch = new ArrayList<String>();
for (int i = 0; i < Loci.size(); i++){
Enumeration k = Alleles2Digit.keys();
Hashtable AllelesAtLoci = new Hashtable();
//find alleles at the locus
while( k.hasMoreElements() ){
name1 = k.nextElement().toString();
String [] n1 = name1.split("\\*");
if (Loci.get(i).equals(n1[0])){
AllelesAtLoci.put(-1 * (Double) Alleles2Digit.get(name1), name1);
}
}
//Sort alleles at locus, mark top ten 2-digit classes for deep search
int num = 1;
Vector v = new Vector(AllelesAtLoci.keySet());
Collections.sort(v);
for (Enumeration e = v.elements(); e.hasMoreElements();) {
Double key = Double.valueOf(e.nextElement().toString());
String allele = AllelesAtLoci.get(key).toString();
if (num <= 10){
AllelesToSearch.add(allele);
num++;
}
//out.printf("%s\t%s\n",allele,key);
}
}
//Iterate through allele pairs to calculate likelihoods
if (true){
numcombinations = 0;
for (int i = 0; i < numreads; i++){
name1 = HLAnames[i].substring(4);
String [] n1 = name1.split("\\*");
if (AllelesToSearch.contains(n1[0] + "*" + n1[1].substring(0, 2))){
//out.printf("1: %s\n",name1);
//frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
//if (frq1 > minfrq){
for (int j = i; j < numreads; j++){
name2 = HLAnames[j].substring(4);
String [] n2 = name2.split("\\*");
// out.printf("2: %s\n",n2[0] + "*" + n2[1].substring(0, 3));
if (UniqueAlleles.containsKey(n2[0] + "*" + n2[1].substring(0, 4))){
// frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
// if (frq2 > minfrq){
if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){
numcombinations++;
AlleleLikelihoods[i][j] = CalculateLikelihood(i,j,false);
if (AllelesToSearch.contains(n2[0] + "*" + n2[1].substring(0, 2))){
if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){
numcombinations++;
AlleleLikelihoods[i][j] = CalculateLikelihood(i,j,HLAreads,false);
if (AlleleLikelihoods[i][j] < 0){
out.printf("%s\t%s\t%s\t%.2f\n",numcombinations,name1,name2,AlleleLikelihoods[i][j]);
}
// }else{
// if (DEBUG){out.printf("%s has allele frequency%.5f\n",name2,frq2);}
// }
// }else{
// if (DEBUG){out.printf("%s not found in allele frequency file\n",name2);}
}
}
}
//}else{
// if (DEBUG){out.printf("%s has allele frequency%.5f\n",name1,frq1);}
//}
//}else{
// if (DEBUG){out.printf("%s not found in allele frequency file\n",name1);}
}
}
}
}
private double CalculateLikelihood(int a1, int a2, boolean debug){
String read1 = HLAreads[a1];
String read2 = HLAreads[a2];
private double CalculateLikelihood(int a1, int a2, String[] HLAalleles, boolean debug){
//Calculates likelihood for specific allele pair
String read1 = HLAalleles[a1];
String read2 = HLAalleles[a2];
int start1 = HLAstartpos[a1];
int start2 = HLAstartpos[a2];
int stop1 = HLAstoppos[a1];

View File

@ -72,34 +72,13 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
ArrayList<SAMRecord> AllReads = new ArrayList<SAMRecord>();
ArrayList<String> AllReadNames = new ArrayList<String>();
boolean HLAdataLoaded = false;
boolean dataLoaded = false;
//Loads HLA dictionary, allele frequencies, and reads to filter
//Loads reads to filter
public Pair<Long, Long> reduceInit() {
if (!HLAdataLoaded){
HLAdataLoaded = true;
out.printf("INFO Reading HLA database ... ");
HLADictionaryReader.ReadFile(HLAdatabaseFile);
HLAreads = HLADictionaryReader.GetReads();
HLAnames = HLADictionaryReader.GetReadNames();
HLAstartpos = HLADictionaryReader.GetStartPositions();
HLAstoppos = HLADictionaryReader.GetStopPositions();
InitializeVariables(HLAreads.length);
out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);
if (!ethnicity.equals("CaucasianUSA")){
AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_" + ethnicity + ".freq";
}
out.printf("INFO Reading HLA allele frequencies ... ");
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
if (!dataLoaded){
dataLoaded = true;
if (!filterFile.equals("")){
out.printf("INFO Reading properties file ... ");
@ -137,7 +116,7 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
int numAs = 0, numCs = 0, numGs = 0, numTs = 0;
//if (DEBUG){
out.printf("%s\t%s\t", context.getLocation(),ref.getBase());
out.printf("%s\t%s\t", context.getLocation(),(char)ref.getBase());
//}
//Calculate posterior probabilities

View File

@ -53,24 +53,25 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
public String ethnicity = "Caucasian";
@Argument(fullName = "useInterval", shortName = "useInterval", doc = "Use only these intervals", required = false)
public String intervalFile = "";
@Argument(fullName = "dictionary", shortName = "dictionary", doc = "bam file of HLA ditionary", required = false)
public String HLAdictionaryFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.sam";
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
public boolean ONLYFREQUENT = false;
String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_DICTIONARY.txt";
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_POLYMORPHIC_SITES.txt";
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_FREQUENCIES.txt";
SAMFileReader HLADictionaryReader = new SAMFileReader();
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile;
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
HLAFileReader HLADictionaryReader = new HLAFileReader();
boolean DatabaseLoaded = false;
boolean DEBUG = false;
ArrayList<String> ClosestAlleles = new ArrayList<String>();
String[] HLAnames, HLAreads;
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites;
@ -88,6 +89,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
Hashtable AlleleFrequencies = new Hashtable();
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
CigarParser formatter = new CigarParser();
int [][] intervals; int numIntervals;
public Integer reduceInit() {
if (!DatabaseLoaded){
@ -96,9 +98,9 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
//Load HLA dictionary
out.printf("INFO Loading HLA dictionary ... ");
HLADictionaryReader.ReadFile(HLAdictionaryFile);
HLAreads = HLADictionaryReader.GetReads();
HLAnames = HLADictionaryReader.GetReadNames();
HLADictionaryReader.ReadFile(HLAdatabaseFile);
HLAreads = HLADictionaryReader.GetSequences();
HLAnames = HLADictionaryReader.GetNames();
HLAstartpos = HLADictionaryReader.GetStartPositions();
HLAstoppos = HLADictionaryReader.GetStopPositions();
minstartpos = HLADictionaryReader.GetMinStartPos();
@ -110,26 +112,27 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
concordance = new double[HLAreads.length];
numcompared = new double[HLAreads.length];
//Read allele frequencies
if (ethnicity.equals("Black")){
AlleleFrequencyFile = BlackAlleleFrequencyFile;
}else{
AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
}
out.printf("INFO Reading HLA allele frequencies ... ");
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
//FindPolymorphicSites(minstartpos,maxstoppos);
//Load list of polymorphic sites
PolymorphicSitesFileReader siteFileReader = new PolymorphicSitesFileReader();
siteFileReader.ReadFile(PolymorphicSitesFile);
PolymorphicSites = siteFileReader.GetPolymorphicSites();
NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites();
numpolymorphicsites = PolymorphicSites.length;
numnonpolymorphicsites = NonPolymorphicSites.length;
if (!intervalFile.equals("")){
TextFileReader fileReader = new TextFileReader();
fileReader.ReadFile(intervalFile);
String[] lines = fileReader.GetLines();
intervals = new int[lines.length][2];
for (int i = 0; i < lines.length; i++) {
String[] s = lines[i].split(":");
String[] intervalPieces = s[1].split("-");
intervals[i][0] = Integer.valueOf(intervalPieces[0]);
intervals[i][1] = Integer.valueOf(intervalPieces[1]);
}
numIntervals = intervals.length;
}
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
out.printf("INFO Comparing reads to database ...\n");
@ -167,7 +170,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
//Polymorphic sites: always increment denominator, increment numerator when bases are concordant
for (int j = 0; j < numpolymorphicsites; j++){
pos = PolymorphicSites[j];
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop && IsWithinInterval(pos)){
c1 = s1.charAt(pos-readstart);
c2 = s2.charAt(pos-allelestart);
if (c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
@ -176,7 +179,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
nummatched[i]++;
}else{
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, pos,c1,c2);
}
}
}
@ -187,13 +190,13 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
if (numcompared[i] > 0){
for (int j = 0; j < numnonpolymorphicsites; j++){
pos = NonPolymorphicSites[j];
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop && IsWithinInterval(pos)){
c1 = s1.charAt(pos-readstart);
c2 = s2.charAt(pos-allelestart);
if (c1 != c2 && c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors)
numcompared[i]++;
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
}
}
}
@ -205,7 +208,7 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
concordance[i]=nummatched[i]/numcompared[i];
if (concordance[i] > maxConcordance){maxConcordance = concordance[i];}
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames[i],concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames[i],concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
}
if (findFirst && (concordance[i] == 1)){
break;
@ -228,32 +231,42 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
return maxFreq;
}
/**
 * Tests whether a genomic position falls inside one of the intervals loaded
 * from the -useInterval file.
 *
 * Fix: when no interval file was supplied, numIntervals is 0 (and intervals is
 * null), and the original implementation returned false for every position —
 * which made the callers skip ALL polymorphic-site comparisons. With no
 * intervals loaded, no positional restriction should apply, so return true.
 *
 * @param pos genomic position to test (same coordinate system as the interval file)
 * @return true if pos lies within any loaded interval, or if no intervals were loaded
 */
private boolean IsWithinInterval(int pos){
    //No -useInterval file given: do not restrict positions.
    if (numIntervals == 0){
        return true;
    }
    for (int i = 0; i < numIntervals; i++){
        if (pos >= intervals[i][0] && pos <= intervals[i][1]){
            return true;
        }
    }
    return false;
}
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
//Calculate concordance for this read and all overlapping reads
if (read.getMappingQuality() > 0){
double maxConcordance = CalculateConcordance(read);
String stats = "", topAlleles = "";
if (maxConcordance > 0){
String readname = read.getReadName(), allelename = ""; double freq;
//For input bam files that contain HLA alleles, find and print allele frequency
//freq = GetAlleleFrequency(readname);
out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd());
//Find the maximum frequency of the alleles most concordant with the read
//double maxFreq = FindMaxAlleleFrequency(maxConcordance);
//Print concordance statistics between this read and the most similar HLA allele(s)
for (int i = 0; i < HLAreads.length; i++){
if (concordance[i] == maxConcordance){
freq = GetAlleleFrequency(HLAnames[i]);
//if (freq == maxFreq){
out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
//}
break;
//freq = GetAlleleFrequency(HLAnames[i]);
if (topAlleles.equals("")){
topAlleles = HLAnames[i];
}else{
topAlleles = topAlleles + "," + HLAnames[i];
}
stats = String.format("%.1f\t%.3f\t%.0f\t%.0f",1.0,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
}
}
out.print("\n");
out.printf("\t%s\t%s\n",stats,topAlleles);
}
}
return 1;

View File

@ -29,18 +29,12 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
public boolean ONLYFREQUENT = false;
//String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
SAMFileReader HLADictionaryReader = new SAMFileReader();
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_FREQUENCIES.txt";
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
//String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
String AlleleFrequencyFile;
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_POLYMORPHIC_SITES.txt";
String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_DICTIONARY.txt";
HLAFileReader HLADictionaryReader = new HLAFileReader();
boolean DatabaseLoaded = false;
boolean DEBUG = false;
@ -69,8 +63,8 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
out.printf("INFO Loading HLA dictionary ... ");
HLADictionaryReader.ReadFile(HLAdatabaseFile);
HLAreads = HLADictionaryReader.GetReads();
HLAnames = HLADictionaryReader.GetReadNames();
HLAreads = HLADictionaryReader.GetSequences();
HLAnames = HLADictionaryReader.GetNames();
HLAstartpos = HLADictionaryReader.GetStartPositions();
HLAstoppos = HLADictionaryReader.GetStopPositions();
minstartpos = HLADictionaryReader.GetMinStartPos();
@ -82,18 +76,6 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
concordance = new double[HLAreads.length];
numcompared = new double[HLAreads.length];
//Read allele frequencies
if (ethnicity.equals("Black")){
AlleleFrequencyFile = BlackAlleleFrequencyFile;
}else{
AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
}
out.printf("INFO Reading HLA allele frequencies ... ");
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile);
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
FindPolymorphicSites(minstartpos,maxstoppos);
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
@ -108,13 +90,15 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
private void FindPolymorphicSites(int start, int stop){
boolean initialized, polymorphic, examined;
char c = ' ';
char c = ' ', ch = ' ';
int A = 0, C = 0, G = 0, T = 0;
ArrayList<Integer> polymorphicsites = new ArrayList<Integer>();
ArrayList<Integer> nonpolymorphicsites = new ArrayList<Integer>();
//Find polymorphic sites in dictionary
for (int pos = start; pos <= stop; pos++){
initialized = false; polymorphic = false; examined = false;
//look across all alleles at specific position to see if it is polymorphic
A = 0; C = 0; G = 0; T = 0;
for (int i = 0; i < HLAreads.length; i++){
if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){
if (!initialized){
@ -122,19 +106,28 @@ public class FindPolymorphicSitesWalker extends ReadWalker<Integer, Integer> {
initialized = true;
examined = true;
}
if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){
polymorphicsites.add(pos);
out.printf("POLYMORPHIC\t6\t%s\n", pos);
ch = HLAreads[i].charAt(pos-HLAstartpos[i]);
if (ch == 'A'){A++;}
else if (ch == 'C'){C++;}
else if (ch == 'T'){T++;}
else if (ch == 'G'){G++;}
if (ch != c){
// polymorphicsites.add(pos);
// out.printf("POLYMORPHIC\t6\t%s\n", pos);
polymorphic = true;
break;
// break;
}
}
}
if (polymorphic){
out.printf("%s\t%s\t%s\t%s\t%s\n",pos,A,C,G,T);
}
//if (!polymorphic && examined){
// nonpolymorphicsites.add(pos);
// out.printf("CONSERVED\t6\t%s\n", pos);
//}
}
if (!polymorphic && examined){
nonpolymorphicsites.add(pos);
out.printf("CONSERVED\t6\t%s\n", pos);
}
}
PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]);
NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]);

View File

@ -7,41 +7,68 @@ import java.util.Hashtable;
* @author shermanjia
*/
public class FrequencyFileReader {
Hashtable AlleleFrequencies = new Hashtable();
Hashtable UniqueAlleles = new Hashtable();
Hashtable MaxFrequencies = new Hashtable();
Hashtable CommonAlleles = new Hashtable();
Hashtable [] AlleleFrequencies = null;
String [] Populations = null;
public Hashtable GetAlleleFrequencies(){
public Hashtable [] GetAlleleFrequencies(){
//return allele frequencies for all populations
return AlleleFrequencies;
}
public Hashtable GetUniqueAlleles(){
return UniqueAlleles;
public Hashtable GetCommonAlleles(){
//return list of common alleles
return CommonAlleles;
}
public void ReadFile(String filename, String uniqueAllelesFile){
public Hashtable GetMaxFrequencies(){
//return maximum frequency observed across populations for each allele
return MaxFrequencies;
}
public String[] GetPopulations(){
//Return name of populations
return Populations;
}
public void ReadFile(String filename, String ethnicity){
try{
int linenum = 0;
FileInputStream fstream = new FileInputStream(filename);
DataInputStream in = new DataInputStream(fstream);
BufferedReader br = new BufferedReader(new InputStreamReader(in));
String strLine; String [] s = null;
//Read File Line By Line
while ((strLine = br.readLine()) != null) {
linenum++;
s = strLine.split("\\t");
AlleleFrequencies.put(s[0], s[1]);
//System.out.printf("Loaded: %s\t%s\n",s[0],AlleleFrequencies.get(s[0]).toString());
}
in.close();
fstream = new FileInputStream(uniqueAllelesFile);
in = new DataInputStream(fstream);
br = new BufferedReader(new InputStreamReader(in));
//Read File Line By Line
while ((strLine = br.readLine()) != null) {
UniqueAlleles.put(strLine,strLine);
//System.out.printf("Loaded: %s\t%s\n",s[0],AlleleFrequencies.get(s[0]).toString());
if (linenum == 1){
//Determine number of populations, create a hash table for each population
AlleleFrequencies = new Hashtable[s.length-1];
Populations = new String[s.length-1];
for (int i = 1; i < s.length; i++){
Populations[i-1]=s[i];
AlleleFrequencies[i-1] = new Hashtable();
}
}else{
//assign allele frequencies for each population
for (int i = 1; i < s.length; i++){
if (Double.valueOf(s[i]) > 0.0001){
CommonAlleles.put(s[0], s[0]);
}
AlleleFrequencies[i-1].put(s[0],s[i]);
if (!MaxFrequencies.containsKey(s[0])){
MaxFrequencies.put(s[0], s[i]);
}else if (Double.valueOf(MaxFrequencies.get(s[0]).toString()) < Double.valueOf(s[i])){
MaxFrequencies.put(s[0], s[i]);
}
}
}
}
in.close();
}catch (Exception e){//Catch exception if any
System.err.println("FrequencyFileReader Error: " + e.getMessage());
System.err.println("Exception in FrequencyFileReader (" + e.getMessage() + ").");
}
}
}

View File

@ -0,0 +1,737 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.commandline.Argument;
import java.util.*;
import java.util.Map.Entry;
/**
* Calculates the likelihood of observing data given phase info from pairs of HLA alleles. Note: Run FindClosestAlleleWalker first! Usage: java -jar $GATK -T CalculatePhaseLikelihoods -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -phaseInterval /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl INPUT.baselikelihoods [-filter $ID.filter -minAllowedMismatches 7] -ethnicity Caucasian | grep -v "INFO" | grep -v "DEBUG" | grep -v "DONE!" > OUTPUT.phaselikelihoods
* @author shermanjia
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class HLACallerWalker extends ReadWalker<Integer, Integer> {
@Argument(fullName = "baseLikelihoods", shortName = "bl", doc = "Base likelihoods file", required = true)
public String baseLikelihoodsFile = "";
// --- Command-line arguments ---
@Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false)
public boolean DEBUG = false;
// Output of the similarity pre-processing step; determines reads to discard
// and which alleles to search (see SimilarityFileReader usage in reduceInit).
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
public String filterFile = "";
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
public String ethnicity = "CaucasiansUSA";
// Comma-separated pair of allele names to trace in detail (see onTraversalDone).
@Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false)
public String debugAlleles = "";
@Argument(fullName = "phaseInterval", shortName = "phaseInterval", doc = "Use only these intervals in phase calculation", required = false)
public String phaseIntervalFile = "";
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
public boolean ONLYFREQUENT = false;
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
public int MINALLOWEDMISMATCHES = 7;
GATKArgumentCollection args = this.getToolkit().getArguments();
// --- Hard-coded resource locations (Broad cluster paths) ---
String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_FREQUENCIES.txt";
String PolymorphicSitesfile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_POLYMORPHIC_SITES.txt";
String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_DICTIONARY.txt";
// --- State initialized once in reduceInit() ---
HLAFileReader HLADictionaryReader = new HLAFileReader();
boolean HLAdataLoaded = false;   // guards the one-time loading in reduceInit()
String[] HLAnames, HLAreads, Populations;
ArrayList<String> ReadsToDiscard;          // reads rejected by the similarity filter
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites;
// Pairwise SNP co-observation counts for phasing: numObservations is indexed
// by (siteIndex*5 + baseIndex) on both axes; totalObservations by site pair.
int[][] numObservations, totalObservations, intervals;
int[] SNPnumInRead, SNPposInRead, positions;   // scratch arrays reused per read / per allele pair
CigarParser cigarparser = new CigarParser();
Hashtable MaxLikelihoods = new Hashtable();    // best combined likelihood seen per locus
Hashtable MaxFrequencies, CommonAlleles, AlleleCount, LocusCount;
Hashtable[] AlleleFrequencies;                 // one allele-frequency table per population
int numIntervals;
double[][] baseLikelihoods;                    // per-position diploid genotype likelihoods
ArrayList AllelesToSearch = new ArrayList<String>();
// setting error rates for phasing algorithm (1% expected error rate for any genotype)
double P_err = 0.01;
double P_correct = 1 - P_err;
double L_err = Math.log10(P_err);
double L_correct = Math.log10(P_correct);
public Integer reduceInit() {
    // One-time setup (guarded by HLAdataLoaded): loads the HLA allele
    // dictionary, the similarity pre-filter output, the base likelihoods,
    // the population allele frequencies, and optional phasing intervals.
    if (!HLAdataLoaded){
        HLAdataLoaded = true;
        //Load HLA dictionary
        HLADictionaryReader.ReadFile(HLAdatabaseFile);
        HLAreads = HLADictionaryReader.GetSequences();
        HLAnames = HLADictionaryReader.GetNames();
        HLAstartpos = HLADictionaryReader.GetStartPositions();
        HLAstoppos = HLADictionaryReader.GetStopPositions();
        //Load pre-processing file for misaligned reads and list of alleles to search
        SimilarityFileReader similarityReader = new SimilarityFileReader();
        similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES);
        ReadsToDiscard = similarityReader.GetReadsToDiscard();
        AllelesToSearch = similarityReader.GetAllelesToSearch();
        AlleleCount = similarityReader.GetAlleleCount();
        LocusCount = similarityReader.GetLocusCount();
        for (int i = 0; i < AllelesToSearch.size(); i++){
            out.printf("INFO\tAllelesToSearch\t%s\t%s\n",AllelesToSearch.get(i),AlleleCount.get(AllelesToSearch.get(i)));
        }
        //Load genotypes and find polymorphic sites (sites that differ from reference)
        BaseLikelihoodsFileReader baseLikelihoodsReader = new BaseLikelihoodsFileReader();
        baseLikelihoodsReader.ReadFile(baseLikelihoodsFile, true);
        baseLikelihoods = baseLikelihoodsReader.GetBaseLikelihoods();
        positions = baseLikelihoodsReader.GetPositions();
        PolymorphicSites = baseLikelihoodsReader.GetPolymorphicSites();
        out.printf("INFO\t%s polymorphic sites found\n",PolymorphicSites.length);
        // Scratch arrays sized to the polymorphic-site count. The *5 factor
        // leaves 5 slots per site: indices for A/C/G/T plus one reserved
        // (presumably for deletions -- see the commented 'D' case in IndexOf).
        int l = PolymorphicSites.length;
        SNPnumInRead = new int[l];
        SNPposInRead = new int[l];
        numObservations = new int[l*5][l*5];
        totalObservations = new int[l][l];
        //Load allele frequencies for different populations
        FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
        HLAfreqReader.ReadFile(AlleleFrequencyFile,ethnicity);
        AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
        MaxFrequencies = HLAfreqReader.GetMaxFrequencies();
        CommonAlleles = HLAfreqReader.GetCommonAlleles();
        Populations = HLAfreqReader.GetPopulations();
        //Load genomic intervals for bam file (lines formatted "chr:start-stop")
        if (!phaseIntervalFile.equals("")){
            TextFileReader fileReader = new TextFileReader();
            fileReader.ReadFile(phaseIntervalFile);
            String[] lines = fileReader.GetLines();
            intervals = new int[lines.length][2];
            for (int i = 0; i < lines.length; i++) {
                String[] s = lines[i].split(":");
                String[] intervalPieces = s[1].split("-");
                intervals[i][0] = Integer.valueOf(intervalPieces[0]);
                intervals[i][1] = Integer.valueOf(intervalPieces[1]);
            }
            numIntervals = intervals.length;
        }
    }
    return 0;
}
/**
 * Per-read traversal step: reads rejected by the similarity pre-filter
 * contribute nothing; every other read updates the pairwise SNP phasing
 * counts via UpdateCorrelation. Always reports one read processed.
 */
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
    boolean discarded = ReadsToDiscard.contains(read.getReadName());
    if (!discarded) {
        UpdateCorrelation(read);
    }
    return 1;
}
/**
 * Accumulates the running read tally: adds this read's count (from map())
 * to the sum carried so far.
 */
public Integer reduce(Integer value, Integer sum) {
    Integer total = sum + value;
    return total;
}
/**
 * Final pass after read traversal: (1) scores every candidate allele pair
 * (genotype likelihood + phase likelihood + log10 population frequencies),
 * (2) normalizes per locus into posterior probabilities, (3) aggregates the
 * full-resolution pairs at 4-digit resolution, and (4) prints ranked calls
 * plus per-population frequency columns.
 *
 * BUGFIX: the 4-digit aggregation previously combined each row's allele and
 * phase terms with the frequency terms (f1/f2) of the PREVIOUS row (0.0 on
 * the first row); the frequencies are now parsed before they are used.
 */
public void onTraversalDone(Integer numreads) {
    String name1, name2, d4_name1, d4_name2;
    Double frq1 = 0.0, frq2 = 0.0, log1 = 0.0, log2 = 0.0, alleleLikelihood = 0.0, phaseLikelihood = 0.0, minfrq = 0.0, likelihood = 0.0;
    int numCombinations = 0;

    // For debugging specific alleles: print detailed traces for one pair.
    if (!debugAlleles.equals("")){
        String s[] = debugAlleles.split(",");
        int index1 = HLADictionaryReader.GetIndex(s[0]);
        int index2 = HLADictionaryReader.GetIndex(s[1]);
        out.printf("INFO: debugging %s\t%s\t%s\t%s\n",s[0],s[1],index1,index2);
        if (index1 > -1 && index2 > -1){
            alleleLikelihood = CalculateAlleleLikelihood(index1,index2,HLAreads,true);
            phaseLikelihood = CalculatePhaseLikelihood(index1,index2,true,false);
        }
    }

    if (ONLYFREQUENT){
        // NOTE(review): minfrq is assigned but never consulted below, so the
        // -onlyfrequent flag currently has no effect here. Confirm intent.
        minfrq = 0.0001;
    }

    ArrayList Output = new ArrayList<String>();
    ArrayList Likelihoods = new ArrayList<Double>();

    // Search pairs of alleles that satisfy the initial search criteria.
    // Allele 1:
    for (int i = 0; i < HLAnames.length; i++){
        name1 = HLAnames[i].substring(4);
        String [] n1 = name1.split("\\*");
        d4_name1 = n1[0] + "*" + n1[1].substring(0, 4);
        if (AllelesToSearch.contains(d4_name1)){
            if (MaxFrequencies.containsKey(d4_name1)){
                frq1 = Double.parseDouble(MaxFrequencies.get(d4_name1).toString());
            }else{
                // Alleles absent from the frequency table get a small pseudo
                // frequency; null-expression ("N") alleles get a smaller one.
                if (n1[1].length() > 4 && n1[1].substring(4, 5).equals("N")){
                    frq1 = .00000005;
                }else{
                    frq1 = .000001;
                }
            }
            // Allele 2 (j >= i: each unordered pair is evaluated once):
            for (int j = i; j < HLAnames.length; j++){
                name2 = HLAnames[j].substring(4);
                String [] n2 = name2.split("\\*");
                d4_name2 = n2[0] + "*" + n2[1].substring(0, 4);
                // Both alleles must be at the same locus and in the search set.
                if (n1[0].equals(n2[0]) && (AllelesToSearch.contains(d4_name2))){
                    if (MaxFrequencies.containsKey(d4_name2)){
                        frq2 = Double.parseDouble(MaxFrequencies.get(d4_name2).toString());
                    }else{
                        if (n2[1].length() > 4 && n2[1].substring(4, 5).equals("N")){
                            frq2 = .00000005;
                        }else{
                            frq2 = .000001;
                        }
                    }
                    //Calculate allele and phase likelihoods for each allele pair
                    alleleLikelihood = CalculateAlleleLikelihood(i,j,HLAreads,false);
                    numCombinations++;
                    //A negative log likelihood means there was data at this pair.
                    if (alleleLikelihood < 0){
                        phaseLikelihood = CalculatePhaseLikelihood(i,j,false,false);
                        log1=Math.log10(frq1);
                        log2=Math.log10(frq2);
                        // Combined score: genotype + phase + log prior frequencies.
                        likelihood = alleleLikelihood+phaseLikelihood+log1+log2;
                        // Track the per-locus maximum for later normalization.
                        if (!MaxLikelihoods.containsKey(n1[0])){MaxLikelihoods.put(n1[0], likelihood);}
                        if (likelihood > (Double) MaxLikelihoods.get(n1[0])) {
                            MaxLikelihoods.put(n1[0], likelihood);
                        }
                        Likelihoods.add(likelihood);
                        String data = String.format("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f",n1[0],name1,name2,alleleLikelihood,phaseLikelihood,log1,log2,likelihood);
                        Output.add(data);
                        out.printf("INFO\t%s\n",data);
                    }
                }
            }
        }
    }

    //Print output header (one frequency column per population).
    out.printf("Locus\tA1\tA2\tGeno\tPhase\tFrq1\tFrq2\tL\tProb\tReads1\tReads2\tLocus\tEXP");
    for (int i = 0; i < Populations.length; i++){
        out.printf("\t%s",Populations[i]);
    }
    out.printf("\n");

    //Calculate probabilities for each locus.
    Double probSum = 0.0, prob = 0.0, f1 = 0.0, f2 = 0.0, aLikelihood4 = 0.0, pLikelihood4 = 0.0;
    Integer count = 0;
    Hashtable HLA4DigitProbs = new Hashtable();
    Hashtable HLA4DigitLs = new Hashtable();
    Hashtable HLA4DigitCount = new Hashtable();
    Hashtable HLA4DigitF1 = new Hashtable();
    Hashtable HLA4DigitF2 = new Hashtable();
    Hashtable HLA4DigitA = new Hashtable();
    Hashtable HLA4DigitP = new Hashtable();
    String key;
    Enumeration keys = LocusCount.keys();
    while (keys.hasMoreElements()){
        String locus = keys.nextElement().toString();
        probSum = 0.0;
        ArrayList localOutput = new ArrayList<String>();
        ArrayList localLikelihoods = new ArrayList<Double>();
        //Shift by the locus maximum and sum 10^L over pairs to normalize.
        for (int j = 0; j < Output.size(); j++){
            String data = Output.get(j).toString();
            String [] d = data.split("\\t");
            if (d[0].equals(locus)){
                localOutput.add(data);
                likelihood = (Double)Likelihoods.get(j)-(Double)MaxLikelihoods.get(locus);
                localLikelihoods.add(likelihood);
                probSum = probSum + Math.pow(10, likelihood);
            }
        }
        //Aggregate statistics for 4-digit types (pairs above 0.5% posterior).
        String [] s, s1, s2;
        int n = 0;
        for (int j = 0; j < localOutput.size(); j++){
            String data = localOutput.get(j).toString();
            prob = Math.pow(10, (Double)localLikelihoods.get(j))/probSum;
            if (prob > 0.005){
                s = data.split("\\t");
                s1 = s[1].split("\\*");
                s2 = s[2].split("\\*");
                String a1 = s1[0] + "*" + s1[1].substring(0,4);
                String a2 = s2[0] + "*" + s2[1].substring(0,4);
                key = a1 + "," + a2;
                aLikelihood4 = Double.valueOf(s[3]);
                pLikelihood4 = Double.valueOf(s[4]);
                // BUGFIX: read this row's frequency terms BEFORE combining;
                // previously f1/f2 still held the previous row's values here.
                f1 = Double.valueOf(s[5]);
                f2 = Double.valueOf(s[6]);
                likelihood = aLikelihood4 + pLikelihood4 + f1 + f2;
                if (!HLA4DigitProbs.containsKey(key)){
                    HLA4DigitProbs.put(key, prob);
                    HLA4DigitLs.put(key, likelihood);
                    HLA4DigitCount.put(key, 1);
                    HLA4DigitF1.put(key,f1);
                    HLA4DigitF2.put(key,f2);
                    HLA4DigitA.put(key,aLikelihood4);
                    HLA4DigitP.put(key,pLikelihood4);
                }else{
                    // Same 4-digit pair reached via another full-resolution
                    // pair: accumulate sums so averages can be reported below.
                    prob = prob + Double.valueOf(HLA4DigitProbs.get(key).toString());
                    HLA4DigitProbs.put(key, prob);
                    likelihood = likelihood + Double.valueOf(HLA4DigitLs.get(key).toString());
                    HLA4DigitLs.put(key, likelihood);
                    n = Integer.valueOf(HLA4DigitCount.get(key).toString()) + 1;
                    HLA4DigitCount.put(key, n);
                    aLikelihood4 = aLikelihood4 + Double.valueOf(HLA4DigitA.get(key).toString());
                    HLA4DigitA.put(key, aLikelihood4);
                    pLikelihood4 = pLikelihood4 + Double.valueOf(HLA4DigitP.get(key).toString());
                    HLA4DigitP.put(key, pLikelihood4);
                }
            }
        }
    }

    //Print results, sorted by 4-digit allele-pair key.
    String K = ""; String [] s, s1, s2;
    double count1, count2, locusCount, accountedFor;
    Vector v = new Vector(HLA4DigitProbs.keySet());
    Collections.sort(v);
    for (Enumeration e = v.elements(); e.hasMoreElements();) {
        K = (String)e.nextElement();
        prob = (Double) HLA4DigitProbs.get(K);
        likelihood = (Double) HLA4DigitLs.get(K);
        count = (Integer) HLA4DigitCount.get(K);
        s = K.split("\\,");
        s1 = s[0].split("\\*"); name1 = s1[1];
        s2 = s[1].split("\\*"); name2 = s2[1];
        aLikelihood4 = (Double) HLA4DigitA.get(K);
        pLikelihood4 = (Double) HLA4DigitP.get(K);
        f1 = (Double) HLA4DigitF1.get(K);
        f2 = (Double) HLA4DigitF2.get(K);
        count1 = Double.valueOf(AlleleCount.get(s[0]).toString());
        count2 = Double.valueOf(AlleleCount.get(s[1]).toString());
        locusCount = Double.valueOf(LocusCount.get(s1[0]).toString());
        // Fraction of this locus's reads explained by the called pair.
        if (s[0].equals(s[1])){
            accountedFor = count1 / locusCount;
        }else{
            accountedFor = (count1 + count2) / locusCount;
        }
        // Confident calls (posterior > 0.1) are printed as plain rows...
        if (prob > 0.1){
            out.printf("%s\t%s\t%s\t%.1f\t%.1f\t%.2f\t%.2f\t%.1f\t%.2f\t%.0f\t%.0f\t%.0f\t%.2f",s1[0],name1,name2,aLikelihood4/count,pLikelihood4/count,f1,f2,likelihood/count,prob,count1,count2,locusCount,accountedFor);
            // NOTE(review): this loop clobbers f1/f2, so the INFO row below
            // then shows the last population's frequencies, not the stored
            // ones -- preserved as-is; confirm whether intentional.
            for (int i = 0; i < Populations.length; i++){
                if (AlleleFrequencies[i].containsKey(s[0])){f1 = Double.valueOf(AlleleFrequencies[i].get(s[0]).toString());}else{f1=.000001;}
                if (AlleleFrequencies[i].containsKey(s[1])){f2 = Double.valueOf(AlleleFrequencies[i].get(s[1]).toString());}else{f2=.000001;}
                if (!Double.isInfinite(-1*Math.log10(f1*f2))){out.printf("\t%.2f",Math.log10(f1*f2));}else{out.printf("\t-INF");}
            }
            out.print("\n");
        }
        // ...and every aggregated pair is printed as an INFO row.
        out.printf("INFO\t%s\t%s\t%s\t%.1f\t%.1f\t%.2f\t%.2f\t%.1f\t%.2f\t%.0f\t%.0f\t%.0f\t%.2f",s1[0],name1,name2,aLikelihood4/count,pLikelihood4/count,f1,f2,likelihood/count,prob,count1,count2,locusCount,accountedFor);
        for (int i = 0; i < Populations.length; i++){
            if (AlleleFrequencies[i].containsKey(s[0])){f1 = Double.valueOf(AlleleFrequencies[i].get(s[0]).toString());}else{f1=.000001;}
            if (AlleleFrequencies[i].containsKey(s[1])){f2 = Double.valueOf(AlleleFrequencies[i].get(s[1]).toString());}else{f2=.000001;}
            if (!Double.isInfinite(-1*Math.log10(f1*f2))){out.printf("\t%.2f",Math.log10(f1*f2));}else{out.printf("\t-INF");}
        }
        out.print("\n");
    }
}
// Comparator over Double values; equivalent to the natural ordering.
// NOTE(review): not referenced anywhere in this file's visible code --
// possibly left over from an earlier sorting approach. Confirm before removing.
Comparator<Double> valueComparator = new Comparator<Double>() {
    @Override public int compare(Double val1, Double val2) {
        return val1.compareTo(val2);
    }
};
/**
 * Enumerates every genomic position in the HLA-A, HLA-C and HLA-B gene
 * regions (b36 coordinates), concatenated in that order: A, then C, then B.
 *
 * @return one Integer per position, inclusive of both range endpoints
 */
private Integer[] InitializePolymorphicSites(){
    int[][] geneRanges = {
        {30018310, 30021211},   // HLA-A
        {31344925, 31347827},   // HLA-C
        {31430239, 31432914},   // HLA-B
    };
    int total = 0;
    for (int[] range : geneRanges){
        total += range[1] - range[0] + 1;
    }
    Integer[] polymorphicSites = new Integer[total];
    int k = 0;
    for (int[] range : geneRanges){
        for (int pos = range[0]; pos <= range[1]; pos++){
            polymorphicSites[k++] = pos;
        }
    }
    return polymorphicSites;
}
/**
 * Maps a base to its array column: A=0, C=1, G=2, T=3; -1 for anything else.
 * Deletions ('D') are deliberately not indexed (see the reserved 5th slot
 * in the observation arrays).
 */
private int IndexOf(char c){
    return "ACGT".indexOf(c);
}
/**
 * Returns true when pos falls inside any loaded phase interval (inclusive
 * on both ends). With no intervals loaded (numIntervals == 0) nothing matches.
 */
private boolean IsWithinInterval(int pos){
    for (int i = 0; i < numIntervals; i++){
        int lo = intervals[i][0];
        int hi = intervals[i][1];
        if (lo <= pos && pos <= hi){
            return true;
        }
    }
    return false;
}
private void UpdateCorrelation(SAMRecord read){
    //Updates correlation table with SNPs from specific read (for phasing).
    // For every pair of polymorphic sites covered by this read, increments
    // numObservations[site1*5+base1][site2*5+base2] and totalObservations[site1][site2].
    String s = cigarparser.FormatRead(read.getCigarString(), read.getReadString());
    // NOTE(review): SNPsInRead and readindex are allocated but never used below.
    ArrayList<Integer> SNPsInRead = new ArrayList<Integer>();
    ArrayList<Integer> readindex = new ArrayList<Integer>();
    int readstart = read.getAlignmentStart();
    int readend = read.getAlignmentEnd();
    int numPositions = PolymorphicSites.length;
    char c1, c2;
    int a, b, i, j, SNPcount = 0;
    //Find all SNPs in read. Bounds are exclusive, so polymorphic sites
    //exactly at the read's first/last aligned base are skipped -- confirm intent.
    for (i = 0; i < numPositions; i++){
        if (PolymorphicSites[i] > readstart && PolymorphicSites[i] < readend){
            SNPnumInRead[i] = SNPcount;
            SNPposInRead[i] = PolymorphicSites[i]-readstart;
            SNPcount++;
        }else{
            // -1 marks sites this read does not cover.
            SNPnumInRead[i] = -1;
            SNPposInRead[i] = -1;
        }
    }
    //Update correlation table; for each combination of SNP positions
    for (i = 0; i < numPositions; i++){
        if (SNPnumInRead[i] > -1){
            c1 = s.charAt(SNPposInRead[i]);
            if (IndexOf(c1) > -1){
                for (j = i+1; j < numPositions; j ++){
                    if (SNPnumInRead[j] > -1){
                        c2 = s.charAt(SNPposInRead[j]);
                        if (IndexOf(c2) > -1){
                            // Row/column encode site index and observed base together
                            // (5 slots per site; only 4 base indices are used).
                            a = i*5 + IndexOf(c1);
                            b = j*5 + IndexOf(c2);
                            numObservations[a][b]++;
                            totalObservations[i][j]++;
                            if (DEBUG){
                                //out.printf("INFO %s\t%s %s\t[i=%s,j=%s]\t[%s,%s]\t[%s,%s]\n",read.getReadName(),PolymorphicSites[i],PolymorphicSites[j],i,j,c1,c2,a,b);
                            }
                        }
                    }
                }
            }
        }
    }
}
/**
 * Maps an unordered base pair to its diploid genotype index:
 * AA=0 AC=1 AG=2 AT=3 CC=4 CG=5 CT=6 GG=7 GT=8 TT=9; -1 if either base
 * is not one of A/C/G/T. Symmetric: GenotypeIndex(a,b) == GenotypeIndex(b,a).
 *
 * Rewritten from nested switches that had no break statements: an invalid
 * second base silently fell through several outer case bodies before
 * reaching the default. The result happened to be correct (-1), but the
 * control flow was accidental and fragile; this version is explicit and
 * produces the identical mapping.
 */
private int GenotypeIndex(char a, char b){
    int i = "ACGT".indexOf(a);
    int j = "ACGT".indexOf(b);
    if (i < 0 || j < 0) {
        return -1;
    }
    if (i > j) {  // order-independent: normalize to the upper triangle
        int tmp = i;
        i = j;
        j = tmp;
    }
    // Row offset i*(7-i)/2 enumerates the upper-triangular 4x4 genotype matrix:
    // row A starts at 0, C at 4, G at 7, T at 9.
    return i * (7 - i) / 2 + j;
}
private double CalculateAlleleLikelihood(int a1, int a2, String[] HLAalleles, boolean debug){
    //Calculates the (log-scale) genotype likelihood for a specific allele pair:
    // at every position with base likelihoods that lies strictly inside BOTH
    // allele sequences, adds the likelihood of the diploid genotype implied
    // by the two alleles' bases at that position.
    String read1 = HLAalleles[a1];
    String read2 = HLAalleles[a2];
    int start1 = HLAstartpos[a1];
    int start2 = HLAstartpos[a2];
    int stop1 = HLAstoppos[a1];
    int stop2 = HLAstoppos[a2];
    double likelihood = 0;
    int pos, index;
    char c1, c2;
    for (int i = 0; i < positions.length; i++){
        pos = positions[i];
        // Strict inequalities: positions exactly at an allele's start/stop
        // coordinate are excluded -- confirm intent.
        if (pos < stop1 && pos > start1 && pos < stop2 && pos > start2){
            index = GenotypeIndex(read1.charAt(pos-start1),read2.charAt(pos-start2));
            // index == -1 means a non-ACGT character; such positions are skipped.
            if (index > -1){
                likelihood = likelihood + baseLikelihoods[i][index];
                if (debug){
                    c1 = read1.charAt(pos-start1);
                    c2 = read2.charAt(pos-start2);
                    out.printf("INFO: DEBUG %s\t%s\t%s\t%s\t%s\t%s\t%.2f\n",HLAnames[a1],HLAnames[a2],pos,c1,c2,index,likelihood);
                }
            }
        }
    }
    return likelihood;
}
private double CalculatePhaseLikelihood(int alleleIndex1, int alleleIndex2, boolean PRINTDEBUG, boolean SINGLEALLELE){
    //calculate the likelihood that the particular combination of alleles satisfies the phase count data.
    // For each pair of polymorphic sites covered by both alleles, compares the
    // alleles' base pairs against the read co-observation counts gathered in
    // UpdateCorrelation. When SINGLEALLELE is true the return value is instead
    // the running in-phase fraction (sumInPhase / sumObservations).
    double likelihood = 0, prob = 0;
    int readstart1 = HLAstartpos[alleleIndex1]; int readend1 = HLAstoppos[alleleIndex1];
    int readstart2 = HLAstartpos[alleleIndex2]; int readend2 = HLAstoppos[alleleIndex2];
    // Restrict to the region covered by both alleles.
    int combinedstart = Math.max(readstart1,readstart2);
    int combinedstop = Math.min(readend1,readend2);
    int numPositions = PolymorphicSites.length, SNPcount = 0;
    int i, j, a1, a2, b1, b2;
    char c11, c12, c21, c22;
    int numInPhase = 0, numOutOfPhase = 0;
    double sumInPhase = 0.0, sumObservations = 0.0;
    //Find all SNPs in read (strict bounds exclude the overlap endpoints)
    for (i = 0; i < numPositions; i++){
        if (PolymorphicSites[i] > combinedstart && PolymorphicSites[i] < combinedstop ){ // && IsWithinInterval(PolymorphicSites[i])
            if (PRINTDEBUG){
                out.printf("DEBUG\t%s\t%s\n",PolymorphicSites[i],SNPcount);
            }
            SNPnumInRead[i] = SNPcount;
            SNPposInRead[i] = PolymorphicSites[i]-combinedstart;
            SNPcount++;
        }else{
            SNPnumInRead[i] = -1;
            SNPposInRead[i] = -1;
        }
    }
    String s1 = HLAreads[alleleIndex1];
    String s2 = HLAreads[alleleIndex2];
    if (PRINTDEBUG){
        out.printf("DEBUG %s SNPs found in %s and %s between %s and %s\n",SNPcount,HLAnames[alleleIndex1], HLAnames[alleleIndex2],combinedstart,combinedstop);
    }
    //Iterate through every pairwise combination of SNPs, and update likelihood for the allele combination
    for (i = 0; i < numPositions; i++){
        if (SNPnumInRead[i] > -1){
            c11 = s1.charAt(SNPposInRead[i]);
            c21 = s2.charAt(SNPposInRead[i]);
            if (IndexOf(c11) > -1 && IndexOf(c21) > -1){
                for (j = i+1; j < numPositions; j ++){
                    if (SNPnumInRead[j] > -1 && totalObservations[i][j] > 0){
                        c12 = s1.charAt(SNPposInRead[j]);
                        c22 = s2.charAt(SNPposInRead[j]);
                        if (IndexOf(c12) > -1 && IndexOf(c22) > -1){
                            // Encoded (site, base) indices into numObservations.
                            a1 = i*5 + IndexOf(c11);
                            b1 = j*5 + IndexOf(c12);
                            a2 = i*5 + IndexOf(c21);
                            b2 = j*5 + IndexOf(c22);
                            //check if the two alleles are identical at the chosen 2 locations
                            //(avoid double-counting the same observation cell when they are)
                            if ((c11 == c21) && (c12 == c22)){
                                numInPhase = numObservations[a1][b1];
                            }else{
                                numInPhase = numObservations[a1][b1] + numObservations[a2][b2];
                            }
                            numOutOfPhase = totalObservations[i][j] - numInPhase;
                            sumInPhase += (double) numInPhase;
                            sumObservations += (double) totalObservations[i][j];
                            if (SINGLEALLELE){
                                // Overwritten each iteration: final value is the
                                // overall in-phase fraction, not a log likelihood.
                                likelihood = sumInPhase / sumObservations;
                            }else{
                                // Binomial-style log10 score with 1% error model.
                                likelihood += numInPhase * L_correct + numOutOfPhase * L_err;
                            }
                            //prob = Math.max((double) numInPhase / (double) totalObservations[i][j], 0.0001);
                            //likelihood += Math.log10(prob);
                            //likelihood = Math.max(Math.log10(sumInPhase / sumObservations),-10);
                            if (PRINTDEBUG){
                                out.printf("DEBUG %s %s %s[%s%s] %s[%s%s]\t[%s,%s]\t[%s,%s] [%s,%s]\t%s / %s\t%s / %s\t %.2f\n",HLAnames[alleleIndex1],HLAnames[alleleIndex2],PolymorphicSites[i],c11,c21,PolymorphicSites[j],c12,c22, i,j,a1,b1,a2,b2,numInPhase,totalObservations[i][j],sumInPhase,sumObservations,likelihood);
                            }
                            // NOTE(review): stops after the FIRST observed partner
                            // site j for each i -- presumably the "more efficient
                            // allele comparison" optimization; confirm it is not
                            // meant to scan all j pairs.
                            break;
                        }
                    }
                }
            }
        }
    }
    return likelihood;
}
private void ExtraCode(){
    // NOTE(review): named "ExtraCode" and not called from any method visible
    // in this file -- appears to be a retained earlier strategy for choosing
    // AllelesToSearch (now supplied by SimilarityFileReader). Confirm before removing.
    String name1, name2;
    //Pre-process homozygous combinations to determine top possible alleles (for efficiency)
    Hashtable Alleles2Digit = new Hashtable();
    Hashtable Phase2Digit = new Hashtable();
    Hashtable Count2Digit = new Hashtable();
    Hashtable AllelesAtLocus = new Hashtable();
    ArrayList Loci = new ArrayList<String>();
    double[] AlleleLikelihoods2 = new double[HLAnames.length];
    double[] PhaseLikelihoods2 = new double[HLAnames.length];
    for (int i = 0; i < HLAnames.length; i++){
        name1 = HLAnames[i].substring(4);
        String [] n1 = name1.split("\\*");
        // Homozygous (i,i) scores; SINGLEALLELE=true for the phase term.
        AlleleLikelihoods2[i] = CalculateAlleleLikelihood(i,i,HLAreads,false);
        PhaseLikelihoods2[i] = CalculatePhaseLikelihood(i,i,false,true);
        if (AlleleLikelihoods2[i] < 0){
            // NOTE(review): keyed by locus*4digits despite the "2Digit" names.
            name2 = n1[0] + "*" + n1[1].substring(0, 4);
            if (!Loci.contains(n1[0])){
                Loci.add(n1[0]);
                MaxLikelihoods.put(n1[0], 0.0);
                AllelesAtLocus.put(n1[0], 1);
            }else{
                AllelesAtLocus.put(n1[0], 1+(Integer)AllelesAtLocus.get(n1[0]));
            }
            // Keep the best homozygous scores and a count per truncated name.
            if (!Alleles2Digit.containsKey(name2)){
                Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
                Phase2Digit.put(name2, PhaseLikelihoods2[i]);
                Count2Digit.put(name2, 1.0);
            }else {
                if (AlleleLikelihoods2[i] > (Double) Alleles2Digit.get(name2)){
                    Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
                }
                if (PhaseLikelihoods2[i] > (Double) Phase2Digit.get(name2)){
                    Phase2Digit.put(name2, PhaseLikelihoods2[i]);
                }
                Count2Digit.put(name2,1.0+(Double)Count2Digit.get(name2));
            }
        }
    }
    //Sort alleles at 2 digit resolution for each locus
    for (int i = 0; i < Loci.size(); i++){
        Enumeration k = Alleles2Digit.keys();
        Hashtable AllelesAtLoci = new Hashtable();
        HashMap map = new HashMap();
        int numalleles = 0;
        //find alleles at the locus
        while( k.hasMoreElements() ){
            name1 = k.nextElement().toString();
            String [] n1 = name1.split("\\*");
            if (Loci.get(i).equals(n1[0])){
                numalleles++;
                // Negated so an ascending sort puts the best allele first.
                map.put(name1,-1 * (Double) Alleles2Digit.get(name1));
                AllelesAtLoci.put(-1 * (Double) Alleles2Digit.get(name1), name1);
                //out.printf("%s\t%.2f\n",name1,-1 * (Double) Alleles2Digit.get(name1));
            }
        }
        //Sort alleles at locus, mark top six 2-digit classes for deep search
        List<Map.Entry<String, Double>> entries = new ArrayList<Entry<String, Double>>(map.entrySet());
        Collections.sort(entries, new Comparator<Entry<String, Double>>() {
            public int compare(Entry<String, Double> e1, Entry<String, Double> e2) {
                return e1.getValue().compareTo(e2.getValue());
            }
        });
        int num = 1;
        for (Map.Entry<String, Double> entry : entries) {
            // Search the top max(5, classes/8) entries at this locus.
            if (num <= Math.max(5,entries.size()/8)){
                AllelesToSearch.add(entry.getKey());
                out.printf("INFO\t%s\t%.2f\t%.2f\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey()));
                num++;
            }else{
                if (!AllelesToSearch.contains(entry.getKey())){
                    out.printf("INFO\t%s\t%.2f\t%.2f\tNotSearched\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey()));
                }else{
                    out.printf("INFO\t%s\t%.2f\t%.2f\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey()));
                }
            }
        }
        out.printf("INFO\n");
    }
}
}

View File

@ -0,0 +1,79 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import java.io.*;
import java.util.ArrayList;
/**
*
* @author shermanjia
*/
/**
 * Reads the tab-delimited HLA dictionary file (columns: name, start, stop,
 * sequence) and exposes the parsed alleles plus their overall coordinate span.
 *
 * BUGFIX: minstartpos previously defaulted to 0, so Math.min() during parsing
 * could never decrease it and GetMinStartPos() always returned 0 for positive
 * genomic coordinates. It now starts at Integer.MAX_VALUE (still returned
 * unchanged if no file has been read -- callers should read a file first).
 */
public class HLAFileReader {
    ArrayList<String> Sequences = new ArrayList<String>();
    ArrayList<String> Names = new ArrayList<String>();
    ArrayList<Integer> StartPositions = new ArrayList<Integer>();
    ArrayList<Integer> StopPositions = new ArrayList<Integer>();
    int minstartpos = Integer.MAX_VALUE;  // smallest start seen across all alleles
    int maxstoppos = 0;                   // largest stop seen across all alleles
    CigarParser formatter = new CigarParser();

    /** @return allele names, in file order. */
    public String[] GetNames(){
        return Names.toArray(new String[Names.size()]);
    }

    /** @return allele sequences, parallel to GetNames(). */
    public String[] GetSequences(){
        return Sequences.toArray(new String[Sequences.size()]);
    }

    /** @return allele start coordinates, parallel to GetNames(). */
    public Integer[] GetStartPositions(){
        return StartPositions.toArray(new Integer[StartPositions.size()]);
    }

    /** @return allele stop coordinates, parallel to GetNames(). */
    public Integer[] GetStopPositions(){
        return StopPositions.toArray(new Integer[StopPositions.size()]);
    }

    /** @return the smallest start position across all loaded alleles. */
    public Integer GetMinStartPos(){
        return minstartpos;
    }

    /** @return the largest stop position across all loaded alleles. */
    public Integer GetMaxStopPos(){
        return maxstoppos;
    }

    /** @return the index of the named allele, or -1 if not loaded. */
    public int GetIndex(String readname){
        // indexOf already returns -1 when absent; one scan instead of two.
        return Names.indexOf(readname);
    }

    /**
     * Parses the dictionary file line by line. Malformed input or I/O errors
     * are reported to stderr (matching the original best-effort behavior);
     * the reader is always closed.
     */
    public void ReadFile(String filename){
        BufferedReader br = null;
        try{
            br = new BufferedReader(new FileReader(filename));
            String strLine;
            while ((strLine = br.readLine()) != null) {
                String[] s = strLine.split("\\t");
                Sequences.add(s[3]);
                Names.add(s[0]);
                int start = Integer.valueOf(s[1]);
                int stop = Integer.valueOf(s[2]);
                StartPositions.add(start);
                StopPositions.add(stop);
                minstartpos = Math.min(minstartpos, start);
                maxstoppos = Math.max(maxstoppos, stop);
            }
        }catch (Exception e){
            System.err.println("HLAFileReader Error: " + e.getMessage());
        }finally{
            if (br != null){
                try { br.close(); } catch (IOException ignored) { /* best-effort close */ }
            }
        }
    }
}

View File

@ -24,6 +24,14 @@ public class PolymorphicSitesFileReader {
return NonPolymorphicSites.toArray(new Integer[NonPolymorphicSites.size()]);
}
public void AddSites(Integer [] sites){
for (int i = 0; i < sites.length; i++){
if (!PolymorphicSites.contains(sites[i])){
PolymorphicSites.add(sites[i]);
}
}
}
public void ReadFile(String filename){
try{
FileInputStream fstream = new FileInputStream(filename);
@ -34,10 +42,10 @@ public class PolymorphicSitesFileReader {
int i = 0;
while ((strLine = br.readLine()) != null) {
s = strLine.split("\\t");
if (s[0].equals("POLYMORPHIC")){
PolymorphicSites.add(Integer.valueOf(s[2]));
if (Double.valueOf(s[8]) > 0.1){
PolymorphicSites.add(Integer.valueOf(s[0]));
}else{
NonPolymorphicSites.add(Integer.valueOf(s[2]));
NonPolymorphicSites.add(Integer.valueOf(s[0]));
}
}
in.close();

View File

@ -14,6 +14,9 @@ import java.util.Hashtable;
*/
public class SimilarityFileReader {
ArrayList<String> ReadsToDiscard = new ArrayList<String>();
ArrayList<String> AllelesToSearch = new ArrayList<String>();
Hashtable AlleleCount = new Hashtable();
Hashtable LocusCount = new Hashtable();
Hashtable Concordance = new Hashtable();
Hashtable NumMatches = new Hashtable();
Hashtable NumMismatches = new Hashtable();
@ -22,10 +25,22 @@ public class SimilarityFileReader {
return ReadsToDiscard;
}
public ArrayList<String> GetAllelesToSearch(){
return AllelesToSearch;
}
public String[] GetReadsToDiscardArray(){
return ReadsToDiscard.toArray(new String[ReadsToDiscard.size()]);
}
public Hashtable GetAlleleCount(){
return AlleleCount;
}
public Hashtable GetLocusCount(){
return LocusCount;
}
public Hashtable GetConcordance(){
return Concordance;
}
@ -43,26 +58,56 @@ public class SimilarityFileReader {
FileInputStream fstream = new FileInputStream(filename);
DataInputStream in = new DataInputStream(fstream);
BufferedReader br = new BufferedReader(new InputStreamReader(in));
String strLine; String [] s = null;
String strLine; String [] s = null, alleles = null, a = null; String allele;
//Read File Line By Line
int i = 0;
while ((strLine = br.readLine()) != null) {
s = strLine.split("\\t");
if (s.length >= 6){
Double matchFraction = Double.valueOf(s[4]);
int numMismatches = Integer.valueOf(s[6]);
Double matchFraction = Double.valueOf(s[3]);
int numMismatches = Integer.valueOf(s[5]);
int numMatches = Integer.valueOf(s[4]);
Concordance.put(s[0],matchFraction);
NumMatches.put(s[0], s[5]);
NumMatches.put(s[0], s[4]);
NumMismatches.put(s[0], numMismatches);
if ((matchFraction < 0.9 && numMismatches > 3) || (numMismatches > minAllowedMismatches)){
if ((matchFraction < 0.8 && numMismatches > 3) || (numMismatches > minAllowedMismatches) || numMatches < 10){
ReadsToDiscard.add(s[0]);
}else{
Hashtable fourDigitAlleles = new Hashtable();
alleles = s[6].split("\\,");
if (alleles.length > 0){
a = alleles[0].split("\\_");
s = a[1].split("\\*");
if (!LocusCount.containsKey(s[0])){
LocusCount.put(s[0], 1);
}else{
LocusCount.put(s[0], (Integer) LocusCount.get(s[0]) + 1);
}
}
for (int j = 0; j < alleles.length; j++){
a = alleles[j].split("\\_");
s = a[1].split("\\*");
allele = s[0] + "*" + s[1].substring(0,4);
if (!fourDigitAlleles.containsKey(allele)){
fourDigitAlleles.put(allele, allele);
if (!AlleleCount.containsKey(allele)){
AlleleCount.put(allele, 1);
}else{
AlleleCount.put(allele, (Integer) AlleleCount.get(allele) + 1);
}
if ((Integer) AlleleCount.get(allele) > 1 && !AllelesToSearch.contains(allele)){
AllelesToSearch.add(allele);
}
}
}
}
}
}
in.close();
}catch (Exception e){//Catch exception if any
System.err.println("SimilarityFile Error: " + e.getMessage());
//System.err.println("SimilarityFile Error: " + e.getMessage());
}
}
}