HLA Caller 2.0 Walkers:
CalculateBaseLikelihoodsWalker.java walks through reads and calculates likelihoods using SSG at each base position. CalculateAlleleLikelihoodsWalker.java walks through the HLA dictionary and calculates likelihoods for allele pairs given the output of CalculateBaseLikelihoodsWalker.java. CalculatePhaseLikelihoodsWalker.java walks through reads and calculates likelihood scores for allele pairs given phase information. File Readers: BaseLikelihoodsFileReader.java reads a text file of likelihoods output by SSG. FrequencyFileReader.java reads a text file of HLA allele frequencies. PolymorphicSitesFileReader.java reads a text file of polymorphic sites in the HLA dictionary. SAMFileReader.java reads a SAM file (used to read the HLA dictionary from within another walker). SimilarityFileReader.java reads a text file describing how similar each read is to the closest HLA allele (used to filter misaligned reads). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1744 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
281a77c981
commit
9b78a789e2
|
|
@ -0,0 +1,102 @@
|
|||
/*
|
||||
* To change this template, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
/**
|
||||
*
|
||||
* @author shermanjia
|
||||
*/
|
||||
public class BaseLikelihoodsFileReader {
|
||||
double[][] baseLikelihoods;
|
||||
int[] positions;
|
||||
ArrayList<Integer> polymorphicSites = new ArrayList<Integer>();
|
||||
|
||||
public Integer[] GetPolymorphicSites(){
|
||||
return polymorphicSites.toArray(new Integer[polymorphicSites.size()]);
|
||||
}
|
||||
|
||||
public double[][] GetBaseLikelihoods(){
|
||||
return baseLikelihoods;
|
||||
}
|
||||
|
||||
public int[] GetPositions(){
|
||||
return positions;
|
||||
}
|
||||
|
||||
public void ReadFile(String filename, boolean findPolymorphicSites){
|
||||
try{
|
||||
//System.out.printf("INFO Reading base likelihoods file ... ");
|
||||
FileInputStream fstream = new FileInputStream(filename);
|
||||
DataInputStream in = new DataInputStream(fstream);
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(in));
|
||||
String strLine; String [] s = null, pos = null;
|
||||
//Determine size of file
|
||||
int n = 0; char ref;
|
||||
while ((strLine = br.readLine()) != null) {
|
||||
if (strLine.indexOf("INFO") == -1){n++;}
|
||||
}
|
||||
|
||||
baseLikelihoods = new double[n][10];
|
||||
positions = new int[n];
|
||||
double[] localLikelihoods = new double[10];
|
||||
|
||||
|
||||
//System.out.printf("%s lines of data found ... ",n);
|
||||
in.close();
|
||||
//Read and store data
|
||||
|
||||
fstream = new FileInputStream(filename);
|
||||
in = new DataInputStream(fstream);
|
||||
br = new BufferedReader(new InputStreamReader(in));
|
||||
n = 0;
|
||||
while ((strLine = br.readLine()) != null) {
|
||||
if (strLine.indexOf("INFO") == -1){
|
||||
s = strLine.split("\\t");
|
||||
pos = s[0].split(":");
|
||||
ref = s[1].charAt(0);
|
||||
positions[n] = Integer.valueOf(pos[1]);
|
||||
for (int i = 3; i <= 12; i++){
|
||||
baseLikelihoods[n][i-3] = Double.valueOf(s[i]);
|
||||
localLikelihoods[i-3] = baseLikelihoods[n][i-3];
|
||||
}
|
||||
if (findPolymorphicSites){
|
||||
if (IsPolymorphicSite(localLikelihoods,ref)){
|
||||
polymorphicSites.add(positions[n]);
|
||||
}
|
||||
}
|
||||
n++;
|
||||
}
|
||||
}
|
||||
|
||||
}catch (Exception e){//Catch exception if any
|
||||
System.err.println("BaseLikelihoodsFileReader Error: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
private int IndexOf(char c){
|
||||
switch(c){
|
||||
case 'A': return 0;
|
||||
case 'C': return 4;
|
||||
case 'G': return 7;
|
||||
case 'T': return 9;
|
||||
default: return -1;
|
||||
}
|
||||
}
|
||||
|
||||
private boolean IsPolymorphicSite(double[] likelihoods, char ref){
|
||||
boolean isPolymorphicSite = false;
|
||||
double homreflikelihood = likelihoods[IndexOf(ref)];
|
||||
for (int i = 0; i < 10; i++){
|
||||
if (likelihoods[i] > homreflikelihood){
|
||||
isPolymorphicSite = true;
|
||||
}
|
||||
}
|
||||
return isPolymorphicSite;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,209 @@
|
|||
/*
|
||||
* To change this template, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Hashtable;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author shermanjia
|
||||
*/
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Integer> {
|
||||
@Argument(fullName = "baseLikelihoods", shortName = "bl", doc = "Base likelihoods file", required = true)
|
||||
public String baseLikelihoodsFile = "";
|
||||
|
||||
@Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false)
|
||||
public boolean DEBUG = false;
|
||||
|
||||
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
||||
public boolean FREQUENT = false;
|
||||
|
||||
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
||||
public String ethnicity = "Caucasian";
|
||||
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||
String AlleleFrequencyFile;
|
||||
Hashtable AlleleFrequencies;
|
||||
|
||||
CigarParser formatter = new CigarParser();
|
||||
double[][] baseLikelihoods;
|
||||
int[] positions;
|
||||
boolean loaded = false;
|
||||
|
||||
String[] HLAnames, HLAreads;
|
||||
Integer[] HLAstartpos, HLAstoppos;
|
||||
ArrayList<String> HLAnamesAL, HLAreadsAL;
|
||||
ArrayList<Integer> HLAstartposAL, HLAstopposAL;
|
||||
|
||||
public Integer reduceInit() {
|
||||
if (!loaded){
|
||||
loaded = true;
|
||||
BaseLikelihoodsFileReader baseLikelihoodsReader = new BaseLikelihoodsFileReader();
|
||||
baseLikelihoodsReader.ReadFile(baseLikelihoodsFile, false);
|
||||
baseLikelihoods = baseLikelihoodsReader.GetBaseLikelihoods();
|
||||
positions = baseLikelihoodsReader.GetPositions();
|
||||
|
||||
HLAnamesAL = new ArrayList<String>();
|
||||
HLAreadsAL = new ArrayList<String>();
|
||||
HLAstartposAL = new ArrayList<Integer>();
|
||||
HLAstopposAL = new ArrayList<Integer>();
|
||||
|
||||
if (ethnicity.equals("Black")){
|
||||
AlleleFrequencyFile = BlackAlleleFrequencyFile;
|
||||
}else{
|
||||
AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
|
||||
}
|
||||
out.printf("INFO Reading HLA allele frequencies ... ");
|
||||
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
|
||||
HLAfreqReader.ReadFile(AlleleFrequencyFile);
|
||||
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
|
||||
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
|
||||
|
||||
out.printf("INFO Reading HLA dictionary ...");
|
||||
|
||||
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer map(char[] ref, SAMRecord read) {
|
||||
HLAnamesAL.add(read.getReadName());
|
||||
HLAreadsAL.add(formatter.FormatRead(read.getCigarString(), read.getReadString()));
|
||||
HLAstartposAL.add(read.getAlignmentStart());
|
||||
HLAstopposAL.add(read.getAlignmentEnd());
|
||||
return 1;
|
||||
}
|
||||
|
||||
private int GenotypeIndex(char a, char b){
|
||||
switch(a){
|
||||
case 'A':
|
||||
switch(b){
|
||||
case 'A': return 0;
|
||||
case 'C': return 1;
|
||||
case 'G': return 2;
|
||||
case 'T': return 3;
|
||||
};
|
||||
case 'C':
|
||||
switch(b){
|
||||
case 'A': return 1;
|
||||
case 'C': return 4;
|
||||
case 'G': return 5;
|
||||
case 'T': return 6;
|
||||
};
|
||||
case 'G':
|
||||
switch(b){
|
||||
case 'A': return 2;
|
||||
case 'C': return 5;
|
||||
case 'G': return 7;
|
||||
case 'T': return 8;
|
||||
};
|
||||
case 'T':
|
||||
switch(b){
|
||||
case 'A': return 3;
|
||||
case 'C': return 6;
|
||||
case 'G': return 8;
|
||||
case 'T': return 9;
|
||||
};
|
||||
default: return -1;
|
||||
}
|
||||
}
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer numreads) {
|
||||
out.printf("Done! %s alleles found\n", numreads);
|
||||
HLAnames = HLAnamesAL.toArray(new String[numreads]);
|
||||
HLAreads = HLAreadsAL.toArray(new String[numreads]);
|
||||
HLAstartpos = HLAstartposAL.toArray(new Integer[numreads]);
|
||||
HLAstoppos = HLAstopposAL.toArray(new Integer[numreads]);
|
||||
|
||||
double[][] AlleleLikelihoods = new double[numreads][numreads];
|
||||
|
||||
String name1, name2;
|
||||
double frq1, frq2;
|
||||
|
||||
|
||||
double minfrq = 0;
|
||||
if (FREQUENT){
|
||||
minfrq = 0.0001;
|
||||
}
|
||||
int numcombinations = 0;
|
||||
out.printf("NUM\tAllele1\tAllele2\tSSG\n");
|
||||
|
||||
for (int i = 0; i < numreads; i++){
|
||||
name1 = HLAnames[i].substring(4);
|
||||
if (AlleleFrequencies.containsKey(name1)){
|
||||
frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
|
||||
if (frq1 > minfrq){
|
||||
for (int j = i; j < numreads; j++){
|
||||
name2 = HLAnames[j].substring(4);
|
||||
if (AlleleFrequencies.containsKey(name2)){
|
||||
frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
|
||||
if (frq2 > minfrq){
|
||||
if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){
|
||||
numcombinations++;
|
||||
AlleleLikelihoods[i][j] = CalculateLikelihood(i,j);
|
||||
out.printf("%s\t%s\t%s\t%.2f\n",numcombinations,name1,name2,AlleleLikelihoods[i][j]);
|
||||
}
|
||||
}else{
|
||||
if (DEBUG){out.printf("%s has allele frequency%.5f\n",name2,frq2);}
|
||||
}
|
||||
}else{
|
||||
if (DEBUG){out.printf("%s not found in allele frequency file\n",name2);}
|
||||
}
|
||||
}
|
||||
}else{
|
||||
if (DEBUG){out.printf("%s has allele frequency%.5f\n",name1,frq1);}
|
||||
}
|
||||
}else{
|
||||
if (DEBUG){out.printf("%s not found in allele frequency file\n",name1);}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private double CalculateLikelihood(int a1, int a2){
|
||||
String read1 = HLAreads[a1];
|
||||
String read2 = HLAreads[a2];
|
||||
int start1 = HLAstartpos[a1];
|
||||
int start2 = HLAstartpos[a2];
|
||||
int stop1 = HLAstoppos[a1];
|
||||
int stop2 = HLAstoppos[a2];
|
||||
double likelihood = 0;
|
||||
int pos, index;
|
||||
char c1, c2;
|
||||
|
||||
|
||||
for (int i = 0; i < positions.length; i++){
|
||||
pos = positions[i];
|
||||
if (pos < stop1 && pos > start1 && pos < stop2 && pos > start2){
|
||||
index = GenotypeIndex(read1.charAt(pos-start1),read2.charAt(pos-start2));
|
||||
if (index > -1){
|
||||
likelihood = likelihood + baseLikelihoods[i][index];
|
||||
}else{
|
||||
/*if (DEBUG){
|
||||
c1 = read1.charAt(pos-start1);
|
||||
c2 = read2.charAt(pos-start2);
|
||||
out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%.2f\n",HLAnames[a1],HLAnames[a2],pos,c1,c2,index,likelihood);
|
||||
}*/
|
||||
}
|
||||
}
|
||||
}
|
||||
return likelihood;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,163 @@
|
|||
/*
|
||||
* To change this template, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Hashtable;
|
||||
import java.util.List;
|
||||
/**
|
||||
*
|
||||
* @author shermanjia
|
||||
*/
|
||||
public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
||||
@Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false)
|
||||
public boolean DEBUG = false;
|
||||
@Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false)
|
||||
public String inputAlleles = "";
|
||||
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
||||
public String ethnicity = "Caucasian";
|
||||
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
|
||||
public String filterFile = "";
|
||||
|
||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||
String AlleleFrequencyFile;
|
||||
|
||||
SAMFileReader HLADictionaryReader = new SAMFileReader();
|
||||
String[] HLAreads, HLAnames;
|
||||
Integer[] HLAstartpos, HLAstoppos;
|
||||
Hashtable AlleleFrequencies;
|
||||
|
||||
int[][] LOD, LikelihoodScores;
|
||||
ArrayList<String> ReadsToDiscard = new ArrayList<String>();
|
||||
|
||||
ArrayList<SAMRecord> AllReads = new ArrayList<SAMRecord>();
|
||||
ArrayList<String> AllReadNames = new ArrayList<String>();
|
||||
|
||||
boolean HLAdataLoaded = false;
|
||||
|
||||
|
||||
//Loads HLA dictionary, allele frequencies, and reads to filter
|
||||
public Pair<Long, Long> reduceInit() {
|
||||
if (!HLAdataLoaded){
|
||||
HLAdataLoaded = true;
|
||||
|
||||
out.printf("INFO Reading HLA database ... ");
|
||||
HLADictionaryReader.ReadFile(HLAdatabaseFile);
|
||||
HLAreads = HLADictionaryReader.GetReads();
|
||||
HLAnames = HLADictionaryReader.GetReadNames();
|
||||
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
||||
HLAstoppos = HLADictionaryReader.GetStopPositions();
|
||||
InitializeVariables(HLAreads.length);
|
||||
out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);
|
||||
|
||||
|
||||
|
||||
if (ethnicity.equals("Black")){
|
||||
AlleleFrequencyFile = BlackAlleleFrequencyFile;
|
||||
}else{
|
||||
AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
|
||||
}
|
||||
out.printf("INFO Reading HLA allele frequencies ... ");
|
||||
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
|
||||
HLAfreqReader.ReadFile(AlleleFrequencyFile);
|
||||
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
|
||||
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
|
||||
|
||||
|
||||
if (!filterFile.equals("")){
|
||||
out.printf("INFO Reading properties file ... ");
|
||||
SimilarityFileReader similarityReader = new SimilarityFileReader();
|
||||
similarityReader.ReadFile(filterFile);
|
||||
ReadsToDiscard = similarityReader.GetReadsToDiscard();
|
||||
out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size());
|
||||
}
|
||||
}
|
||||
return new Pair<Long,Long>(0l,0l);
|
||||
}
|
||||
|
||||
|
||||
|
||||
private void InitializeVariables(int n){
|
||||
LOD = new int[n][n];
|
||||
LikelihoodScores = new int[n][n];
|
||||
for (int i = 0; i < n; i++){
|
||||
|
||||
for (int j = 0; j <n; j++){
|
||||
LOD[i][j]=0;
|
||||
LikelihoodScores[i][j]=0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
if(reads.size() > 0 ) {
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
|
||||
int numAs = 0, numCs = 0, numGs = 0, numTs = 0;
|
||||
//if (DEBUG){
|
||||
out.printf("%s\t%s\t", context.getLocation(),ref.getBase());
|
||||
//}
|
||||
|
||||
//Calculate posterior probabilities
|
||||
GenotypeLikelihoods G = new ThreeStateErrorGenotypeLikelihoods();
|
||||
SAMRecord read; int offset; char base; byte qual; int mapquality; String readname;
|
||||
|
||||
//Check for bad bases and ensure mapping quality
|
||||
for (int i = 0; i < reads.size(); i++) {
|
||||
read = reads.get(i);
|
||||
offset = offsets.get(i);
|
||||
base = read.getReadString().charAt(offset);
|
||||
qual = read.getBaseQualities()[offset];
|
||||
//mapquality = read.getMappingQuality();
|
||||
if (!ReadsToDiscard.contains(read.getReadName()) && BaseUtils.simpleBaseToBaseIndex(base) != -1) {
|
||||
|
||||
//consider base in likelihood calculations if it looks good and has high mapping score
|
||||
G.add(base, qual, read, offset);
|
||||
if (DEBUG){
|
||||
if (base == 'A'){numAs++;}
|
||||
else if (base == 'C'){numCs++;}
|
||||
else if (base == 'T'){numTs++;}
|
||||
else if (base == 'G'){numGs++;}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
//if (DEBUG) {
|
||||
out.printf("A[%s]C[%s]T[%s]G[%s]",numAs,numCs,numTs,numGs);
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
out.printf("\t%.2f",G.getLikelihood(g));
|
||||
}
|
||||
out.printf("\n");
|
||||
//}
|
||||
|
||||
}
|
||||
return context.getReads().size();
|
||||
}
|
||||
|
||||
public Pair<Long, Long> reduce(Integer value, Pair<Long, Long> sum) {
|
||||
long left = value.longValue() + sum.getFirst();
|
||||
long right = sum.getSecond() + 1l;
|
||||
return new Pair<Long,Long>(left, right);
|
||||
}
|
||||
|
||||
public void onTraversalDone(Pair<Long, Long> result) {
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,314 @@
|
|||
/*
|
||||
* To change this template, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Hashtable;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author shermanjia
|
||||
*/
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer> {
|
||||
@Argument(fullName = "baseLikelihoods", shortName = "bl", doc = "Base likelihoods file", required = true)
|
||||
public String baseLikelihoodsFile = "";
|
||||
|
||||
@Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false)
|
||||
public boolean DEBUG = false;
|
||||
|
||||
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
|
||||
public String filterFile = "";
|
||||
|
||||
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
||||
public String ethnicity = "Caucasian";
|
||||
|
||||
@Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false)
|
||||
public String debugAlleles = "";
|
||||
|
||||
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
||||
public boolean ONLYFREQUENT = false;
|
||||
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||
String AlleleFrequencyFile;
|
||||
|
||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
|
||||
SAMFileReader HLADictionaryReader = new SAMFileReader();
|
||||
boolean HLAdataLoaded = false;
|
||||
String[] HLAnames, HLAreads;
|
||||
ArrayList<String> ReadsToDiscard;
|
||||
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites;
|
||||
|
||||
int[][] numObservations, totalObservations;
|
||||
int[] SNPnumInRead, SNPposInRead;
|
||||
CigarParser cigarparser = new CigarParser();
|
||||
Hashtable AlleleFrequencies;
|
||||
|
||||
public Integer reduceInit() {
|
||||
|
||||
if (!HLAdataLoaded){
|
||||
HLAdataLoaded = true;
|
||||
//Load HLA dictionary
|
||||
out.printf("INFO Loading HLA dictionary ... ");
|
||||
|
||||
HLADictionaryReader.ReadFile(HLAdatabaseFile);
|
||||
HLAreads = HLADictionaryReader.GetReads();
|
||||
HLAnames = HLADictionaryReader.GetReadNames();
|
||||
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
||||
HLAstoppos = HLADictionaryReader.GetStopPositions();
|
||||
out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);
|
||||
|
||||
if (!filterFile.equals("")){
|
||||
out.printf("INFO Reading properties file ... ");
|
||||
SimilarityFileReader similarityReader = new SimilarityFileReader();
|
||||
similarityReader.ReadFile(filterFile);
|
||||
ReadsToDiscard = similarityReader.GetReadsToDiscard();
|
||||
out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size());
|
||||
}
|
||||
|
||||
|
||||
//Reading base likelihoods
|
||||
if (!baseLikelihoodsFile.equals("")){
|
||||
//Get only sites found to be different from reference
|
||||
out.printf("INFO Reading base likelihoods file ... ");
|
||||
BaseLikelihoodsFileReader baseLikelihoodsReader = new BaseLikelihoodsFileReader();
|
||||
baseLikelihoodsReader.ReadFile(baseLikelihoodsFile, true);
|
||||
PolymorphicSites = baseLikelihoodsReader.GetPolymorphicSites();
|
||||
out.printf("%s polymorphic sites found\n",PolymorphicSites.length);
|
||||
}else{
|
||||
//use all sites in class 1 HLA
|
||||
out.printf("INFO Using all positions in the classical HLA genes ... ");
|
||||
PolymorphicSites = InitializePolymorphicSites();
|
||||
}
|
||||
int l = PolymorphicSites.length;
|
||||
SNPnumInRead = new int[l];
|
||||
SNPposInRead = new int[l];
|
||||
numObservations = new int[l*5][l*5];
|
||||
totalObservations = new int[l][l];
|
||||
|
||||
//Read allele frequencies
|
||||
if (ethnicity.equals("Black")){
|
||||
AlleleFrequencyFile = BlackAlleleFrequencyFile;
|
||||
}else{
|
||||
AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
|
||||
}
|
||||
out.printf("INFO Reading HLA allele frequencies ... ");
|
||||
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
|
||||
HLAfreqReader.ReadFile(AlleleFrequencyFile);
|
||||
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
|
||||
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
|
||||
|
||||
/*
|
||||
PolymorphicSites = FindPolymorphicSites(HLADictionaryReader.GetMinStartPos(),HLADictionaryReader.GetMaxStopPos());
|
||||
*/
|
||||
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
public Integer map(char[] ref, SAMRecord read) {
|
||||
if (!ReadsToDiscard.contains(read.getReadName())){
|
||||
UpdateCorrelation(read);
|
||||
}else{
|
||||
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer numreads) {
|
||||
String name1, name2;
|
||||
Double frq1, frq2, likelihood, minfrq = 0.0;
|
||||
int numCombinations = 0;
|
||||
|
||||
|
||||
if (!debugAlleles.equals("")){
|
||||
String s[] = debugAlleles.split(",");
|
||||
int index1 = HLADictionaryReader.GetReadIndex(s[0]);
|
||||
int index2 = HLADictionaryReader.GetReadIndex(s[1]);
|
||||
if (index1 > -1 && index2 > -1){
|
||||
likelihood = CalculatePhaseLikelihood(index1,index2,true);
|
||||
}
|
||||
}
|
||||
|
||||
if (ONLYFREQUENT){
|
||||
minfrq = 0.0001;
|
||||
}
|
||||
|
||||
out.printf("NUM\tAllele1\tAllele2\tPhase\tFrq1\tFrq2\n");
|
||||
for (int i = 0; i < HLAnames.length; i++){
|
||||
name1 = HLAnames[i].substring(4);
|
||||
if (AlleleFrequencies.containsKey(name1)){
|
||||
frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString());
|
||||
if (frq1 > minfrq){
|
||||
for (int j = i; j < HLAnames.length; j++){
|
||||
name2 = HLAnames[j].substring(4);
|
||||
if (name1.substring(0,1).equals(name2.substring(0,1))){
|
||||
if (AlleleFrequencies.containsKey(name2)){
|
||||
frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString());
|
||||
if (frq2 > minfrq){
|
||||
likelihood = CalculatePhaseLikelihood(i,j,false);
|
||||
numCombinations++;
|
||||
out.printf("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n",numCombinations,name1,name2,likelihood,Math.log10(frq1),Math.log10(frq2));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Integer[] InitializePolymorphicSites(){
|
||||
int HLA_A_start = 30018310, HLA_A_end = 30021211, num_A_positions = HLA_A_end - HLA_A_start + 1;
|
||||
int HLA_B_start = 31430239, HLA_B_end = 31432914, num_B_positions = HLA_B_end - HLA_B_start + 1;
|
||||
int HLA_C_start = 31344925, HLA_C_end = 31347827, num_C_positions = HLA_C_end - HLA_C_start + 1;
|
||||
Integer[] polymorphicSites = new Integer[num_A_positions+num_B_positions+num_C_positions];
|
||||
for (int i = 0; i < num_A_positions; i++){
|
||||
polymorphicSites[i]=HLA_A_start + i;
|
||||
}
|
||||
for (int i = 0; i < num_C_positions; i++){
|
||||
polymorphicSites[i+num_A_positions]=HLA_C_start + i;
|
||||
}
|
||||
for (int i = 0; i < num_B_positions; i++){
|
||||
polymorphicSites[i+num_A_positions+num_C_positions]=HLA_B_start + i;
|
||||
}
|
||||
return polymorphicSites;
|
||||
}
|
||||
|
||||
private int IndexOf(char c){
|
||||
switch(c){
|
||||
case 'A': return 0;
|
||||
case 'C': return 1;
|
||||
case 'G': return 2;
|
||||
case 'T': return 3;
|
||||
//case 'D': return 4;
|
||||
default: return -1;
|
||||
}
|
||||
}
|
||||
|
||||
private void UpdateCorrelation(SAMRecord read){
|
||||
//Updates correlation table with SNPs from specific read (for phasing)
|
||||
String s = cigarparser.FormatRead(read.getCigarString(), read.getReadString());
|
||||
ArrayList<Integer> SNPsInRead = new ArrayList<Integer>();
|
||||
ArrayList<Integer> readindex = new ArrayList<Integer>();
|
||||
|
||||
int readstart = read.getAlignmentStart();
|
||||
int readend = read.getAlignmentEnd();
|
||||
int numPositions = PolymorphicSites.length;
|
||||
char c1, c2;
|
||||
int a, b, i, j, SNPcount = 0;
|
||||
|
||||
//Find all SNPs in read
|
||||
for (i = 0; i < numPositions; i++){
|
||||
if (PolymorphicSites[i] > readstart && PolymorphicSites[i] < readend){
|
||||
SNPnumInRead[i] = SNPcount;
|
||||
SNPposInRead[i] = PolymorphicSites[i]-readstart;
|
||||
SNPcount++;
|
||||
}else{
|
||||
SNPnumInRead[i] = -1;
|
||||
SNPposInRead[i] = -1;
|
||||
}
|
||||
}
|
||||
|
||||
//Update correlation table; for each combination of SNP positions
|
||||
for (i = 0; i < numPositions; i++){
|
||||
if (SNPnumInRead[i] > -1){
|
||||
c1 = s.charAt(SNPposInRead[i]);
|
||||
if (IndexOf(c1) > -1){
|
||||
for (j = i+1; j < numPositions; j ++){
|
||||
if (SNPnumInRead[j] > -1){
|
||||
c2 = s.charAt(SNPposInRead[j]);
|
||||
if (IndexOf(c2) > -1){
|
||||
a = i*5 + IndexOf(c1);
|
||||
b = j*5 + IndexOf(c2);
|
||||
|
||||
numObservations[a][b]++;
|
||||
totalObservations[i][j]++;
|
||||
if (DEBUG){
|
||||
out.printf("DEBUG %s\t%s %s\t[i=%s,j=%s]\t[%s,%s]\t[%s,%s]\n",read.getReadName(),PolymorphicSites[i],PolymorphicSites[j],i,j,c1,c2,a,b);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private double CalculatePhaseLikelihood(int alleleIndex1, int alleleIndex2, boolean PRINTDEBUG){
|
||||
//calculate the likelihood that the particular combination of alleles satisfies the phase count data
|
||||
double likelihood = 0, prob = 0;
|
||||
int readstart1 = HLAstartpos[alleleIndex1]; int readend1 = HLAstoppos[alleleIndex1];
|
||||
int readstart2 = HLAstartpos[alleleIndex2]; int readend2 = HLAstoppos[alleleIndex2];
|
||||
int combinedstart = Math.max(readstart1,readstart2);
|
||||
int combinedstop = Math.min(readend1,readend2);
|
||||
|
||||
int numPositions = PolymorphicSites.length, SNPcount = 0;
|
||||
int i, j, a1, a2, b1, b2;
|
||||
char c11, c12, c21, c22;
|
||||
int numInPhase = 0, numOutOfPhase;
|
||||
|
||||
|
||||
//Find all SNPs in read
|
||||
for (i = 0; i < numPositions; i++){
|
||||
if (PolymorphicSites[i] > combinedstart && PolymorphicSites[i] < combinedstop){
|
||||
SNPnumInRead[i] = SNPcount;
|
||||
SNPposInRead[i] = PolymorphicSites[i]-combinedstart;
|
||||
SNPcount++;
|
||||
}else{
|
||||
SNPnumInRead[i] = -1;
|
||||
SNPposInRead[i] = -1;
|
||||
}
|
||||
}
|
||||
|
||||
String s1 = HLAreads[alleleIndex1];
|
||||
String s2 = HLAreads[alleleIndex2];
|
||||
//Iterate through every pairwise combination of SNPs, and update likelihood for the allele combination
|
||||
for (i = 0; i < numPositions; i++){
|
||||
if (SNPnumInRead[i] > -1){
|
||||
c11 = s1.charAt(SNPposInRead[i]);
|
||||
c21 = s2.charAt(SNPposInRead[i]);
|
||||
if (IndexOf(c11) > -1 && IndexOf(c21) > -1){
|
||||
for (j = i+1; j < numPositions; j ++){
|
||||
if (SNPnumInRead[j] > -1 && totalObservations[i][j] > 0){
|
||||
c12 = s1.charAt(SNPposInRead[j]);
|
||||
c22 = s2.charAt(SNPposInRead[j]);
|
||||
if (IndexOf(c12) > -1 && IndexOf(c22) > -1){
|
||||
a1 = i*5 + IndexOf(c11);
|
||||
b1 = j*5 + IndexOf(c12);
|
||||
a2 = i*5 + IndexOf(c21);
|
||||
b2 = j*5 + IndexOf(c22);
|
||||
//check if the two alleles are identical at the chosen 2 locations
|
||||
if ((c11 == c21) && (c12 == c22)){
|
||||
numInPhase = numObservations[a1][b1];
|
||||
}else{
|
||||
numInPhase = numObservations[a1][b1] + numObservations[a2][b2];
|
||||
}
|
||||
prob = Math.max((double) numInPhase / (double) totalObservations[i][j], 0.0001);
|
||||
likelihood += Math.log10(prob);
|
||||
|
||||
if (PRINTDEBUG){
|
||||
out.printf("DEBUG %s %s %s[%s%s] %s[%s%s]\t[%s,%s]\t[%s,%s] [%s,%s]\t%s / %s\t%.4f\t%.2f\n",HLAnames[alleleIndex1],HLAnames[alleleIndex2],PolymorphicSites[i],c11,c21,PolymorphicSites[j],c12,c22, i,j,a1,b1,a2,b2,numInPhase,totalObservations[i][j],prob,likelihood);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return likelihood;
|
||||
}
|
||||
}
|
||||
|
|
@ -6,6 +6,7 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
|
||||
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -35,21 +36,39 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
@Argument(fullName="suppressLocusPrinting",doc="Suppress printing",required=false)
|
||||
public boolean suppressPrinting = false;
|
||||
|
||||
@Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false)
|
||||
public boolean DEBUG = false;
|
||||
|
||||
@Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false)
|
||||
public String inputAlleles = "";
|
||||
|
||||
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
||||
public String ethnicity = "Caucasian";
|
||||
|
||||
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
|
||||
public String filterFile = "";
|
||||
|
||||
String al1 = "", al2 = "", al3 = "", al4 = "";
|
||||
|
||||
|
||||
//String HLAdatabaseFile = "/Users/shermanjia/Work/HLA.sam";
|
||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.4digitUnique.sam";
|
||||
//String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.4digitUnique.sam";
|
||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
|
||||
|
||||
//String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.sam";
|
||||
//String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.4digitUnique.sam";
|
||||
//String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/CEU_Founders_HLA.freq";
|
||||
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||
|
||||
String AlleleFrequencyFile;
|
||||
|
||||
|
||||
ArrayList<String> HLAreads = new ArrayList<String>();
|
||||
ArrayList<String> HLAcigars = new ArrayList<String>();
|
||||
ArrayList<String> HLAnames = new ArrayList<String>();
|
||||
ArrayList<String> HLApositions = new ArrayList<String>();
|
||||
ArrayList<String> ReadsToFilter = new ArrayList<String>();
|
||||
|
||||
ArrayList<SAMRecord> AllReads = new ArrayList<SAMRecord>();
|
||||
ArrayList<String> AllReadNames = new ArrayList<String>();
|
||||
|
|
@ -82,13 +101,27 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
|
||||
boolean DatabaseLoaded = false;
|
||||
boolean PrintedOutput = false;
|
||||
boolean DEBUG = false;
|
||||
|
||||
public Pair<Long, Long> reduceInit() {
|
||||
//Load sequences corresponding to HLA alleles from sam file
|
||||
|
||||
if (!DatabaseLoaded){
|
||||
try{
|
||||
//Load sequences corresponding to HLA alleles from sam file
|
||||
if (!inputAlleles.equals("")){
|
||||
String[] str = inputAlleles.split(",");
|
||||
al1 = str[0];
|
||||
al2 = str[1];
|
||||
al3 = str[2];
|
||||
al4 = str[3];
|
||||
}
|
||||
|
||||
//set ethnic group to look up allele frequencies
|
||||
if (ethnicity.equals("Black")){
|
||||
AlleleFrequencyFile = BlackAlleleFrequencyFile;
|
||||
}else{
|
||||
AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
|
||||
}
|
||||
|
||||
out.printf("Reading HLA database ...");
|
||||
FileInputStream fstream = new FileInputStream(HLAdatabaseFile);
|
||||
DataInputStream in = new DataInputStream(fstream);
|
||||
|
|
@ -140,23 +173,23 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
CombinedAlleleFrequencies[i][j]=0.0;
|
||||
}
|
||||
//For debugging: get index for specific alleles
|
||||
if (HLAnames.get(i).equals("HLA_A*3101"))
|
||||
if (HLAnames.get(i).equals("HLA_" + al1))
|
||||
j1 = i;
|
||||
if (HLAnames.get(i).equals("HLA_A*3201"))
|
||||
if (HLAnames.get(i).equals("HLA_" + al2))
|
||||
k1 = i;
|
||||
if (HLAnames.get(i).equals("HLA_A*3121"))
|
||||
if (HLAnames.get(i).equals("HLA_" + al3))
|
||||
j2 = i;
|
||||
if (HLAnames.get(i).equals("HLA_A*3207"))
|
||||
if (HLAnames.get(i).equals("HLA_" + al4))
|
||||
k2 = i;
|
||||
}
|
||||
out.printf("DONE! Read %s alleles\n",HLAreads.size());
|
||||
}catch (Exception e){//Catch exception if any
|
||||
System.err.println("Error: " + e.getMessage());
|
||||
System.err.println("CallHLAWalker Error: " + e.getMessage());
|
||||
}
|
||||
|
||||
try{
|
||||
out.printf("Reading allele frequences ...");
|
||||
FileInputStream fstream = new FileInputStream(CaucasianAlleleFrequencyFile);
|
||||
FileInputStream fstream = new FileInputStream(AlleleFrequencyFile);
|
||||
DataInputStream in = new DataInputStream(fstream);
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(in));
|
||||
String strLine; String [] s = null;
|
||||
|
|
@ -170,9 +203,31 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
in.close();
|
||||
out.printf("Done! Read %s alleles\n",count);
|
||||
}catch (Exception e){//Catch exception if any
|
||||
System.err.println("Error: " + e.getMessage());
|
||||
System.err.println("CallHLAWalker Error: " + e.getMessage());
|
||||
}
|
||||
|
||||
if (!filterFile.equals("")){
|
||||
try{
|
||||
out.printf("Reading reads to filter ...");
|
||||
FileInputStream fstream = new FileInputStream(filterFile);
|
||||
DataInputStream in = new DataInputStream(fstream);
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(in));
|
||||
String strLine; String [] s = null;
|
||||
//Read File Line By Line
|
||||
int count = 0;
|
||||
while ((strLine = br.readLine()) != null){
|
||||
s = strLine.split("\\t");
|
||||
if (Integer.valueOf(s[6]) > 10){
|
||||
ReadsToFilter.add(s[0]);
|
||||
}
|
||||
count++;
|
||||
}
|
||||
in.close();
|
||||
out.printf("Done! %s reads to exclude\n",count);
|
||||
}catch (Exception e){//Catch exception if any
|
||||
System.err.println("CallHLAWalker Error: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
DatabaseLoaded = true;
|
||||
out.printf("Comparing reads to database ...\n");
|
||||
|
||||
|
|
@ -283,21 +338,27 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
SAMRecord read; int offset; char base; byte qual; int mapquality; String readname;
|
||||
|
||||
//Check for bad bases and ensure mapping quality myself. This works.
|
||||
for (int i = 0; i < context.getReads().size(); i++) {
|
||||
read = context.getReads().get(i);
|
||||
offset = context.getOffsets().get(i);
|
||||
for (int i = 0; i < reads.size(); i++) {
|
||||
read = reads.get(i);
|
||||
offset = offsets.get(i);
|
||||
base = read.getReadString().charAt(offset);
|
||||
qual = read.getBaseQualities()[offset];
|
||||
mapquality = read.getMappingQuality();
|
||||
if (mapquality >= 5 && BaseUtils.simpleBaseToBaseIndex(base) != -1) {
|
||||
//consider base in likelihood calculations if it looks good and has high mapping score
|
||||
G.add(base, qual, read, offset);
|
||||
readname = read.getReadName();
|
||||
if (!AllReadNames.contains(readname)){AllReadNames.add(readname); AllReads.add(read);}
|
||||
if (base == 'A'){numAs++; depth++;}
|
||||
else if (base == 'C'){numCs++; depth++;}
|
||||
else if (base == 'T'){numTs++; depth++;}
|
||||
else if (base == 'G'){numGs++; depth++;}
|
||||
if (ReadsToFilter.contains(read.getReadName())){
|
||||
if (DEBUG){
|
||||
out.printf("\n%s %s %s %s\n",read.getReadName(),read.getAlignmentStart(),read.getAlignmentEnd(),base);
|
||||
}
|
||||
}else{
|
||||
//consider base in likelihood calculations if it looks good and has high mapping score
|
||||
G.add(base, qual, read, offset);
|
||||
readname = read.getReadName();
|
||||
if (!AllReadNames.contains(readname)){AllReadNames.add(readname); AllReads.add(read);}
|
||||
if (base == 'A'){numAs++; depth++;}
|
||||
else if (base == 'C'){numCs++; depth++;}
|
||||
else if (base == 'T'){numTs++; depth++;}
|
||||
else if (base == 'G'){numGs++; depth++;}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -402,9 +463,9 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
}
|
||||
if ( DEBUG ){
|
||||
//Debugging: print updated likelihoods for 2 sets of HLA alleles, as well as normalized likelihoods for all 10 genotypes
|
||||
out.printf("LOD[%s%s]:%5.1fL:%5.1fLOD[%s%s]:%5.1fL:%5.1f\t",r1,r2,LOD[j1][k1],LikelihoodScores[j1][k1],s1,s2,LOD[j2][k2],LikelihoodScores[j2][k2]);
|
||||
out.printf("Likelihoods %s%s[%5.1f] %s%s[%5.1f]\t",r1,r2,LikelihoodScores[j1][k1],s1,s2,LikelihoodScores[j2][k2]);
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
out.printf("%s %5.1f\t",g.toString(),Scores.get(g.toString()));
|
||||
out.printf("%s[%5.1f] ",g.toString(),Scores.get(g.toString()));
|
||||
}
|
||||
out.printf("\n");
|
||||
}
|
||||
|
|
@ -596,10 +657,6 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
return prob;
|
||||
}
|
||||
|
||||
public boolean isReduceByInterval() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public Pair<Long, Long> reduce(Integer value, Pair<Long, Long> sum) {
|
||||
long left = value.longValue() + sum.getFirst();
|
||||
long right = sum.getSecond() + 1l;
|
||||
|
|
@ -653,7 +710,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
for (Integer i = 0; i < numHLAlleles; i++){
|
||||
for (Integer j = i; j < numHLAlleles; j++){
|
||||
if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1 && maxA > 0){
|
||||
if (maxA - LOD[i][j] <= 5 && maxA >= LOD[i][j]){
|
||||
if (maxA - LOD[i][j] <= 10 && maxA >= LOD[i][j]){
|
||||
inverseMaxProbA = inverseMaxProbA + java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodA);
|
||||
if (!TopAlleles.contains(i)){TopAlleles.add(i);}
|
||||
if (!TopAlleles.contains(j)){TopAlleles.add(j);}
|
||||
|
|
@ -662,7 +719,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
}
|
||||
}
|
||||
} else if (HLAnames.get(i).indexOf("HLA_B") > -1 && HLAnames.get(j).indexOf("HLA_B") > -1 && maxB > 0){
|
||||
if (maxB - LOD[i][j] <= 5 && maxB - LOD[i][j] >= 0){
|
||||
if (maxB - LOD[i][j] <= 10 && maxB - LOD[i][j] >= 0){
|
||||
inverseMaxProbB = inverseMaxProbB + java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodB);
|
||||
if (!TopAlleles.contains(i)){TopAlleles.add(i);}
|
||||
if (!TopAlleles.contains(j)){TopAlleles.add(j);}
|
||||
|
|
@ -671,7 +728,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
}
|
||||
}
|
||||
} else if (HLAnames.get(i).indexOf("HLA_C") > -1 && HLAnames.get(j).indexOf("HLA_C") > -1 && maxC > 0){
|
||||
if (maxC - LOD[i][j] <= 5 && maxC - LOD[i][j] >= 0){
|
||||
if (maxC - LOD[i][j] <= 10 && maxC - LOD[i][j] >= 0){
|
||||
inverseMaxProbC = inverseMaxProbC + java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodC);
|
||||
if (!TopAlleles.contains(i)){TopAlleles.add(i);}
|
||||
if (!TopAlleles.contains(j)){TopAlleles.add(j);}
|
||||
|
|
@ -754,11 +811,11 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
for (Integer i = 0; i < numHLAlleles; i++){
|
||||
for (Integer j = i; j < numHLAlleles; j++){
|
||||
if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1){
|
||||
if ((LOD[i][j] >= maxA - 5) && LOD[i][j] > 0){
|
||||
if ((LOD[i][j] >= maxA - 10) && LOD[i][j] > 0){
|
||||
PhasingScores[i][j]= SinglePhaseScores[TopAlleles.indexOf(i)] * SinglePhaseScores[TopAlleles.indexOf(j)];
|
||||
if (PhasingScores[i][j] > maxAphase){maxAphase = PhasingScores[i][j];}
|
||||
alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.0001;}
|
||||
alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.0001;}
|
||||
alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.00001;}
|
||||
alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.00001;}
|
||||
SingleAlleleFrequencies[i]=freq1; SingleAlleleFrequencies[j]=freq2; CombinedAlleleFrequencies[i][j]=freq1*freq2;
|
||||
if (CombinedAlleleFrequencies[i][j] > maxAfreq){maxAfreq = CombinedAlleleFrequencies[i][j];}
|
||||
likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodA)/inverseMaxProbA;
|
||||
|
|
@ -767,11 +824,11 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
if (likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] > maxProbA){maxProbA = likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j];}
|
||||
}
|
||||
} else if (HLAnames.get(i).indexOf("HLA_B") > -1 && HLAnames.get(j).indexOf("HLA_B") > -1){
|
||||
if ((LOD[i][j] >= maxB - 5) && LOD[i][j] > 0){
|
||||
if ((LOD[i][j] >= maxB - 10) && LOD[i][j] > 0){
|
||||
PhasingScores[i][j]= SinglePhaseScores[TopAlleles.indexOf(i)] * SinglePhaseScores[TopAlleles.indexOf(j)];
|
||||
if (PhasingScores[i][j] > maxBphase){maxBphase = PhasingScores[i][j];}
|
||||
alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.0001;}
|
||||
alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.0001;}
|
||||
alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.00001;}
|
||||
alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.00001;}
|
||||
SingleAlleleFrequencies[i]=freq1; SingleAlleleFrequencies[j]=freq2; CombinedAlleleFrequencies[i][j]=freq1*freq2;
|
||||
if (freq1*freq2 > maxBfreq){maxBfreq = freq1*freq2;}
|
||||
likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodB)/inverseMaxProbB;
|
||||
|
|
@ -780,12 +837,11 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
if (likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] > maxProbB){maxProbB = likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j];}
|
||||
}
|
||||
} else if (HLAnames.get(i).indexOf("HLA_C") > -1 && HLAnames.get(j).indexOf("HLA_C") > -1){
|
||||
if ((LOD[i][j] >= maxC - 5)&& LOD[i][j] > 0){
|
||||
|
||||
if ((LOD[i][j] >= maxC - 10)&& LOD[i][j] > 0){
|
||||
PhasingScores[i][j]= SinglePhaseScores[TopAlleles.indexOf(i)] * SinglePhaseScores[TopAlleles.indexOf(j)];
|
||||
if (PhasingScores[i][j] > maxCphase){maxCphase = PhasingScores[i][j];}
|
||||
alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.0001;}
|
||||
alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.0001;}
|
||||
alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.00001;}
|
||||
alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.00001;}
|
||||
SingleAlleleFrequencies[i]=freq1; SingleAlleleFrequencies[j]=freq2; CombinedAlleleFrequencies[i][j]=freq1*freq2;
|
||||
if (freq1*freq2 > maxCfreq){maxCfreq = freq1*freq2;}
|
||||
likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodC)/inverseMaxProbC;
|
||||
|
|
@ -808,7 +864,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
for (Integer i = 0; i < numHLAlleles; i++){
|
||||
for (Integer j = i; j < numHLAlleles; j++){
|
||||
if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1){
|
||||
if((LOD[i][j] >= maxA - 5) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxAphase
|
||||
if((LOD[i][j] >= maxA - 10) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxAphase
|
||||
likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodA)/inverseMaxProbA;
|
||||
//out.printf("%s\t%s\tloglikelihood=%5.0f\tmax=%5.0f\tinvP=%.2f\tPrior=%.3f\tPhase=%.3f\tfreq1=%s\tfreq2=%s\tf1*f2=%.8f\tProb=%.3f",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],maxlikelihoodA,inverseMaxProbA,likelihoodPrior,PhasingScores[i][j],SingleAlleleFrequencies[i],SingleAlleleFrequencies[j],CombinedAlleleFrequencies[i][j],likelihoodPrior*CombinedAlleleFrequencies[i][j]/ProbSumA);
|
||||
out.printf("%s\t%s\tloglikelihood=%5.0f\tP(SSG)=%.3f\tP(Phase)=%.3f\tF1=%s\tF2=%s\tF1*F2=%.8f\tProb=%.3f",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],likelihoodPrior,PhasingScores[i][j]/PhaseSumA,SingleAlleleFrequencies[i],SingleAlleleFrequencies[j],CombinedAlleleFrequencies[i][j],likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j]/ProbSumA);
|
||||
|
|
@ -816,14 +872,14 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
out.printf("\n");
|
||||
}
|
||||
} else if (HLAnames.get(i).indexOf("HLA_B") > -1 && HLAnames.get(j).indexOf("HLA_B") > -1){
|
||||
if((LOD[i][j] >= maxB - 5) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxBphase
|
||||
if((LOD[i][j] >= maxB - 10) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxBphase
|
||||
likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodB)/inverseMaxProbB;
|
||||
out.printf("%s\t%s\tloglikelihood=%5.0f\tP(SSG)=%.3f\tP(Phase)=%.3f\tF1=%s\tF2=%s\tF1*F2=%.8f\tProb=%.3f",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],likelihoodPrior,PhasingScores[i][j]/PhaseSumB,SingleAlleleFrequencies[i],SingleAlleleFrequencies[j],CombinedAlleleFrequencies[i][j],likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j]/ProbSumB);
|
||||
if (likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] == maxProbB){out.printf("\tBEST");}
|
||||
out.printf("\n");
|
||||
}
|
||||
} else if (HLAnames.get(i).indexOf("HLA_C") > -1 && HLAnames.get(j).indexOf("HLA_C") > -1){
|
||||
if((LOD[i][j] >= maxC - 5) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxCphase
|
||||
if((LOD[i][j] >= maxC - 10) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxCphase
|
||||
likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodC)/inverseMaxProbC;
|
||||
out.printf("%s\t%s\tloglikelihood=%5.0f\tP(SSG)=%.3f\tP(Phase)=%.3f\tF1=%s\tF2=%s\tF1*F2=%.8f\tProb=%.3f",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],likelihoodPrior,PhasingScores[i][j]/PhaseSumC,SingleAlleleFrequencies[i],SingleAlleleFrequencies[j],CombinedAlleleFrequencies[i][j],likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j]/ProbSumC);
|
||||
if (likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] == maxProbC){out.printf("\tBEST");}
|
||||
|
|
|
|||
|
|
@ -12,10 +12,19 @@ package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
|||
*
|
||||
* @author shermanjia
|
||||
*/
|
||||
public class ReadCigarFormatter {
|
||||
public class CigarParser {
|
||||
String formattedRead;
|
||||
|
||||
public String GetFormattedRead(){
|
||||
return formattedRead;
|
||||
}
|
||||
|
||||
|
||||
public String FormatRead(String cigar, String read){
|
||||
// returns a cigar-formatted sequence (removes insertions, inserts 'D' to where deletions occur
|
||||
String formattedRead = ""; char c; String count;
|
||||
formattedRead = "";
|
||||
|
||||
char c; String count;
|
||||
int cigarPlaceholder = 0; int subcigarLength = 0;
|
||||
int readPlaceholder = 0; int subreadLength = 0;
|
||||
|
||||
|
|
@ -43,15 +52,25 @@ public class ReadCigarFormatter {
|
|||
//increment placeholders without adding inserted bases to sequence (effectively removes insertion).
|
||||
cigarPlaceholder = i+1;
|
||||
readPlaceholder = readPlaceholder + subreadLength;
|
||||
} else if (c == 'H' || c == 'S'){
|
||||
//(H = Headers or S = Soft clipped removed here)***
|
||||
} else if (c == 'H'){
|
||||
//(H = hard clip)
|
||||
|
||||
//If reaches H for insertion, get number before 'H' and skip that many characters in sequence
|
||||
//If reaches H for hard clip, simply carry on
|
||||
count = cigar.substring(cigarPlaceholder, i);
|
||||
subreadLength = Integer.parseInt(count);
|
||||
|
||||
//increment cigar placeholder without adding inserted bases to sequence (effectively removes insertion).
|
||||
cigarPlaceholder = i+1;
|
||||
} else if (c == 'S'){
|
||||
//(S = Soft clipped bases discarded here)***
|
||||
|
||||
//If reaches S for soft clip, get number before 'S' and skip that many characters in sequence
|
||||
count = cigar.substring(cigarPlaceholder, i);
|
||||
subreadLength = Integer.parseInt(count);
|
||||
|
||||
//increment cigar placeholder without adding inserted bases to sequence (effectively removes insertion).
|
||||
cigarPlaceholder = i+1;
|
||||
readPlaceholder = readPlaceholder + subreadLength;
|
||||
} else if (c == 'D'){
|
||||
//If reaches D for deletion, insert 'D' into sequence as placeholder
|
||||
count = cigar.substring(cigarPlaceholder, i);
|
||||
|
|
@ -4,25 +4,15 @@
|
|||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.HLAcaller.ReadCigarFormatter;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Hashtable;
|
||||
import java.util.List;
|
||||
import java.lang.Math;
|
||||
/**
|
||||
*
|
||||
* @author shermanjia
|
||||
*/
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
public class CreateHaplotypesWalker extends ReadWalker<Integer, Integer> {
|
||||
ReadCigarFormatter formatter = new ReadCigarFormatter();
|
||||
CigarParser formatter = new CigarParser();
|
||||
char c;
|
||||
boolean DEBUG = false;
|
||||
int HLA_A_start = 30018310;
|
||||
|
|
|
|||
|
|
@ -5,142 +5,103 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.HLAcaller.ReadCigarFormatter;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Hashtable;
|
||||
import java.util.List;
|
||||
import java.lang.Math;
|
||||
/**
|
||||
*
|
||||
* @author shermanjia
|
||||
*/
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.sam";
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||
@Argument(fullName = "debugRead", shortName = "debugRead", doc = "Print match score for read", required = false)
|
||||
public String debugRead = "";
|
||||
|
||||
@Argument(fullName = "findFirst", shortName = "findFirst", doc = "For each read, stop when first HLA allele is found with concordance = 1", required = false)
|
||||
public boolean findFirst = false;
|
||||
|
||||
@Argument(fullName = "debugAllele", shortName = "debugAllele", doc = "Print match score for allele", required = false)
|
||||
public String debugAllele = "";
|
||||
|
||||
@Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
|
||||
public String ethnicity = "Caucasian";
|
||||
|
||||
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
||||
public boolean ONLYFREQUENT = false;
|
||||
|
||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
|
||||
SAMFileReader HLADictionaryReader = new SAMFileReader();
|
||||
|
||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
|
||||
String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
|
||||
String AlleleFrequencyFile;
|
||||
|
||||
String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites";
|
||||
|
||||
boolean DatabaseLoaded = false;
|
||||
boolean DEBUG = false;
|
||||
|
||||
ArrayList<String> HLAreads = new ArrayList<String>();
|
||||
ArrayList<String> HLAcigars = new ArrayList<String>();
|
||||
ArrayList<String> HLAnames = new ArrayList<String>();
|
||||
ArrayList<String> HLApositions = new ArrayList<String>();
|
||||
String[] HLAnames, HLAreads;
|
||||
Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites;
|
||||
double[] SingleAlleleFrequencies;
|
||||
|
||||
double[] nummatched, concordance, numcompared;
|
||||
int numHLAlleles = 0;
|
||||
int[] HLAstartpos;
|
||||
int[] HLAstoppos;
|
||||
int minstartpos = 0;
|
||||
int maxstoppos = 0;
|
||||
|
||||
int HLA_A_start = 30018310;
|
||||
int HLA_A_end = 30021211;
|
||||
|
||||
ArrayList<String> PolymorphicSites = new ArrayList<String>();
|
||||
|
||||
Hashtable AlleleFrequencies = new Hashtable();
|
||||
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
|
||||
ReadCigarFormatter formatter = new ReadCigarFormatter();
|
||||
CigarParser formatter = new CigarParser();
|
||||
|
||||
public Integer reduceInit() {
|
||||
if (!DatabaseLoaded){
|
||||
try{
|
||||
out.printf("Reading HLA database ...\n");
|
||||
FileInputStream fstream = new FileInputStream(HLAdatabaseFile);
|
||||
DataInputStream in = new DataInputStream(fstream);
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(in));
|
||||
String strLine; String [] s = null;
|
||||
//Read File Line By Line
|
||||
int i = 0;
|
||||
while ((strLine = br.readLine()) != null) {
|
||||
s = strLine.split("\\t");
|
||||
|
||||
if (s.length>=10){
|
||||
//Parse the reads with cigar parser
|
||||
HLAreads.add(formatter.FormatRead(s[5],s[9]));
|
||||
HLAcigars.add(s[5]);
|
||||
HLAnames.add(s[0]);
|
||||
|
||||
HLApositions.add(s[3]);
|
||||
if (s[0].indexOf("HLA_A") > -1){
|
||||
if (iAstart < 0){iAstart=i;}
|
||||
iAstop = i; i++;
|
||||
}else if (s[0].indexOf("HLA_B") > -1){
|
||||
if (iBstart < 0){iBstart=i;}
|
||||
iBstop = i; i++;
|
||||
}else if (s[0].indexOf("HLA_C") > -1){
|
||||
if (iCstart < 0){iCstart=i;}
|
||||
iCstop = i; i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
in.close();
|
||||
int n = HLApositions.size(); numHLAlleles = n;
|
||||
HLAstartpos = new int[n]; HLAstoppos = new int[n];
|
||||
SingleAlleleFrequencies = new double[n];
|
||||
|
||||
|
||||
for (i = 0; i < n; i++){
|
||||
//Find start and stop positions for each allele
|
||||
HLAstartpos[i]=Integer.parseInt(HLApositions.get(i));
|
||||
HLAstoppos[i]=HLAstartpos[i]+HLAreads.get(i).length()-1;
|
||||
if (minstartpos == 0){minstartpos = HLAstartpos[i];}
|
||||
minstartpos = Math.min(minstartpos, HLAstartpos[i]);
|
||||
maxstoppos = Math.max(maxstoppos, HLAstoppos[i]);
|
||||
SingleAlleleFrequencies[i]=0.0;
|
||||
//Initialize matrix of probabilities / likelihoods
|
||||
|
||||
}
|
||||
out.printf("DONE! Read %s alleles\n",HLAreads.size());
|
||||
}catch (Exception e){//Catch exception if any
|
||||
System.err.println("Error: " + e.getMessage());
|
||||
}
|
||||
|
||||
try{
|
||||
out.printf("Reading allele frequences ...");
|
||||
FileInputStream fstream = new FileInputStream(CaucasianAlleleFrequencyFile);
|
||||
DataInputStream in = new DataInputStream(fstream);
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(in));
|
||||
String strLine; String [] s = null;
|
||||
//Read File Line By Line
|
||||
int count = 0;
|
||||
while ((strLine = br.readLine()) != null) {
|
||||
s = strLine.split("\\t");
|
||||
AlleleFrequencies.put(s[0].substring(0, 6), s[1]);
|
||||
count++;
|
||||
}
|
||||
in.close();
|
||||
out.printf("Done! Read %s alleles\n",count);
|
||||
}catch (Exception e){//Catch exception if any
|
||||
System.err.println("Error: " + e.getMessage());
|
||||
}
|
||||
|
||||
char c;
|
||||
if (!DatabaseLoaded){
|
||||
DatabaseLoaded = true;
|
||||
//Find polymorphic sites in dictionary
|
||||
for (int pos = minstartpos; pos <= maxstoppos; pos++){
|
||||
c = '0';
|
||||
for (int i = 0; i < HLAreads.size(); i++){
|
||||
if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){
|
||||
if (c == '0'){c = HLAreads.get(i).charAt(pos-HLAstartpos[i]);}
|
||||
if (HLAreads.get(i).charAt(pos-HLAstartpos[i]) != c){
|
||||
PolymorphicSites.add(String.valueOf(pos));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
out.printf("%s polymorphic sites found in HLA dictionary\n",PolymorphicSites.size());
|
||||
|
||||
out.printf("Comparing reads to database ...\n");
|
||||
//Load HLA dictionary
|
||||
out.printf("INFO Loading HLA dictionary ... ");
|
||||
|
||||
HLADictionaryReader.ReadFile(HLAdatabaseFile);
|
||||
HLAreads = HLADictionaryReader.GetReads();
|
||||
HLAnames = HLADictionaryReader.GetReadNames();
|
||||
HLAstartpos = HLADictionaryReader.GetStartPositions();
|
||||
HLAstoppos = HLADictionaryReader.GetStopPositions();
|
||||
minstartpos = HLADictionaryReader.GetMinStartPos();
|
||||
maxstoppos = HLADictionaryReader.GetMaxStopPos();
|
||||
|
||||
out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);
|
||||
|
||||
nummatched = new double[HLAreads.length];
|
||||
concordance = new double[HLAreads.length];
|
||||
numcompared = new double[HLAreads.length];
|
||||
|
||||
//Read allele frequencies
|
||||
if (ethnicity.equals("Black")){
|
||||
AlleleFrequencyFile = BlackAlleleFrequencyFile;
|
||||
}else{
|
||||
AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
|
||||
}
|
||||
out.printf("INFO Reading HLA allele frequencies ... ");
|
||||
FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
|
||||
HLAfreqReader.ReadFile(AlleleFrequencyFile);
|
||||
AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
|
||||
out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
|
||||
|
||||
//FindPolymorphicSites(minstartpos,maxstoppos);
|
||||
|
||||
PolymorphicSitesFileReader siteFileReader = new PolymorphicSitesFileReader();
|
||||
siteFileReader.ReadFile(PolymorphicSitesFile);
|
||||
PolymorphicSites = siteFileReader.GetPolymorphicSites();
|
||||
NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites();
|
||||
|
||||
|
||||
out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length);
|
||||
out.printf("INFO Comparing reads to database ...\n");
|
||||
|
||||
if (DEBUG){
|
||||
//out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles);
|
||||
|
|
@ -149,97 +110,148 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
return 0;
|
||||
}
|
||||
|
||||
private void FindPolymorphicSites(int start, int stop){
|
||||
boolean initialized, polymorphic, examined;
|
||||
char c = ' ';
|
||||
ArrayList<Integer> polymorphicsites = new ArrayList<Integer>();
|
||||
ArrayList<Integer> nonpolymorphicsites = new ArrayList<Integer>();
|
||||
//Find polymorphic sites in dictionary
|
||||
for (int pos = start; pos <= stop; pos++){
|
||||
initialized = false; polymorphic = false; examined = false;
|
||||
//look across all alleles at specific position to see if it is polymorphic
|
||||
for (int i = 0; i < HLAreads.length; i++){
|
||||
if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){
|
||||
if (!initialized){
|
||||
c = HLAreads[i].charAt(pos-HLAstartpos[i]);
|
||||
initialized = true;
|
||||
examined = true;
|
||||
}
|
||||
if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){
|
||||
polymorphicsites.add(pos);
|
||||
out.printf("POLYMORPHIC\t6\t%s\n", pos);
|
||||
polymorphic = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
public Integer map(char[] ref, SAMRecord read) {
|
||||
}
|
||||
if (!polymorphic && examined){
|
||||
nonpolymorphicsites.add(pos);
|
||||
out.printf("CONSERVED\t6\t%s\n", pos);
|
||||
}
|
||||
}
|
||||
PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]);
|
||||
NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]);
|
||||
}
|
||||
|
||||
private double CalculateConcordance(SAMRecord read){
|
||||
int readstart = read.getAlignmentStart();
|
||||
int readstop = read.getAlignmentEnd();
|
||||
double[] nummatched = new double[HLAreads.size()];
|
||||
double[] concordance = new double[HLAreads.size()];
|
||||
double[] numcompared = new double[HLAreads.size()];
|
||||
double maxConcordance = 0;
|
||||
String s1 = formatter.FormatRead(read.getCigarString(), read.getReadString());
|
||||
int numpolymorphicsites, numnonpolymorphicsites, pos;
|
||||
char c1, c2;
|
||||
String s2 = "", name = "";
|
||||
double maxConcordance = 0.0, freq = 0.0, minFreq = 0.0;
|
||||
String s1 = formatter.FormatRead(read.getCigarString(), read.getReadString());
|
||||
String s2;
|
||||
int allelestart, allelestop;
|
||||
|
||||
//For every allele that overlaps with current read
|
||||
for (int i = 0; i < HLAreads.size(); i++){
|
||||
nummatched[i] = 0;
|
||||
numpolymorphicsites = PolymorphicSites.length;
|
||||
numnonpolymorphicsites = NonPolymorphicSites.length;
|
||||
|
||||
if (ONLYFREQUENT){
|
||||
minFreq = 0.0001;
|
||||
}
|
||||
|
||||
for (int i = 0; i < HLAreads.length; i++){
|
||||
nummatched[i] = 0; concordance[i] = 0; numcompared[i] = 0;
|
||||
freq = GetAlleleFrequency(HLAnames[i]);
|
||||
//Get concordance between read and specific allele
|
||||
if (DEBUG){
|
||||
//out.printf("%s\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames.get(i),readstart,readstop,HLAstartpos[i],HLAstoppos[i]);
|
||||
}
|
||||
if (readstart <= HLAstoppos[i] && readstop >= HLAstartpos[i]){
|
||||
s2 = HLAreads.get(i);
|
||||
for (int j = read.getAlignmentStart(); j <= read.getAlignmentEnd(); j++){
|
||||
c1 = s1.charAt(j-readstart);
|
||||
c2 = s2.charAt(j-HLAstartpos[i]);
|
||||
if (DEBUG){
|
||||
//out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames.get(i),j,j-readstart,j-HLAstartpos[i],c1,c2);
|
||||
}
|
||||
if (c1 != 'D'){
|
||||
if (PolymorphicSites.contains(String.valueOf(j))){
|
||||
if (readstart <= HLAstoppos[i] && readstop >= HLAstartpos[i] && freq > minFreq){
|
||||
s2 = HLAreads[i];
|
||||
|
||||
allelestart = HLAstartpos[i];
|
||||
allelestop = HLAstoppos[i];
|
||||
|
||||
//Polymorphic sites: always increment denominator, increment numerator when bases are concordant
|
||||
for (int j = 0; j < numpolymorphicsites; j++){
|
||||
pos = PolymorphicSites[j];
|
||||
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
|
||||
c1 = s1.charAt(pos-readstart);
|
||||
c2 = s2.charAt(pos-allelestart);
|
||||
if (c1 != 'D'){
|
||||
numcompared[i]++;
|
||||
if (c1 == c2){
|
||||
nummatched[i]++;
|
||||
}else{
|
||||
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
|
||||
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
|
||||
}
|
||||
}
|
||||
}else if (c1 != c2){
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//Non-polymorphic sites: increment denominator only when bases are discordant
|
||||
|
||||
for (int j = 0; j < numnonpolymorphicsites; j++){
|
||||
pos = NonPolymorphicSites[j];
|
||||
if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){
|
||||
c1 = s1.charAt(pos-readstart);
|
||||
c2 = s2.charAt(pos-allelestart);
|
||||
if (c1 != c2){
|
||||
numcompared[i]++;
|
||||
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
|
||||
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//Update concordance array
|
||||
concordance[i]=nummatched[i]/numcompared[i];
|
||||
|
||||
if (concordance[i] > maxConcordance){maxConcordance = concordance[i];}
|
||||
if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){
|
||||
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames[i],concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
|
||||
}
|
||||
if (findFirst && (concordance[i] == 1)){
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
double freq, freq2, maxFreq = 0.0;
|
||||
for (int i = 0; i < HLAreads.size(); i++){
|
||||
|
||||
return maxConcordance;
|
||||
}
|
||||
|
||||
private double FindMaxAlleleFrequency(double maxConcordance){
|
||||
//finds the max frequency of the alleles that share the maximum concordance with the read of interest
|
||||
double freq, maxFreq = 0.0;
|
||||
for (int i = 0; i < HLAreads.length; i++){
|
||||
if (concordance[i] == maxConcordance && maxConcordance > 0){
|
||||
if (HLAnames.get(i).length() >= 10){
|
||||
name=HLAnames.get(i).substring(4,10);
|
||||
}else{
|
||||
name=HLAnames.get(i).substring(4);
|
||||
}
|
||||
if (AlleleFrequencies.containsKey(name)){
|
||||
freq = Double.parseDouble((String) AlleleFrequencies.get(name).toString());
|
||||
if (DEBUG){
|
||||
out.printf("%s\t%s\t%s\t%s\t%s\t%.4f\n",read.getReadName(),name,nummatched[i],numcompared[i],concordance[i],freq);
|
||||
}
|
||||
}else{
|
||||
freq=0.0001;
|
||||
}
|
||||
freq = GetAlleleFrequency(HLAnames[i]);
|
||||
if (freq > maxFreq){maxFreq = freq;}
|
||||
}
|
||||
}
|
||||
return maxFreq;
|
||||
}
|
||||
|
||||
if (read.getReadName().length() >= 10){
|
||||
name=read.getReadName().substring(4,10);
|
||||
}else{
|
||||
name=read.getReadName().substring(4);
|
||||
}
|
||||
public Integer map(char[] ref, SAMRecord read) {
|
||||
//Calculate concordance for this read and all overlapping reads
|
||||
double maxConcordance = CalculateConcordance(read);
|
||||
|
||||
String readname = read.getReadName(), allelename = ""; double freq;
|
||||
//For input bam files that contain HLA alleles, find and print allele frequency
|
||||
freq = GetAlleleFrequency(readname);
|
||||
out.printf("%s\t%.4f", readname,freq);
|
||||
|
||||
//Find the maximum frequency of the alleles most concordant with the read
|
||||
double maxFreq = FindMaxAlleleFrequency(maxConcordance);
|
||||
|
||||
if (AlleleFrequencies.containsKey(name)){
|
||||
freq = Double.parseDouble((String) AlleleFrequencies.get(name).toString());
|
||||
}else{
|
||||
freq=0.0001;
|
||||
}
|
||||
|
||||
out.printf("%s\t%.4f", read.getReadName(),freq);
|
||||
for (int i = 0; i < HLAreads.size(); i++){
|
||||
//Print concordance statistics between this read and the most similar HLA allele(s)
|
||||
for (int i = 0; i < HLAreads.length; i++){
|
||||
if (concordance[i] == maxConcordance && maxConcordance > 0){
|
||||
if (HLAnames.get(i).length() >= 10){
|
||||
name=HLAnames.get(i).substring(4,10);
|
||||
}else{
|
||||
name=HLAnames.get(i).substring(4);
|
||||
}
|
||||
if (AlleleFrequencies.containsKey(name)){
|
||||
freq = Double.parseDouble((String) AlleleFrequencies.get(name).toString());
|
||||
}else{
|
||||
freq=0.0001;
|
||||
}
|
||||
freq = GetAlleleFrequency(HLAnames[i]);
|
||||
if (freq == maxFreq){
|
||||
|
||||
out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames.get(i),freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
|
||||
out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -247,6 +259,22 @@ public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
|||
return 1;
|
||||
}
|
||||
|
||||
private double GetAlleleFrequency(String allelename){
|
||||
double frequency = 0.0;
|
||||
//Truncate names to 4-digit "A*0101" format
|
||||
if (allelename.length() >= 10){
|
||||
allelename=allelename.substring(4,10);
|
||||
}else{
|
||||
allelename=allelename.substring(4);
|
||||
}
|
||||
if (AlleleFrequencies.containsKey(allelename)){
|
||||
frequency = Double.parseDouble((String) AlleleFrequencies.get(allelename).toString());
|
||||
}else{
|
||||
frequency=0.0001;
|
||||
}
|
||||
return frequency;
|
||||
}
|
||||
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,39 @@
|
|||
/*
|
||||
* To change this template, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Hashtable;
|
||||
/**
 * Reads a tab-delimited text file of HLA allele frequencies into a
 * lookup table. Each input line is expected as:
 * allele name, TAB, frequency (kept as text).
 *
 * @author shermanjia
 */
public class FrequencyFileReader {
    // Maps allele name (String) -> frequency (String). Kept as a raw
    // Hashtable for backward compatibility with existing callers that
    // do AlleleFrequencies.get(name).toString().
    Hashtable AlleleFrequencies = new Hashtable();

    /** Returns the allele-name -> frequency table loaded by ReadFile. */
    public Hashtable GetAlleleFrequencies(){
        return AlleleFrequencies;
    }

    /**
     * Loads allele frequencies from the given tab-delimited file.
     * Malformed lines (fewer than two tab-separated fields) are skipped
     * instead of aborting the whole load (the original threw
     * ArrayIndexOutOfBoundsException and stopped reading). Errors are
     * reported to stderr, matching the original best-effort behavior.
     *
     * @param filename path to the frequency file
     */
    public void ReadFile(String filename){
        // try-with-resources guarantees the stream is closed even if a
        // line fails to parse (the original leaked the stream on error).
        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)))) {
            String strLine;
            //Read File Line By Line
            while ((strLine = br.readLine()) != null) {
                String[] s = strLine.split("\\t");
                if (s.length >= 2){ // skip lines without a frequency column
                    AlleleFrequencies.put(s[0], s[1]);
                }
            }
        }catch (Exception e){//Catch exception if any
            System.err.println("FrequencyFileReader Error: " + e.getMessage());
        }
    }
}
|
||||
|
||||
|
|
@ -5,8 +5,6 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.HLAcaller.ReadCigarFormatter;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
|
|
@ -15,8 +13,6 @@ import java.io.DataInputStream;
|
|||
import java.io.InputStreamReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Hashtable;
|
||||
import java.util.List;
|
||||
import java.lang.Math;
|
||||
/**
|
||||
*
|
||||
* @author shermanjia
|
||||
|
|
@ -53,7 +49,7 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
|
|||
|
||||
Hashtable ClosestAllele = new Hashtable();
|
||||
int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1;
|
||||
ReadCigarFormatter formatter = new ReadCigarFormatter();
|
||||
CigarParser formatter = new CigarParser();
|
||||
|
||||
public Integer reduceInit() {
|
||||
if (!DatabaseLoaded){
|
||||
|
|
@ -106,7 +102,7 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
|
|||
}
|
||||
out.printf("DONE! Read %s alleles\n",HLAreads.size());
|
||||
}catch (Exception e){//Catch exception if any
|
||||
System.err.println("Error: " + e.getMessage());
|
||||
System.err.println("ImputeAllelsWalker Error: " + e.getMessage());
|
||||
}
|
||||
|
||||
try{
|
||||
|
|
@ -126,7 +122,7 @@ public class ImputeAllelesWalker extends ReadWalker<Integer, Integer> {
|
|||
in.close();
|
||||
out.printf("Done! Read %s alleles\n",count);
|
||||
}catch (Exception e){//Catch exception if any
|
||||
System.err.println("Error: " + e.getMessage());
|
||||
System.err.println("ImputeAllelsWalker Error: " + e.getMessage());
|
||||
}
|
||||
|
||||
char c;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,49 @@
|
|||
/*
|
||||
* To change this template, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
/**
 * Reads a text file listing polymorphic and conserved (non-polymorphic)
 * sites in the HLA dictionary. Each line is expected as:
 * classification TAB chromosome TAB position, where classification is
 * "POLYMORPHIC"; any other value is treated as non-polymorphic.
 *
 * @author shermanjia
 */
public class PolymorphicSitesFileReader {
    ArrayList<Integer> PolymorphicSites = new ArrayList<Integer>();
    ArrayList<Integer> NonPolymorphicSites = new ArrayList<Integer>();

    /** Returns the positions classified as polymorphic. */
    public Integer[] GetPolymorphicSites(){
        return PolymorphicSites.toArray(new Integer[PolymorphicSites.size()]);
    }

    /** Returns the positions classified as non-polymorphic (conserved). */
    public Integer[] GetNonPolymorphicSites(){
        return NonPolymorphicSites.toArray(new Integer[NonPolymorphicSites.size()]);
    }

    /**
     * Loads site classifications from the given tab-delimited file.
     * Lines with fewer than three fields are skipped so one malformed
     * line does not abort the whole load. Errors go to stderr, matching
     * the original best-effort behavior.
     *
     * @param filename path to the polymorphic-sites file
     */
    public void ReadFile(String filename){
        // try-with-resources closes the stream even on a parse error
        // (the original leaked the stream when a line failed to parse).
        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)))) {
            String strLine;
            //Read File Line By Line
            while ((strLine = br.readLine()) != null) {
                String[] s = strLine.split("\\t");
                if (s.length < 3){
                    continue; // skip malformed lines
                }
                if (s[0].equals("POLYMORPHIC")){
                    PolymorphicSites.add(Integer.valueOf(s[2]));
                }else{
                    NonPolymorphicSites.add(Integer.valueOf(s[2]));
                }
            }
        }catch (Exception e){//Catch exception if any
            System.err.println("PolymorphicSitesFileReader Error: " + e.getMessage());
        }
    }
}
|
||||
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
/*
|
||||
* To change this template, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
/**
|
||||
*
|
||||
* @author shermanjia
|
||||
*/
|
||||
public class SAMFileReader {
|
||||
ArrayList<String> ReadStrings = new ArrayList<String>();
|
||||
ArrayList<String> CigarStrings = new ArrayList<String>();
|
||||
ArrayList<String> ReadNames = new ArrayList<String>();
|
||||
ArrayList<Integer> ReadStartPositions = new ArrayList<Integer>();
|
||||
ArrayList<Integer> ReadStopPositions = new ArrayList<Integer>();
|
||||
int minstartpos;
|
||||
int maxstoppos;
|
||||
|
||||
CigarParser formatter = new CigarParser();
|
||||
|
||||
public String[] GetReads(){
|
||||
return ReadStrings.toArray(new String[ReadStrings.size()]);
|
||||
}
|
||||
|
||||
public String[] GetReadNames(){
|
||||
return ReadNames.toArray(new String[ReadNames.size()]);
|
||||
}
|
||||
|
||||
public String[] GetCigarStrings(){
|
||||
return CigarStrings.toArray(new String[CigarStrings.size()]);
|
||||
}
|
||||
|
||||
public Integer[] GetStartPositions(){
|
||||
return ReadStartPositions.toArray(new Integer[ReadStartPositions.size()]);
|
||||
}
|
||||
|
||||
public Integer[] GetStopPositions(){
|
||||
return ReadStopPositions.toArray(new Integer[ReadStopPositions.size()]);
|
||||
}
|
||||
|
||||
public Integer GetMinStartPos(){
|
||||
return minstartpos;
|
||||
}
|
||||
|
||||
public Integer GetMaxStopPos(){
|
||||
return maxstoppos;
|
||||
}
|
||||
|
||||
public int GetReadIndex(String readname){
|
||||
if (ReadNames.contains(readname)){
|
||||
return ReadNames.indexOf(readname);
|
||||
}else{
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
public void ReadFile(String filename){
|
||||
try{
|
||||
FileInputStream fstream = new FileInputStream(filename);
|
||||
DataInputStream in = new DataInputStream(fstream);
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(in));
|
||||
String strLine; String [] s = null;
|
||||
//Read File Line By Line
|
||||
int i = 0;
|
||||
while ((strLine = br.readLine()) != null) {
|
||||
s = strLine.split("\\t");
|
||||
if (s.length>=10){
|
||||
//Parse the reads with cigar parser
|
||||
String read = formatter.FormatRead(s[5],s[9]);
|
||||
ReadStrings.add(read);
|
||||
CigarStrings.add(s[5]);
|
||||
ReadNames.add(s[0]);
|
||||
ReadStartPositions.add(Integer.valueOf(s[3]));
|
||||
ReadStopPositions.add(Integer.valueOf(s[3]) + read.length() - 1);
|
||||
if (i == 0){
|
||||
minstartpos = Integer.valueOf(s[3]);
|
||||
maxstoppos = Integer.valueOf(Integer.valueOf(s[3]) + read.length() - 1);
|
||||
}
|
||||
minstartpos = Math.min(minstartpos, Integer.valueOf(s[3]));
|
||||
maxstoppos = Math.max(maxstoppos, Integer.valueOf(Integer.valueOf(s[3]) + read.length() - 1));
|
||||
i++;
|
||||
}
|
||||
}
|
||||
in.close();
|
||||
}catch (Exception e){//Catch exception if any
|
||||
System.err.println("SAMFileReader Error: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
/*
|
||||
* To change this template, choose Tools | Templates
|
||||
* and open the template in the editor.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
/**
 * Reads a text file describing how similar each read is to its closest
 * HLA allele, and collects the names of reads that match poorly
 * (likely misaligned) so they can be filtered out downstream.
 * Expected tab-delimited columns: read name at index 0, match fraction
 * at index 4, mismatch count at index 6.
 *
 * @author shermanjia
 */
public class SimilarityFileReader {
    ArrayList<String> ReadsToDiscard = new ArrayList<String>();

    /** Returns the names of reads flagged for removal. */
    public ArrayList<String> GetReadsToDiscard(){
        return ReadsToDiscard;
    }

    /** Returns the flagged read names as an array. */
    public String[] GetReadsToDiscardArray(){
        return ReadsToDiscard.toArray(new String[ReadsToDiscard.size()]);
    }

    /**
     * Loads the similarity file and flags reads to discard.
     * A read is discarded when it matches its closest allele at under
     * 90% with more than 3 mismatches, or has 6 or more mismatches
     * outright. Errors go to stderr, matching the original best-effort
     * behavior.
     *
     * @param filename path to the similarity file
     */
    public void ReadFile(String filename){
        // try-with-resources closes the stream even on a parse error
        // (the original leaked the stream when a line failed to parse).
        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)))) {
            String strLine;
            //Read File Line By Line
            while ((strLine = br.readLine()) != null) {
                String[] s = strLine.split("\\t");
                // BUG FIX: the original guard was s.length >= 6 but the
                // code reads s[6], so a 6-field line threw
                // ArrayIndexOutOfBoundsException and aborted the whole
                // load. Require all 7 fields before parsing.
                if (s.length >= 7){
                    double matchFraction = Double.parseDouble(s[4]);
                    int numMismatches = Integer.parseInt(s[6]);
                    if ((matchFraction < 0.9 && numMismatches > 3) || (numMismatches >= 6)){
                        ReadsToDiscard.add(s[0]);
                    }
                }
            }
        }catch (Exception e){//Catch exception if any
            System.err.println("SimilarityFile Error: " + e.getMessage());
        }
    }
}
|
||||
|
||||
Loading…
Reference in New Issue