diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateAlleleLikelihoodsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateAlleleLikelihoodsWalker.java index 34e4bf312..093460acf 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateAlleleLikelihoodsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateAlleleLikelihoodsWalker.java @@ -33,7 +33,9 @@ import org.broadinstitute.sting.commandline.Argument; import java.util.ArrayList; import java.util.Hashtable; - +import java.util.Enumeration; +import java.util.Vector; +import java.util.Collections; /** * Calculates likelihood of observing the data given pairs of HLA alleles. NOTE: run CalculateBaseLikelihoods first! Usage: java -jar GenomeAnalysisTK.jar -T CalculateAlleleLikelihoods -I /humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl INPUT.baselikelihoods -eth\ @@ -60,17 +62,20 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker HLAnamesAL, HLAreadsAL; + String[] HLAnames, HLAreads, HLAnames2, HLAreads2; + Integer[] HLAstartpos, HLAstoppos, HLAstartpos2, HLAstoppos2; + ArrayList HLAnamesAL, HLAreadsAL, Loci, AllelesToSearch; ArrayList HLAstartposAL, HLAstopposAL; public Integer reduceInit() { @@ -86,21 +91,27 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker(); HLAstopposAL = new ArrayList(); - if (!ethnicity.equals("CaucasianUSA")){ - AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_" + ethnicity + ".freq"; - } - out.printf("INFO Reading HLA allele frequencies ... 
"); - FrequencyFileReader HLAfreqReader = new FrequencyFileReader(); - HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile); - AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies(); - UniqueAlleles = HLAfreqReader.GetUniqueAlleles(); - out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size()); + out.printf("INFO Reading HLA alleles ... "); + HLAFileReader HLADictionaryReader = new HLAFileReader(); + HLADictionaryReader.ReadFile(HLAdatabaseFile); + HLAreads = HLADictionaryReader.GetSequences(); + HLAnames = HLADictionaryReader.GetNames(); + HLAstartpos = HLADictionaryReader.GetStartPositions(); + HLAstoppos = HLADictionaryReader.GetStopPositions(); + + HLADictionaryReader = new HLAFileReader(); + HLADictionaryReader.ReadFile(HLA2DigitFile); + HLAreads2 = HLADictionaryReader.GetSequences(); + HLAnames2 = HLADictionaryReader.GetNames(); + HLAstartpos2 = HLADictionaryReader.GetStartPositions(); + HLAstoppos2 = HLADictionaryReader.GetStopPositions(); + out.printf("Done! 
%s HLA alleles loaded.\n",HLAreads.length); //out.printf("INFO Common alleles:\n"); for (int i = 1; i < UniqueAlleles.size(); i++){ //out.printf("INFO %s\n",UniqueAlleles.values().toArray()[i]); } - out.printf("INFO Reading HLA dictionary ..."); + //out.printf("INFO Reading HLA dictionary ..."); } @@ -108,10 +119,11 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker -1 && index2 > -1){ out.printf("INFO: debugging %s\t%s\t%s\t%s\n",s[0],s[1],index1,index2); - double dl = CalculateLikelihood(index1,index2,true); + double dl = CalculateLikelihood(index1,index2,HLAreads2,true); break; } } } - + + //Pre-process homozygous combinations to determine top possible alleles (for efficiency) + int numreads2 = HLAnames2.length; + Alleles2Digit = new Hashtable(); + Loci = new ArrayList(); + double[] AlleleLikelihoods2 = new double[numreads]; for (int i = 0; i < numreads; i++){ name1 = HLAnames[i].substring(4); String [] n1 = name1.split("\\*"); -// out.printf("1: %s\n",n1[0] + "*" + n1[1].substring(0, 3)); - if (UniqueAlleles.containsKey(n1[0] + "*" + n1[1].substring(0, 4))){ - //out.printf("1: %s\n",name1); - //frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString()); - //if (frq1 > minfrq){ + numcombinations++; + AlleleLikelihoods2[i] = CalculateLikelihood(i,i,HLAreads,false); + if (AlleleLikelihoods2[i] < 0){ + name2 = n1[0] + "*" + n1[1].substring(0, 2); + if (!Loci.contains(n1[0])){Loci.add(n1[0]);} + if (!Alleles2Digit.containsKey(name2)){ + Alleles2Digit.put(name2, AlleleLikelihoods2[i]); + }else if ((Double) Alleles2Digit.get(name2) < AlleleLikelihoods2[i]){ + Alleles2Digit.put(name2, AlleleLikelihoods2[i]); + } + } + } + + //Sort alleles at 2 digit resolution for each locus + AllelesToSearch = new ArrayList(); + for (int i = 0; i < Loci.size(); i++){ + Enumeration k = Alleles2Digit.keys(); + Hashtable AllelesAtLoci = new Hashtable(); + + //find alleles at the locus + while( k.hasMoreElements() ){ + name1 = k.nextElement().toString(); 
+ String [] n1 = name1.split("\\*"); + if (Loci.get(i).equals(n1[0])){ + AllelesAtLoci.put(-1 * (Double) Alleles2Digit.get(name1), name1); + } + } + + //Sort alleles at locus, mark top six 2-digit classes for deep search + int num = 1; + Vector v = new Vector(AllelesAtLoci.keySet()); + Collections.sort(v); + for (Enumeration e = v.elements(); e.hasMoreElements();) { + Double key = Double.valueOf(e.nextElement().toString()); + String allele = AllelesAtLoci.get(key).toString(); + if (num <= 10){ + AllelesToSearch.add(allele); + + num++; + } + //out.printf("%s\t%s\n",allele,key); + } + } + + //Iterate through allele pairs to calculate likelihoods + if (true){ + numcombinations = 0; + for (int i = 0; i < numreads; i++){ + name1 = HLAnames[i].substring(4); + String [] n1 = name1.split("\\*"); + if (AllelesToSearch.contains(n1[0] + "*" + n1[1].substring(0, 2))){ + //out.printf("1: %s\n",name1); + //frq1 = Double.parseDouble((String) AlleleFrequencies.get(name1).toString()); + //if (frq1 > minfrq){ for (int j = i; j < numreads; j++){ name2 = HLAnames[j].substring(4); String [] n2 = name2.split("\\*"); -// out.printf("2: %s\n",n2[0] + "*" + n2[1].substring(0, 3)); - if (UniqueAlleles.containsKey(n2[0] + "*" + n2[1].substring(0, 4))){ - - // frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString()); - // if (frq2 > minfrq){ - if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){ - numcombinations++; - AlleleLikelihoods[i][j] = CalculateLikelihood(i,j,false); + if (AllelesToSearch.contains(n2[0] + "*" + n2[1].substring(0, 2))){ + if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){ + numcombinations++; + AlleleLikelihoods[i][j] = CalculateLikelihood(i,j,HLAreads,false); + if (AlleleLikelihoods[i][j] < 0){ out.printf("%s\t%s\t%s\t%.2f\n",numcombinations,name1,name2,AlleleLikelihoods[i][j]); } - // }else{ - // if (DEBUG){out.printf("%s has allele frequency%.5f\n",name2,frq2);} - // } - // }else{ - // if 
(DEBUG){out.printf("%s not found in allele frequency file\n",name2);} + } } } - //}else{ - // if (DEBUG){out.printf("%s has allele frequency%.5f\n",name1,frq1);} - //} - //}else{ - // if (DEBUG){out.printf("%s not found in allele frequency file\n",name1);} + } } } } - private double CalculateLikelihood(int a1, int a2, boolean debug){ - String read1 = HLAreads[a1]; - String read2 = HLAreads[a2]; + private double CalculateLikelihood(int a1, int a2, String[] HLAalleles, boolean debug){ + //Calculates likelihood for specific allele pair + String read1 = HLAalleles[a1]; + String read2 = HLAalleles[a2]; int start1 = HLAstartpos[a1]; int start2 = HLAstartpos[a2]; int stop1 = HLAstoppos[a1]; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java index 94baaa486..27651cd6f 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java @@ -72,34 +72,13 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker AllReads = new ArrayList(); ArrayList AllReadNames = new ArrayList(); - boolean HLAdataLoaded = false; + boolean dataLoaded = false; - //Loads HLA dictionary, allele frequencies, and reads to filter + //Loads reads to filter public Pair reduceInit() { - if (!HLAdataLoaded){ - HLAdataLoaded = true; - - out.printf("INFO Reading HLA database ... "); - HLADictionaryReader.ReadFile(HLAdatabaseFile); - HLAreads = HLADictionaryReader.GetReads(); - HLAnames = HLADictionaryReader.GetReadNames(); - HLAstartpos = HLADictionaryReader.GetStartPositions(); - HLAstoppos = HLADictionaryReader.GetStopPositions(); - InitializeVariables(HLAreads.length); - out.printf("Done! 
%s HLA alleles loaded.\n",HLAreads.length); - - - - if (!ethnicity.equals("CaucasianUSA")){ - AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_" + ethnicity + ".freq"; - } - out.printf("INFO Reading HLA allele frequencies ... "); - FrequencyFileReader HLAfreqReader = new FrequencyFileReader(); - HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile); - AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies(); - out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size()); - + if (!dataLoaded){ + dataLoaded = true; if (!filterFile.equals("")){ out.printf("INFO Reading properties file ... "); @@ -137,7 +116,7 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker { @Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false) public String ethnicity = "Caucasian"; + + @Argument(fullName = "useInterval", shortName = "useInterval", doc = "Use only these intervals", required = false) + public String intervalFile = ""; @Argument(fullName = "dictionary", shortName = "dictionary", doc = "bam file of HLA ditionary", required = false) public String HLAdictionaryFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.sam"; @Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false) public boolean ONLYFREQUENT = false; + + String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_DICTIONARY.txt"; + String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_POLYMORPHIC_SITES.txt"; + String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_FREQUENCIES.txt"; - SAMFileReader HLADictionaryReader = new SAMFileReader(); - - String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq"; - String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq"; - String AlleleFrequencyFile; - 
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles"; - - String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites"; + HLAFileReader HLADictionaryReader = new HLAFileReader(); boolean DatabaseLoaded = false; boolean DEBUG = false; + ArrayList ClosestAlleles = new ArrayList(); String[] HLAnames, HLAreads; Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites; @@ -88,6 +89,7 @@ public class FindClosestAlleleWalker extends ReadWalker { Hashtable AlleleFrequencies = new Hashtable(); int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1; CigarParser formatter = new CigarParser(); + int [][] intervals; int numIntervals; public Integer reduceInit() { if (!DatabaseLoaded){ @@ -96,9 +98,9 @@ public class FindClosestAlleleWalker extends ReadWalker { //Load HLA dictionary out.printf("INFO Loading HLA dictionary ... "); - HLADictionaryReader.ReadFile(HLAdictionaryFile); - HLAreads = HLADictionaryReader.GetReads(); - HLAnames = HLADictionaryReader.GetReadNames(); + HLADictionaryReader.ReadFile(HLAdatabaseFile); + HLAreads = HLADictionaryReader.GetSequences(); + HLAnames = HLADictionaryReader.GetNames(); HLAstartpos = HLADictionaryReader.GetStartPositions(); HLAstoppos = HLADictionaryReader.GetStopPositions(); minstartpos = HLADictionaryReader.GetMinStartPos(); @@ -110,26 +112,27 @@ public class FindClosestAlleleWalker extends ReadWalker { concordance = new double[HLAreads.length]; numcompared = new double[HLAreads.length]; - //Read allele frequencies - if (ethnicity.equals("Black")){ - AlleleFrequencyFile = BlackAlleleFrequencyFile; - }else{ - AlleleFrequencyFile = CaucasianAlleleFrequencyFile; - } - out.printf("INFO Reading HLA allele frequencies ... "); - FrequencyFileReader HLAfreqReader = new FrequencyFileReader(); - HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile); - AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies(); - out.printf("Done! 
Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size()); - - //FindPolymorphicSites(minstartpos,maxstoppos); - + //Load list of polymorphic sites PolymorphicSitesFileReader siteFileReader = new PolymorphicSitesFileReader(); siteFileReader.ReadFile(PolymorphicSitesFile); PolymorphicSites = siteFileReader.GetPolymorphicSites(); NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites(); numpolymorphicsites = PolymorphicSites.length; numnonpolymorphicsites = NonPolymorphicSites.length; + + if (!intervalFile.equals("")){ + TextFileReader fileReader = new TextFileReader(); + fileReader.ReadFile(intervalFile); + String[] lines = fileReader.GetLines(); + intervals = new int[lines.length][2]; + for (int i = 0; i < lines.length; i++) { + String[] s = lines[i].split(":"); + String[] intervalPieces = s[1].split("-"); + intervals[i][0] = Integer.valueOf(intervalPieces[0]); + intervals[i][1] = Integer.valueOf(intervalPieces[1]); + } + numIntervals = intervals.length; + } out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length); out.printf("INFO Comparing reads to database ...\n"); @@ -167,7 +170,7 @@ public class FindClosestAlleleWalker extends ReadWalker { //Polymorphic sites: always increment denominator, increment numerator when bases are concordant for (int j = 0; j < numpolymorphicsites; j++){ pos = PolymorphicSites[j]; - if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){ + if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop && IsWithinInterval(pos)){ c1 = s1.charAt(pos-readstart); c2 = s2.charAt(pos-allelestart); if (c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors) @@ -176,7 +179,7 @@ public class FindClosestAlleleWalker extends ReadWalker { nummatched[i]++; }else{ if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){ - 
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2); + out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, pos,c1,c2); } } } @@ -187,13 +190,13 @@ public class FindClosestAlleleWalker extends ReadWalker { if (numcompared[i] > 0){ for (int j = 0; j < numnonpolymorphicsites; j++){ pos = NonPolymorphicSites[j]; - if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){ + if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop && IsWithinInterval(pos)){ c1 = s1.charAt(pos-readstart); c2 = s2.charAt(pos-allelestart); if (c1 != c2 && c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors) numcompared[i]++; if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){ - out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2); + out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2); } } } @@ -205,7 +208,7 @@ public class FindClosestAlleleWalker extends ReadWalker { concordance[i]=nummatched[i]/numcompared[i]; if (concordance[i] > maxConcordance){maxConcordance = concordance[i];} if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){ - out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames[i],concordance[i],numcompared[i],numcompared[i]-nummatched[i]); + out.printf("DEBUG\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames[i],concordance[i],numcompared[i],numcompared[i]-nummatched[i]); } if (findFirst && (concordance[i] == 1)){ break; @@ -228,32 +231,42 @@ public class FindClosestAlleleWalker extends ReadWalker { return maxFreq; } + private boolean IsWithinInterval(int pos){ + boolean isWithinInterval = false; + for (int i = 0; i < numIntervals; i++){ + if (pos >= intervals[i][0] && pos <= intervals[i][1]){ + isWithinInterval = true; + break; + } + } + return isWithinInterval; + } + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker 
metaDataTracker) { //Calculate concordance for this read and all overlapping reads if (read.getMappingQuality() > 0){ double maxConcordance = CalculateConcordance(read); - + String stats = "", topAlleles = ""; if (maxConcordance > 0){ String readname = read.getReadName(), allelename = ""; double freq; //For input bam files that contain HLA alleles, find and print allele frequency - //freq = GetAlleleFrequency(readname); out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd()); - //Find the maximum frequency of the alleles most concordant with the read - //double maxFreq = FindMaxAlleleFrequency(maxConcordance); - //Print concordance statistics between this read and the most similar HLA allele(s) for (int i = 0; i < HLAreads.length; i++){ if (concordance[i] == maxConcordance){ - freq = GetAlleleFrequency(HLAnames[i]); - //if (freq == maxFreq){ - out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]); - //} - break; + //freq = GetAlleleFrequency(HLAnames[i]); + if (topAlleles.equals("")){ + topAlleles = HLAnames[i]; + }else{ + topAlleles = topAlleles + "," + HLAnames[i]; + } + stats = String.format("%.1f\t%.3f\t%.0f\t%.0f",1.0,concordance[i],numcompared[i],numcompared[i]-nummatched[i]); + } } - out.print("\n"); + out.printf("\t%s\t%s\n",stats,topAlleles); } } return 1; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindPolymorphicSitesWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindPolymorphicSitesWalker.java index 123b6b923..668e62a9e 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindPolymorphicSitesWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindPolymorphicSitesWalker.java @@ -29,18 +29,12 @@ public class FindPolymorphicSitesWalker extends ReadWalker { @Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only 
consider alleles with frequency > 0.0001", required = false) public boolean ONLYFREQUENT = false; - //String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam"; - String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam"; - - SAMFileReader HLADictionaryReader = new SAMFileReader(); + String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_FREQUENCIES.txt"; + String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles"; - //String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq"; - String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq"; - String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq"; - String AlleleFrequencyFile; - String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles"; - - String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites"; + String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_POLYMORPHIC_SITES.txt"; + String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_DICTIONARY.txt"; + HLAFileReader HLADictionaryReader = new HLAFileReader(); boolean DatabaseLoaded = false; boolean DEBUG = false; @@ -69,8 +63,8 @@ public class FindPolymorphicSitesWalker extends ReadWalker { out.printf("INFO Loading HLA dictionary ... 
"); HLADictionaryReader.ReadFile(HLAdatabaseFile); - HLAreads = HLADictionaryReader.GetReads(); - HLAnames = HLADictionaryReader.GetReadNames(); + HLAreads = HLADictionaryReader.GetSequences(); + HLAnames = HLADictionaryReader.GetNames(); HLAstartpos = HLADictionaryReader.GetStartPositions(); HLAstoppos = HLADictionaryReader.GetStopPositions(); minstartpos = HLADictionaryReader.GetMinStartPos(); @@ -82,18 +76,6 @@ public class FindPolymorphicSitesWalker extends ReadWalker { concordance = new double[HLAreads.length]; numcompared = new double[HLAreads.length]; - //Read allele frequencies - if (ethnicity.equals("Black")){ - AlleleFrequencyFile = BlackAlleleFrequencyFile; - }else{ - AlleleFrequencyFile = CaucasianAlleleFrequencyFile; - } - out.printf("INFO Reading HLA allele frequencies ... "); - FrequencyFileReader HLAfreqReader = new FrequencyFileReader(); - HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile); - AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies(); - out.printf("Done! 
Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size()); - FindPolymorphicSites(minstartpos,maxstoppos); out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length); @@ -108,13 +90,15 @@ public class FindPolymorphicSitesWalker extends ReadWalker { private void FindPolymorphicSites(int start, int stop){ boolean initialized, polymorphic, examined; - char c = ' '; + char c = ' ', ch = ' '; + int A = 0, C = 0, G = 0, T = 0; ArrayList polymorphicsites = new ArrayList(); ArrayList nonpolymorphicsites = new ArrayList(); //Find polymorphic sites in dictionary for (int pos = start; pos <= stop; pos++){ initialized = false; polymorphic = false; examined = false; //look across all alleles at specific position to see if it is polymorphic + A = 0; C = 0; G = 0; T = 0; for (int i = 0; i < HLAreads.length; i++){ if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){ if (!initialized){ @@ -122,19 +106,28 @@ public class FindPolymorphicSitesWalker extends ReadWalker { initialized = true; examined = true; } - if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){ - polymorphicsites.add(pos); - out.printf("POLYMORPHIC\t6\t%s\n", pos); + ch = HLAreads[i].charAt(pos-HLAstartpos[i]); + if (ch == 'A'){A++;} + else if (ch == 'C'){C++;} + else if (ch == 'T'){T++;} + else if (ch == 'G'){G++;} + + if (ch != c){ + // polymorphicsites.add(pos); + // out.printf("POLYMORPHIC\t6\t%s\n", pos); polymorphic = true; - break; + // break; } } + } + if (polymorphic){ + out.printf("%s\t%s\t%s\t%s\t%s\n",pos,A,C,G,T); + } + //if (!polymorphic && examined){ + // nonpolymorphicsites.add(pos); + // out.printf("CONSERVED\t6\t%s\n", pos); + //} - } - if (!polymorphic && examined){ - nonpolymorphicsites.add(pos); - out.printf("CONSERVED\t6\t%s\n", pos); - } } PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]); NonPolymorphicSites = nonpolymorphicsites.toArray(new 
Integer[nonpolymorphicsites.size()]); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FrequencyFileReader.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FrequencyFileReader.java index 7f18c94ed..ced40a410 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FrequencyFileReader.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FrequencyFileReader.java @@ -7,41 +7,68 @@ import java.util.Hashtable; * @author shermanjia */ public class FrequencyFileReader { - Hashtable AlleleFrequencies = new Hashtable(); - Hashtable UniqueAlleles = new Hashtable(); + Hashtable MaxFrequencies = new Hashtable(); + Hashtable CommonAlleles = new Hashtable(); + Hashtable [] AlleleFrequencies = null; + String [] Populations = null; - public Hashtable GetAlleleFrequencies(){ + public Hashtable [] GetAlleleFrequencies(){ + //return allele frequencies for all populations return AlleleFrequencies; } - public Hashtable GetUniqueAlleles(){ - return UniqueAlleles; + public Hashtable GetCommonAlleles(){ + //return list of common alleles + return CommonAlleles; } - public void ReadFile(String filename, String uniqueAllelesFile){ + + public Hashtable GetMaxFrequencies(){ + //return list of common alleles + return MaxFrequencies; + } + + public String[] GetPopulations(){ + //Return name of populations + return Populations; + } + + public void ReadFile(String filename, String ethnicity){ try{ + int linenum = 0; FileInputStream fstream = new FileInputStream(filename); DataInputStream in = new DataInputStream(fstream); BufferedReader br = new BufferedReader(new InputStreamReader(in)); String strLine; String [] s = null; //Read File Line By Line while ((strLine = br.readLine()) != null) { + linenum++; s = strLine.split("\\t"); - AlleleFrequencies.put(s[0], s[1]); - //System.out.printf("Loaded: %s\t%s\n",s[0],AlleleFrequencies.get(s[0]).toString()); - } - in.close(); - - fstream = new 
FileInputStream(uniqueAllelesFile); - in = new DataInputStream(fstream); - br = new BufferedReader(new InputStreamReader(in)); - //Read File Line By Line - while ((strLine = br.readLine()) != null) { - UniqueAlleles.put(strLine,strLine); - //System.out.printf("Loaded: %s\t%s\n",s[0],AlleleFrequencies.get(s[0]).toString()); + if (linenum == 1){ + //Determine number of populations, create a hash table for each population + AlleleFrequencies = new Hashtable[s.length-1]; + Populations = new String[s.length-1]; + for (int i = 1; i < s.length; i++){ + Populations[i-1]=s[i]; + AlleleFrequencies[i-1] = new Hashtable(); + } + }else{ + //assign allele frequencies for each population + for (int i = 1; i < s.length; i++){ + if (Double.valueOf(s[i]) > 0.0001){ + CommonAlleles.put(s[0], s[0]); + } + AlleleFrequencies[i-1].put(s[0],s[i]); + if (!MaxFrequencies.containsKey(s[0])){ + MaxFrequencies.put(s[0], s[i]); + }else if (Double.valueOf(MaxFrequencies.get(s[0]).toString()) < Double.valueOf(s[i])){ + MaxFrequencies.put(s[0], s[i]); + } + } + } } in.close(); }catch (Exception e){//Catch exception if any - System.err.println("FrequencyFileReader Error: " + e.getMessage()); + System.err.println("Exception in FrequencyFileReader (" + e.getMessage() + ")."); } } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLACallerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLACallerWalker.java new file mode 100644 index 000000000..f4917afb3 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLACallerWalker.java @@ -0,0 +1,737 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, 
and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.commandline.Argument; + +import java.util.*; +import java.util.Map.Entry; + +/** + * Calculates the likelihood of observing data given phase info from pairs of HLA alleles. Note: Run FindClosestAlleleWalker first! Usage: java -jar $GATK -T CalculatePhaseLikelihoods -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -phaseInterval /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl IMPUT.baselikelihoods [-filter $ID.filter -minAllowe\ +dMismatches 7] -ethnicity Caucasian | grep -v "INFO" | grep -v "DEBUG" | grep -v "DONE!" 
/**
 * Calls HLA genotypes by combining per-site base likelihoods, read-phase
 * correlation counts, and population allele frequencies over candidate
 * allele pairs from an HLA dictionary.
 *
 * NOTE(review): this chunk was recovered from a text-mangled diff; generic
 * type parameters (e.g. ReadWalker&lt;Integer,Integer&gt;, typed collections)
 * appear to have been stripped by extraction. Raw types below are preserved
 * as-is except where the syntax was unambiguously broken — confirm against VCS.
 *
 * @author shermanjia
 */
@Requires({DataSource.READS, DataSource.REFERENCE})
public class HLACallerWalker extends ReadWalker {
    @Argument(fullName = "baseLikelihoods", shortName = "bl", doc = "Base likelihoods file", required = true)
    public String baseLikelihoodsFile = "";

    @Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false)
    public boolean DEBUG = false;

    @Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
    public String filterFile = "";

    @Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
    public String ethnicity = "CaucasiansUSA";

    @Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false)
    public String debugAlleles = "";

    @Argument(fullName = "phaseInterval", shortName = "phaseInterval", doc = "Use only these intervals in phase calculation", required = false)
    public String phaseIntervalFile = "";

    @Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
    public boolean ONLYFREQUENT = false;

    @Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
    public int MINALLOWEDMISMATCHES = 7;

    GATKArgumentCollection args = this.getToolkit().getArguments();

    // Hard-coded resource locations; only HLAdatabaseFile and AlleleFrequencyFile
    // are actually read below (PolymorphicSitesfile is unused in this chunk).
    String AlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_FREQUENCIES.txt";
    String PolymorphicSitesfile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_POLYMORPHIC_SITES.txt";
    String HLAdatabaseFile = "/humgen/gsa-scr1/GSA/sjia/HLA_CALLER/HLA_DICTIONARY.txt";

    // Initializing variables

    HLAFileReader HLADictionaryReader = new HLAFileReader();
    boolean HLAdataLoaded = false;                      // guard so reduceInit loads data only once
    String[] HLAnames, HLAreads, Populations;           // dictionary allele names/sequences; population labels
    ArrayList ReadsToDiscard;                           // read names pre-flagged as misaligned
    Integer[] HLAstartpos, HLAstoppos, PolymorphicSites;


    // numObservations is indexed [site*5 + base][site*5 + base] (5 slots per site);
    // totalObservations is per site-pair. Both feed the phasing likelihood.
    int[][] numObservations, totalObservations, intervals;
    int[] SNPnumInRead, SNPposInRead, positions;
    CigarParser cigarparser = new CigarParser();
    Hashtable MaxLikelihoods = new Hashtable();         // per-locus max combined likelihood (for normalization)
    Hashtable MaxFrequencies, CommonAlleles, AlleleCount, LocusCount;
    Hashtable[] AlleleFrequencies;                      // one frequency table per population
    int numIntervals;
    double[][] baseLikelihoods;                         // [position][genotype index 0..9] log10 likelihoods

    ArrayList AllelesToSearch = new ArrayList();        // 4-digit allele names admitted to the pair search

    // setting error rates for phasing algorithm (1% expected error rate for any genotype)

    double P_err = 0.01;
    double P_correct = 1 - P_err;
    double L_err = Math.log10(P_err);
    double L_correct = Math.log10(P_correct);

    /**
     * One-time setup: loads the HLA dictionary, the misaligned-read filter /
     * candidate-allele list, base likelihoods (which also define the
     * polymorphic sites), allele frequencies, and optional phase intervals.
     */
    public Integer reduceInit() {

        if (!HLAdataLoaded){
            HLAdataLoaded = true;

            //Load HLA dictionary

            HLADictionaryReader.ReadFile(HLAdatabaseFile);
            HLAreads = HLADictionaryReader.GetSequences();
            HLAnames = HLADictionaryReader.GetNames();
            HLAstartpos = HLADictionaryReader.GetStartPositions();
            HLAstoppos = HLADictionaryReader.GetStopPositions();

            //Load pre-processing file for misaligned reads and list of alleles to search

            SimilarityFileReader similarityReader = new SimilarityFileReader();
            similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES);
            ReadsToDiscard = similarityReader.GetReadsToDiscard();
            AllelesToSearch = similarityReader.GetAllelesToSearch();
            AlleleCount = similarityReader.GetAlleleCount();
            LocusCount = similarityReader.GetLocusCount();
            for (int i = 0; i < AllelesToSearch.size(); i++){
                out.printf("INFO\tAllelesToSearch\t%s\t%s\n",AllelesToSearch.get(i),AlleleCount.get(AllelesToSearch.get(i)));
            }

            //Load genotypes and find polymorphic sites (sites that differ from reference)

            BaseLikelihoodsFileReader baseLikelihoodsReader = new BaseLikelihoodsFileReader();
            baseLikelihoodsReader.ReadFile(baseLikelihoodsFile, true);
            baseLikelihoods = baseLikelihoodsReader.GetBaseLikelihoods();
            positions = baseLikelihoodsReader.GetPositions();
            PolymorphicSites = baseLikelihoodsReader.GetPolymorphicSites();
            out.printf("INFO\t%s polymorphic sites found\n",PolymorphicSites.length);

            // Phasing tables sized by polymorphic-site count: 5 base slots per site.
            int l = PolymorphicSites.length;
            SNPnumInRead = new int[l];
            SNPposInRead = new int[l];
            numObservations = new int[l*5][l*5];
            totalObservations = new int[l][l];

            //Load allele frequencies for different populations

            FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
            HLAfreqReader.ReadFile(AlleleFrequencyFile,ethnicity);
            AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
            MaxFrequencies = HLAfreqReader.GetMaxFrequencies();
            CommonAlleles = HLAfreqReader.GetCommonAlleles();
            Populations = HLAfreqReader.GetPopulations();

            //Load genomic intervals for bam file

            if (!phaseIntervalFile.equals("")){
                TextFileReader fileReader = new TextFileReader();
                fileReader.ReadFile(phaseIntervalFile);
                String[] lines = fileReader.GetLines();
                intervals = new int[lines.length][2];
                for (int i = 0; i < lines.length; i++) {
                    // Expected line format: "chr:start-stop".
                    String[] s = lines[i].split(":");
                    String[] intervalPieces = s[1].split("-");
                    intervals[i][0] = Integer.valueOf(intervalPieces[0]);
                    intervals[i][1] = Integer.valueOf(intervalPieces[1]);
                }
                numIntervals = intervals.length;
            }


        }
        return 0;
    }

    /**
     * Per-read traversal: accumulates SNP co-occurrence (phasing) counts for
     * every read not on the discard list. Returns 1 so reduce() counts reads.
     */
    public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
        if (!ReadsToDiscard.contains(read.getReadName())){
            UpdateCorrelation(read);
        }else{
            // discarded reads contribute nothing (placeholder branch kept intentionally)
        }
        return 1;
    }

    /** Sums per-read counts from map(). */
    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    /**
     * Main scoring pass, run after traversal: scores every admissible allele
     * pair (genotype likelihood + phase likelihood + log frequencies),
     * normalizes per locus, aggregates to 4-digit resolution, and prints a
     * sorted report (lines for prob > 0.1 plus INFO lines for everything).
     */
    public void onTraversalDone(Integer numreads) {
        String name1, name2, d4_name1, d4_name2, d2_name1, d2_name2;
        Double frq1 = 0.0, frq2 = 0.0, log1 = 0.0, log2 = 0.0,alleleLikelihood= 0.0, phaseLikelihood=0.0, minfrq = 0.0, likelihood = 0.0;
        int numCombinations = 0;

        //For debugging specific alleles
        if (!debugAlleles.equals("")){
            String s[] = debugAlleles.split(",");
            int index1 = HLADictionaryReader.GetIndex(s[0]);
            int index2 = HLADictionaryReader.GetIndex(s[1]);
            out.printf("INFO: debugging %s\t%s\t%s\t%s\n",s[0],s[1],index1,index2);
            if (index1 > -1 && index2 > -1){
                alleleLikelihood = CalculateAlleleLikelihood(index1,index2,HLAreads,true);
                phaseLikelihood = CalculatePhaseLikelihood(index1,index2,true,false);
            }
        }

        // NOTE(review): minfrq is set here but never read afterwards — the
        // ONLYFREQUENT flag currently has no effect in this method. Confirm intent.
        if (ONLYFREQUENT){
            minfrq = 0.0001;
        }

        double max;
        ArrayList Output = new ArrayList();
        ArrayList Likelihoods = new ArrayList();
        Hashtable TotalProb = new Hashtable();
        //Search pairs of alleles that satisfy initial search criteria

        // Allele 1

        for (int i = 0; i < HLAnames.length; i++){
            // HLAnames entries carry a 4-char prefix (e.g. "HLA_") that is stripped here.
            name1 = HLAnames[i].substring(4);
            String [] n1 = name1.split("\\*");
            d4_name1 = n1[0] + "*" + n1[1].substring(0, 4);
            d2_name1 = n1[0] + "*" + n1[1].substring(0, 2);
            if (AllelesToSearch.contains(d4_name1)){// || CommonAlleles.contains(d4_name1)

                if (MaxFrequencies.containsKey(d4_name1)){
                    frq1 = Double.parseDouble(MaxFrequencies.get(d4_name1).toString());
                }else{
                    // Unseen alleles get a small pseudo-frequency; null ("N") alleles smaller still.
                    if (n1[1].length() > 4){if (n1[1].substring(4, 5).equals("N")){frq1 = .00000005;}else{frq1 = .000001;}}else{frq1 = .000001;}
                }

                // Allele 2

                for (int j = i; j < HLAnames.length; j++){
                    name2 = HLAnames[j].substring(4);
                    String [] n2 = name2.split("\\*");
                    d4_name2 = n2[0] + "*" + n2[1].substring(0, 4);
                    d2_name2 = n2[0] + "*" + n2[1].substring(0, 2);
                    // Only pair alleles at the same locus (n1[0] == n2[0]).
                    if (n1[0].equals(n2[0]) && (AllelesToSearch.contains(d4_name2))){// || CommonAlleles.contains(d4_name2)
                        if (MaxFrequencies.containsKey(d4_name2)){
                            frq2 = Double.parseDouble(MaxFrequencies.get(d4_name2).toString());
                        }else{
                            if (n2[1].length() > 4){if (n2[1].substring(4, 5).equals("N")){frq2 = .00000005;}else{frq2 = .000001;}}else{frq2 = .000001;}
                        }
                        //Calculate allele and phase likelihoods for each allele pair
                        alleleLikelihood = CalculateAlleleLikelihood(i,j,HLAreads,false);
                        numCombinations++;

                        //If there is data at the allele pair, continue with other calculations

                        if (alleleLikelihood < 0){
                            phaseLikelihood = CalculatePhaseLikelihood(i,j,false,false);
                            log1=Math.log10(frq1);
                            log2=Math.log10(frq2);

                            //sum likelihoods

                            likelihood = alleleLikelihood+phaseLikelihood+log1+log2;
                            if (!MaxLikelihoods.containsKey(n1[0])){MaxLikelihoods.put(n1[0], likelihood);}

                            if (likelihood > (Double) MaxLikelihoods.get(n1[0])) {
                                MaxLikelihoods.put(n1[0], likelihood);
                            }
                            Likelihoods.add(likelihood);
                            String data = String.format("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f",n1[0],name1,name2,alleleLikelihood,phaseLikelihood,log1,log2,likelihood);
                            Output.add(data);
                            out.printf("INFO\t%s\n",data);
                            if (DEBUG){

                            }
                        }
                    }
                }
            }
        }
        //Print output
        out.printf("Locus\tA1\tA2\tGeno\tPhase\tFrq1\tFrq2\tL\tProb\tReads1\tReads2\tLocus\tEXP");
        for (int i = 0; i < Populations.length; i++){
            out.printf("\t%s",Populations[i]);
        }
        out.printf("\n");

        //Calculate probabilities for each locus
        Double probSum = 0.0, prob = 0.0, f1 = 0.0, f2 = 0.0, aLikelihood4 = 0.0, pLikelihood4 = 0.0;
        Integer count = 0;
        Hashtable HLA4DigitProbs = new Hashtable();
        Hashtable HLA4DigitLs = new Hashtable();
        Hashtable HLA4DigitCount = new Hashtable();
        Hashtable HLA4DigitF1 = new Hashtable();
        Hashtable HLA4DigitF2 = new Hashtable();
        Hashtable HLA4DigitA = new Hashtable();
        Hashtable HLA4DigitP = new Hashtable();

        String key;
        Enumeration keys = LocusCount.keys();
        while (keys.hasMoreElements()){
            String locus = keys.nextElement().toString();

            probSum = 0.0;
            ArrayList localOutput = new ArrayList();
            ArrayList localLikelihoods = new ArrayList();

            //Sum probabilities for each locus

            for (int j = 0; j < Output.size(); j++){
                String data = Output.get(j).toString();
                String [] d = data.split("\\t");
                if (d[0].equals(locus)){
                    localOutput.add(data);
                    // Normalize by the locus maximum so Math.pow(10, .) stays in range.
                    likelihood = (Double)Likelihoods.get(j)-(Double)MaxLikelihoods.get(locus);
                    localLikelihoods.add(likelihood);
                    probSum = probSum + Math.pow(10, likelihood);
                    //out.printf("INFO\t%s\t%s\t%.2f\t%.2f\t%.2f\t%s\n",locus,data,likelihood,(Double)MaxLikelihoods.get(locus),(Double)Likelihoods.get(j),probSum);
                }
            }

            //aggregate statistics for 4-digit types

            String A1 = "", A2 = "", a1 = "", a2 = "";
            String [] s, s1, s2;
            Double prob4digit = 0.0;
            int n = 0;

            for (int j = 0; j < localOutput.size(); j++){
                String data = localOutput.get(j).toString();
                prob = Math.pow(10, (Double)localLikelihoods.get(j))/probSum;

                // Only aggregate pairs carrying non-negligible posterior mass.
                if (prob > 0.005){
                    s = data.split("\\t");
                    s1 = s[1].split("\\*");
                    s2 = s[2].split("\\*");
                    a1 = s1[0] + "*" + s1[1].substring(0,4);
                    a2 = s2[0] + "*" + s2[1].substring(0,4);
                    key = a1 + "," + a2;
                    aLikelihood4 = Double.valueOf(s[3]);
                    pLikelihood4 = Double.valueOf(s[4]);
                    // NOTE(review): f1/f2 are read here BEFORE being reassigned from
                    // s[5]/s[6] two lines below, so this sum uses the previous
                    // iteration's (or initial 0.0) values. Looks like the two
                    // statements are ordered wrongly — confirm against VCS history.
                    likelihood = aLikelihood4 + pLikelihood4 + f1 + f2;
                    f1 = Double.valueOf(s[5]);
                    f2 = Double.valueOf(s[6]);
                    if (!HLA4DigitProbs.containsKey(key)){
                        HLA4DigitProbs.put(key, prob);
                        HLA4DigitLs.put(key, likelihood);
                        HLA4DigitCount.put(key, 1);
                        HLA4DigitF1.put(key,f1);
                        HLA4DigitF2.put(key,f2);
                        HLA4DigitA.put(key,aLikelihood4);
                        HLA4DigitP.put(key,pLikelihood4);
                    }else{
                        // Same 4-digit pair seen again: sum probabilities and likelihood terms.
                        prob = prob + Double.valueOf(HLA4DigitProbs.get(key).toString());
                        HLA4DigitProbs.put(key, prob);
                        likelihood = likelihood + Double.valueOf(HLA4DigitLs.get(key).toString());
                        HLA4DigitLs.put(key, likelihood);
                        n = Integer.valueOf(HLA4DigitCount.get(key).toString()) + 1;
                        HLA4DigitCount.put(key, n);
                        aLikelihood4 = aLikelihood4 + Double.valueOf(HLA4DigitA.get(key).toString());
                        HLA4DigitA.put(key, aLikelihood4);
                        pLikelihood4 = pLikelihood4 + Double.valueOf(HLA4DigitP.get(key).toString());
                        HLA4DigitP.put(key, pLikelihood4);
                    }

                }
            }
        }

        //Print results
        Enumeration P = HLA4DigitProbs.keys();   // NOTE(review): unused; sorted Vector below is used instead
        String K = ""; String [] s, s1, s2;
        double count1, count2, locusCount, accountedFor;

        // Sort hashtable.
        Vector v = new Vector(HLA4DigitProbs.keySet());
        Collections.sort(v);

        // Display (sorted) hashtable.
        for (Enumeration e = v.elements(); e.hasMoreElements();) {
            K = (String)e.nextElement();
            prob = (Double) HLA4DigitProbs.get(K);

            likelihood = (Double) HLA4DigitLs.get(K);
            count = (Integer) HLA4DigitCount.get(K);
            s = K.split("\\,");
            s1 = s[0].split("\\*"); name1 = s1[1];
            s2 = s[1].split("\\*"); name2 = s2[1];
            aLikelihood4 = (Double) HLA4DigitA.get(K);
            pLikelihood4 = (Double) HLA4DigitP.get(K);
            f1 = (Double) HLA4DigitF1.get(K);
            f2 = (Double) HLA4DigitF2.get(K);
            count1 = Double.valueOf(AlleleCount.get(s[0]).toString());
            count2 = Double.valueOf(AlleleCount.get(s[1]).toString());
            locusCount = Double.valueOf(LocusCount.get(s1[0]).toString());
            // Fraction of the locus's reads explained by this pair (homozygous counts once).
            if (s[0].equals(s[1])){
                accountedFor = count1 / locusCount;
            }else{
                accountedFor = (count1 + count2) / locusCount;
            }
            // High-confidence calls (prob > 0.1) get a result line; every pair also
            // gets an INFO line below with the same fields plus per-population logs.
            if (prob > 0.1){
                out.printf("%s\t%s\t%s\t%.1f\t%.1f\t%.2f\t%.2f\t%.1f\t%.2f\t%.0f\t%.0f\t%.0f\t%.2f",s1[0],name1,name2,aLikelihood4/count,pLikelihood4/count,f1,f2,likelihood/count,prob,count1,count2,locusCount,accountedFor);
                for (int i = 0; i < Populations.length; i++){
                    if (AlleleFrequencies[i].containsKey(s[0])){f1 = Double.valueOf(AlleleFrequencies[i].get(s[0]).toString());}else{f1=.000001;}
                    if (AlleleFrequencies[i].containsKey(s[1])){f2 = Double.valueOf(AlleleFrequencies[i].get(s[1]).toString());}else{f2=.000001;}
                    if (!Double.isInfinite(-1*Math.log10(f1*f2))){out.printf("\t%.2f",Math.log10(f1*f2));}else{out.printf("\t-INF");}
                }
                out.print("\n");
            }
            out.printf("INFO\t%s\t%s\t%s\t%.1f\t%.1f\t%.2f\t%.2f\t%.1f\t%.2f\t%.0f\t%.0f\t%.0f\t%.2f",s1[0],name1,name2,aLikelihood4/count,pLikelihood4/count,f1,f2,likelihood/count,prob,count1,count2,locusCount,accountedFor);
            for (int i = 0; i < Populations.length; i++){
                if (AlleleFrequencies[i].containsKey(s[0])){f1 = Double.valueOf(AlleleFrequencies[i].get(s[0]).toString());}else{f1=.000001;}
                if (AlleleFrequencies[i].containsKey(s[1])){f2 = Double.valueOf(AlleleFrequencies[i].get(s[1]).toString());}else{f2=.000001;}
                if (!Double.isInfinite(-1*Math.log10(f1*f2))){out.printf("\t%.2f",Math.log10(f1*f2));}else{out.printf("\t-INF");}
            }
            out.print("\n");
        }
    }

    // Ascending Double comparator (used with the sorted candidate lists).
    // NOTE(review): generics reconstructed from stripped source — confirm.
    Comparator<Double> valueComparator = new Comparator<Double>() {
        @Override public int compare(Double val1, Double val2) {
            return val1.compareTo(val2);
        }
    };

    /**
     * Builds the full list of positions spanning the HLA-A, HLA-C and HLA-B
     * gene regions (b36 coordinates, in A, C, B order along chromosome 6).
     */
    private Integer[] InitializePolymorphicSites(){
        int HLA_A_start = 30018310, HLA_A_end = 30021211, num_A_positions = HLA_A_end - HLA_A_start + 1;
        int HLA_B_start = 31430239, HLA_B_end = 31432914, num_B_positions = HLA_B_end - HLA_B_start + 1;
        int HLA_C_start = 31344925, HLA_C_end = 31347827, num_C_positions = HLA_C_end - HLA_C_start + 1;
        Integer[] polymorphicSites = new Integer[num_A_positions+num_B_positions+num_C_positions];
        for (int i = 0; i < num_A_positions; i++){
            polymorphicSites[i]=HLA_A_start + i;
        }
        for (int i = 0; i < num_C_positions; i++){
            polymorphicSites[i+num_A_positions]=HLA_C_start + i;
        }
        for (int i = 0; i < num_B_positions; i++){
            polymorphicSites[i+num_A_positions+num_C_positions]=HLA_B_start + i;
        }
        return polymorphicSites;
    }

    /** Maps a base to its slot index (A=0,C=1,G=2,T=3); -1 for anything else. */
    private int IndexOf(char c){
        switch(c){
            case 'A': return 0;
            case 'C': return 1;
            case 'G': return 2;
            case 'T': return 3;
            //case 'D': return 4;
            default: return -1;
        }
    }


    /** True iff pos falls inside any of the loaded phase intervals (inclusive). */
    private boolean IsWithinInterval(int pos){
        boolean isWithinInterval = false;
        for (int i = 0; i < numIntervals; i++){
            if (pos >= intervals[i][0] && pos <= intervals[i][1]){
                isWithinInterval = true;
                break;
            }
        }
        return isWithinInterval;
    }

    /**
     * Records, for one read, every pair of polymorphic sites it covers and the
     * bases observed there, incrementing numObservations/totalObservations.
     * Side effect: overwrites the shared SNPnumInRead/SNPposInRead scratch arrays.
     */
    private void UpdateCorrelation(SAMRecord read){
        //Updates correlation table with SNPs from specific read (for phasing)
        String s = cigarparser.FormatRead(read.getCigarString(), read.getReadString());
        ArrayList SNPsInRead = new ArrayList();   // NOTE(review): unused
        ArrayList readindex = new ArrayList();    // NOTE(review): unused

        int readstart = read.getAlignmentStart();
        int readend = read.getAlignmentEnd();
        int numPositions = PolymorphicSites.length;
        char c1, c2;
        int a, b, i, j, SNPcount = 0;

        //Find all SNPs in read
        for (i = 0; i < numPositions; i++){
            if (PolymorphicSites[i] > readstart && PolymorphicSites[i] < readend){
                SNPnumInRead[i] = SNPcount;
                SNPposInRead[i] = PolymorphicSites[i]-readstart;
                SNPcount++;
            }else{
                SNPnumInRead[i] = -1;
                SNPposInRead[i] = -1;
            }
        }

        //Update correlation table; for each combination of SNP positions
        for (i = 0; i < numPositions; i++){
            if (SNPnumInRead[i] > -1){
                c1 = s.charAt(SNPposInRead[i]);
                if (IndexOf(c1) > -1){
                    for (j = i+1; j < numPositions; j ++){
                        if (SNPnumInRead[j] > -1){
                            c2 = s.charAt(SNPposInRead[j]);
                            if (IndexOf(c2) > -1){
                                // 5 base slots per site; a/b address the (site,base) cell.
                                a = i*5 + IndexOf(c1);
                                b = j*5 + IndexOf(c2);

                                numObservations[a][b]++;
                                totalObservations[i][j]++;
                                if (DEBUG){
                                    //out.printf("INFO %s\t%s %s\t[i=%s,j=%s]\t[%s,%s]\t[%s,%s]\n",read.getReadName(),PolymorphicSites[i],PolymorphicSites[j],i,j,c1,c2,a,b);
                                }
                            }
                        }
                    }

                }
            }
        }
    }

    /**
     * Maps an unordered base pair to a diploid genotype index 0..9
     * (AA,AC,AG,AT,CC,CG,CT,GG,GT,TT); -1 for non-ACGT input.
     * NOTE(review): outer switch cases have no break after the inner switch,
     * so an unmatched second base falls through case labels — it still ends at
     * the default returning -1, but the fall-through looks unintentional.
     */
private int GenotypeIndex(char a, char b){
        switch(a){
            case 'A':
                switch(b){
                    case 'A': return 0;
                    case 'C': return 1;
                    case 'G': return 2;
                    case 'T': return 3;
                };
            case 'C':
                switch(b){
                    case 'A': return 1;
                    case 'C': return 4;
                    case 'G': return 5;
                    case 'T': return 6;
                };
            case 'G':
                switch(b){
                    case 'A': return 2;
                    case 'C': return 5;
                    case 'G': return 7;
                    case 'T': return 8;
                };
            case 'T':
                switch(b){
                    case 'A': return 3;
                    case 'C': return 6;
                    case 'G': return 8;
                    case 'T': return 9;
                };
            default: return -1;
        }
    }

    /**
     * Sums, over every genotyped position covered by both alleles, the
     * log10 likelihood of the diploid genotype implied by the pair.
     * Returns 0 when the pair shares no covered positions.
     */
    private double CalculateAlleleLikelihood(int a1, int a2, String[] HLAalleles, boolean debug){
        //Calculates likelihood for specific allele pair
        String read1 = HLAalleles[a1];
        String read2 = HLAalleles[a2];
        int start1 = HLAstartpos[a1];
        int start2 = HLAstartpos[a2];
        int stop1 = HLAstoppos[a1];
        int stop2 = HLAstoppos[a2];
        double likelihood = 0;
        int pos, index;
        char c1, c2;


        for (int i = 0; i < positions.length; i++){
            pos = positions[i];
            // Strictly-interior check: both alleles must cover pos (endpoints excluded).
            if (pos < stop1 && pos > start1 && pos < stop2 && pos > start2){
                index = GenotypeIndex(read1.charAt(pos-start1),read2.charAt(pos-start2));
                if (index > -1){
                    likelihood = likelihood + baseLikelihoods[i][index];
                    if (debug){
                        c1 = read1.charAt(pos-start1);
                        c2 = read2.charAt(pos-start2);
                        out.printf("INFO: DEBUG %s\t%s\t%s\t%s\t%s\t%s\t%.2f\n",HLAnames[a1],HLAnames[a2],pos,c1,c2,index,likelihood);
                    }
                }
            }
        }
        return likelihood;
    }

    /**
     * Scores how well the allele pair explains the read-phase counts collected
     * by UpdateCorrelation. With SINGLEALLELE the return is the in-phase
     * fraction (0..1); otherwise a log10 likelihood under the P_err model.
     * Side effect: overwrites the shared SNPnumInRead/SNPposInRead scratch arrays.
     */
    private double CalculatePhaseLikelihood(int alleleIndex1, int alleleIndex2, boolean PRINTDEBUG, boolean SINGLEALLELE){
        //calculate the likelihood that the particular combination of alleles satisfies the phase count data
        double likelihood = 0, prob = 0;
        int readstart1 = HLAstartpos[alleleIndex1]; int readend1 = HLAstoppos[alleleIndex1];
        int readstart2 = HLAstartpos[alleleIndex2]; int readend2 = HLAstoppos[alleleIndex2];
        int combinedstart = Math.max(readstart1,readstart2);
        int combinedstop = Math.min(readend1,readend2);

        int numPositions = PolymorphicSites.length, SNPcount = 0;
        int i, j, a1, a2, b1, b2;
        char c11, c12, c21, c22;
        int numInPhase = 0, numOutOfPhase = 0;
        double sumInPhase = 0.0, sumObservations = 0.0;


        //Find all SNPs in read
        for (i = 0; i < numPositions; i++){

            if (PolymorphicSites[i] > combinedstart && PolymorphicSites[i] < combinedstop ){ // && IsWithinInterval(PolymorphicSites[i])
                if (PRINTDEBUG){
                    out.printf("DEBUG\t%s\t%s\n",PolymorphicSites[i],SNPcount);
                }
                SNPnumInRead[i] = SNPcount;
                SNPposInRead[i] = PolymorphicSites[i]-combinedstart;
                SNPcount++;
            }else{
                SNPnumInRead[i] = -1;
                SNPposInRead[i] = -1;
            }
        }
        String s1 = HLAreads[alleleIndex1];
        String s2 = HLAreads[alleleIndex2];
        if (PRINTDEBUG){
            out.printf("DEBUG %s SNPs found in %s and %s between %s and %s\n",SNPcount,HLAnames[alleleIndex1], HLAnames[alleleIndex2],combinedstart,combinedstop);
        }
        //Iterate through every pairwise combination of SNPs, and update likelihood for the allele combination
        for (i = 0; i < numPositions; i++){
            if (SNPnumInRead[i] > -1){
                c11 = s1.charAt(SNPposInRead[i]);
                c21 = s2.charAt(SNPposInRead[i]);
                if (IndexOf(c11) > -1 && IndexOf(c21) > -1){
                    for (j = i+1; j < numPositions; j ++){
                        if (SNPnumInRead[j] > -1 && totalObservations[i][j] > 0){
                            c12 = s1.charAt(SNPposInRead[j]);
                            c22 = s2.charAt(SNPposInRead[j]);
                            if (IndexOf(c12) > -1 && IndexOf(c22) > -1){
                                a1 = i*5 + IndexOf(c11);
                                b1 = j*5 + IndexOf(c12);
                                a2 = i*5 + IndexOf(c21);
                                b2 = j*5 + IndexOf(c22);
                                //check if the two alleles are identical at the chosen 2 locations
                                if ((c11 == c21) && (c12 == c22)){
                                    numInPhase = numObservations[a1][b1];
                                }else{
                                    numInPhase = numObservations[a1][b1] + numObservations[a2][b2];
                                }
                                numOutOfPhase = totalObservations[i][j] - numInPhase;
                                sumInPhase += (double) numInPhase;
                                sumObservations += (double) totalObservations[i][j];
                                if (SINGLEALLELE){
                                    likelihood = sumInPhase / sumObservations;
                                }else{
                                    likelihood += numInPhase * L_correct + numOutOfPhase * L_err;
                                }
                                //prob = Math.max((double) numInPhase / (double) totalObservations[i][j], 0.0001);
                                //likelihood += Math.log10(prob);
                                //likelihood = Math.max(Math.log10(sumInPhase / sumObservations),-10);

                                if (PRINTDEBUG){
                                    out.printf("DEBUG %s %s %s[%s%s] %s[%s%s]\t[%s,%s]\t[%s,%s] [%s,%s]\t%s / %s\t%s / %s\t %.2f\n",HLAnames[alleleIndex1],HLAnames[alleleIndex2],PolymorphicSites[i],c11,c21,PolymorphicSites[j],c12,c22, i,j,a1,b1,a2,b2,numInPhase,totalObservations[i][j],sumInPhase,sumObservations,likelihood);
                                }
                                // Only the nearest informative partner site is used for each i.
                                break;
                            }
                        }
                    }
                }
            }
        }
        return likelihood;
    }

    /**
     * Pre-screens homozygous allele scores to pick the top 2-digit allele
     * classes per locus (roughly the best of max(5, n/8)) for the deep pair
     * search. NOTE(review): not invoked from this chunk — appears to be kept
     * as an alternative to the SimilarityFileReader-based candidate list.
     */
    private void ExtraCode(){
        String name1, name2;
        //Pre-process homozygous combinations to determine top possible alleles (for efficiency)
        Hashtable Alleles2Digit = new Hashtable();
        Hashtable Phase2Digit = new Hashtable();
        Hashtable Count2Digit = new Hashtable();

        Hashtable AllelesAtLocus = new Hashtable();
        ArrayList Loci = new ArrayList();
        double[] AlleleLikelihoods2 = new double[HLAnames.length];
        double[] PhaseLikelihoods2 = new double[HLAnames.length];
        for (int i = 0; i < HLAnames.length; i++){
            name1 = HLAnames[i].substring(4);
            String [] n1 = name1.split("\\*");
            AlleleLikelihoods2[i] = CalculateAlleleLikelihood(i,i,HLAreads,false);
            PhaseLikelihoods2[i] = CalculatePhaseLikelihood(i,i,false,true);
            if (AlleleLikelihoods2[i] < 0){
                name2 = n1[0] + "*" + n1[1].substring(0, 4);
                if (!Loci.contains(n1[0])){
                    Loci.add(n1[0]);
                    MaxLikelihoods.put(n1[0], 0.0);
                    AllelesAtLocus.put(n1[0], 1);
                }else{
                    AllelesAtLocus.put(n1[0], 1+(Integer)AllelesAtLocus.get(n1[0]));
                }
                if (!Alleles2Digit.containsKey(name2)){
                    Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
                    Phase2Digit.put(name2, PhaseLikelihoods2[i]);
                    Count2Digit.put(name2, 1.0);
                }else {
                    // Keep the best (max) genotype and phase scores per 4-digit class.
                    if (AlleleLikelihoods2[i] > (Double) Alleles2Digit.get(name2)){
                        Alleles2Digit.put(name2, AlleleLikelihoods2[i]);
                    }
                    if (PhaseLikelihoods2[i] > (Double) Phase2Digit.get(name2)){
                        Phase2Digit.put(name2, PhaseLikelihoods2[i]);
                    }
                    Count2Digit.put(name2,1.0+(Double)Count2Digit.get(name2));
                }
            }
        }

        //Sort alleles at 2 digit resolution for each locus

        for (int i = 0; i < Loci.size(); i++){
            Enumeration k = Alleles2Digit.keys();
            Hashtable AllelesAtLoci = new Hashtable();
            HashMap map = new HashMap();
            int numalleles = 0;
            //find alleles at the locus
            while( k.hasMoreElements() ){
                name1 = k.nextElement().toString();
                String [] n1 = name1.split("\\*");
                if (Loci.get(i).equals(n1[0])){
                    numalleles++;
                    // Negate scores so ascending sort yields best-first order.
                    map.put(name1,-1 * (Double) Alleles2Digit.get(name1));
                    AllelesAtLoci.put(-1 * (Double) Alleles2Digit.get(name1), name1);
                    //out.printf("%s\t%.2f\n",name1,-1 * (Double) Alleles2Digit.get(name1));
                }

            }

            //Sort alleles at locus, mark top six 2-digit classes for deep search
            // NOTE(review): generic parameters below reconstructed from stripped source — confirm.
            List<Map.Entry<String, Double>> entries = new ArrayList<Map.Entry<String, Double>>(map.entrySet());
            Collections.sort(entries, new Comparator<Map.Entry<String, Double>>() {
                public int compare(Entry<String, Double> e1, Entry<String, Double> e2) {
                    return e1.getValue().compareTo(e2.getValue());
                }
            });
            int num = 1;
            for (Map.Entry<String, Double> entry : entries) {
                if (num <= Math.max(5,entries.size()/8)){
                    AllelesToSearch.add(entry.getKey());
                    out.printf("INFO\t%s\t%.2f\t%.2f\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey()));
                    num++;
                }else{
                    if (!AllelesToSearch.contains(entry.getKey())){
                        out.printf("INFO\t%s\t%.2f\t%.2f\tNotSearched\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey()));
                    }else{
                        out.printf("INFO\t%s\t%.2f\t%.2f\n",entry.getKey(),entry.getValue(),Phase2Digit.get(entry.getKey()));
                    }
                }
            }

            out.printf("INFO\n");
        }
    }
}
+ */ + +package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; + +import java.io.*; +import java.util.ArrayList; +/** + * + * @author shermanjia + */ +public class HLAFileReader { + ArrayList Sequences = new ArrayList(); + ArrayList Names = new ArrayList(); + ArrayList StartPositions = new ArrayList(); + ArrayList StopPositions = new ArrayList(); + int minstartpos; + int maxstoppos; + + CigarParser formatter = new CigarParser(); + + public String[] GetNames(){ + return Names.toArray(new String[Names.size()]); + } + + public String[] GetSequences(){ + return Sequences.toArray(new String[Sequences.size()]); + } + + public Integer[] GetStartPositions(){ + return StartPositions.toArray(new Integer[StartPositions.size()]); + } + + public Integer[] GetStopPositions(){ + return StopPositions.toArray(new Integer[StopPositions.size()]); + } + + + public Integer GetMinStartPos(){ + return minstartpos; + } + + public Integer GetMaxStopPos(){ + return maxstoppos; + } + + public int GetIndex(String readname){ + if (Names.contains(readname)){ + return Names.indexOf(readname); + }else{ + return -1; + } + } + + public void ReadFile(String filename){ + try{ + FileInputStream fstream = new FileInputStream(filename); + DataInputStream in = new DataInputStream(fstream); + BufferedReader br = new BufferedReader(new InputStreamReader(in)); + String strLine; String [] s = null; + //Read File Line By Line + while ((strLine = br.readLine()) != null) { + s = strLine.split("\\t"); + Sequences.add(s[3]); + Names.add(s[0]); + StartPositions.add(Integer.valueOf(s[1])); + StopPositions.add(Integer.valueOf(s[2])); + minstartpos = Math.min(minstartpos, Integer.valueOf(s[1])); + maxstoppos = Math.max(maxstoppos, Integer.valueOf(s[2])); + } + in.close(); + }catch (Exception e){//Catch exception if any + System.err.println("HLAFileReader Error: " + e.getMessage()); + } + } +} + diff --git 
a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/PolymorphicSitesFileReader.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/PolymorphicSitesFileReader.java index a6d840821..73701f2ea 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/PolymorphicSitesFileReader.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/PolymorphicSitesFileReader.java @@ -24,6 +24,14 @@ public class PolymorphicSitesFileReader { return NonPolymorphicSites.toArray(new Integer[NonPolymorphicSites.size()]); } + public void AddSites(Integer [] sites){ + for (int i = 0; i < sites.length; i++){ + if (!PolymorphicSites.contains(sites[i])){ + PolymorphicSites.add(sites[i]); + } + } + } + public void ReadFile(String filename){ try{ FileInputStream fstream = new FileInputStream(filename); @@ -34,10 +42,10 @@ public class PolymorphicSitesFileReader { int i = 0; while ((strLine = br.readLine()) != null) { s = strLine.split("\\t"); - if (s[0].equals("POLYMORPHIC")){ - PolymorphicSites.add(Integer.valueOf(s[2])); + if (Double.valueOf(s[8]) > 0.1){ + PolymorphicSites.add(Integer.valueOf(s[0])); }else{ - NonPolymorphicSites.add(Integer.valueOf(s[2])); + NonPolymorphicSites.add(Integer.valueOf(s[0])); } } in.close(); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java index 2c69660e3..7356968ce 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java @@ -14,6 +14,9 @@ import java.util.Hashtable; */ public class SimilarityFileReader { ArrayList ReadsToDiscard = new ArrayList(); + ArrayList AllelesToSearch = new ArrayList(); + Hashtable AlleleCount = new Hashtable(); + Hashtable LocusCount = new Hashtable(); 
Hashtable Concordance = new Hashtable(); Hashtable NumMatches = new Hashtable(); Hashtable NumMismatches = new Hashtable(); @@ -22,10 +25,22 @@ public class SimilarityFileReader { return ReadsToDiscard; } + public ArrayList GetAllelesToSearch(){ + return AllelesToSearch; + } + public String[] GetReadsToDiscardArray(){ return ReadsToDiscard.toArray(new String[ReadsToDiscard.size()]); } + public Hashtable GetAlleleCount(){ + return AlleleCount; + } + + public Hashtable GetLocusCount(){ + return LocusCount; + } + public Hashtable GetConcordance(){ return Concordance; } @@ -43,26 +58,56 @@ public class SimilarityFileReader { FileInputStream fstream = new FileInputStream(filename); DataInputStream in = new DataInputStream(fstream); BufferedReader br = new BufferedReader(new InputStreamReader(in)); - String strLine; String [] s = null; + String strLine; String [] s = null, alleles = null, a = null; String allele; //Read File Line By Line int i = 0; while ((strLine = br.readLine()) != null) { s = strLine.split("\\t"); if (s.length >= 6){ - Double matchFraction = Double.valueOf(s[4]); - int numMismatches = Integer.valueOf(s[6]); - + Double matchFraction = Double.valueOf(s[3]); + int numMismatches = Integer.valueOf(s[5]); + int numMatches = Integer.valueOf(s[4]); Concordance.put(s[0],matchFraction); - NumMatches.put(s[0], s[5]); + NumMatches.put(s[0], s[4]); NumMismatches.put(s[0], numMismatches); - if ((matchFraction < 0.9 && numMismatches > 3) || (numMismatches > minAllowedMismatches)){ + if ((matchFraction < 0.8 && numMismatches > 3) || (numMismatches > minAllowedMismatches) || numMatches < 10){ ReadsToDiscard.add(s[0]); + }else{ + Hashtable fourDigitAlleles = new Hashtable(); + alleles = s[6].split("\\,"); + if (alleles.length > 0){ + a = alleles[0].split("\\_"); + s = a[1].split("\\*"); + if (!LocusCount.containsKey(s[0])){ + LocusCount.put(s[0], 1); + }else{ + LocusCount.put(s[0], (Integer) LocusCount.get(s[0]) + 1); + } + } + for (int j = 0; j < alleles.length; j++){ 
+ a = alleles[j].split("\\_"); + s = a[1].split("\\*"); + allele = s[0] + "*" + s[1].substring(0,4); + + if (!fourDigitAlleles.containsKey(allele)){ + fourDigitAlleles.put(allele, allele); + if (!AlleleCount.containsKey(allele)){ + AlleleCount.put(allele, 1); + }else{ + AlleleCount.put(allele, (Integer) AlleleCount.get(allele) + 1); + } + + if ((Integer) AlleleCount.get(allele) > 1 && !AllelesToSearch.contains(allele)){ + AllelesToSearch.add(allele); + } + } + } } } } in.close(); }catch (Exception e){//Catch exception if any - System.err.println("SimilarityFile Error: " + e.getMessage()); + //System.err.println("SimilarityFile Error: " + e.getMessage()); } } }