diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateAlleleLikelihoodsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateAlleleLikelihoodsWalker.java index 2e3df0dc0..2266f0835 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateAlleleLikelihoodsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateAlleleLikelihoodsWalker.java @@ -56,10 +56,10 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker minfrq){ for (int j = i; j < numreads; j++){ name2 = HLAnames[j].substring(4); - if (UniqueAlleles.containsKey(name2)){ + String [] n2 = name2.split("\\*"); +// out.printf("2: %s\n",n2[0] + "*" + n2[1].substring(0, 3)); + if (UniqueAlleles.containsKey(n2[0] + "*" + n2[1].substring(0, 4))){ + // frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString()); // if (frq2 > minfrq){ if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){ @@ -211,8 +221,8 @@ public class CalculateAlleleLikelihoodsWalker extends ReadWalker minfrq){ for (int j = i; j < HLAnames.length; j++){ name2 = HLAnames[j].substring(4); - if (name1.substring(0,1).equals(name2.substring(0,1))){ - if (UniqueAlleles.containsKey(name2)){ - if (AlleleFrequencies.containsKey(name2)){ - frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString()); - }else{ - frq2 = .0001; - } - // if (frq2 > minfrq){ - likelihood = CalculatePhaseLikelihood(i,j,false); - numCombinations++; - out.printf("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n",numCombinations,name1,name2,likelihood,Math.log10(frq1),Math.log10(frq2)); - // } + String [] n2 = name2.split("\\*"); + if (n1[0].equals(n2[0]) && UniqueAlleles.containsKey(n2[0] + "*" + n2[1].substring(0, 4))){ + if (AlleleFrequencies.containsKey(name2)){ + frq2 = Double.parseDouble((String) AlleleFrequencies.get(name2).toString()); + }else{ + frq2 = .0001; } + // if (frq2 > minfrq){ + likelihood = CalculatePhaseLikelihood(i,j,false); + numCombinations++; + out.printf("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n",numCombinations,name1,name2,likelihood,Math.log10(frq1),Math.log10(frq2)); + // } + } } //} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ClusterReadsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ClusterReadsWalker.java new file mode 100644 index 000000000..7dd514b3e --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ClusterReadsWalker.java @@ -0,0 +1,322 @@ +package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.commandline.Argument; + +import java.util.ArrayList; +import java.util.Hashtable; +/** + * Compares reads to longest read at each exon. Usage: java -jar GenomeAnalysisTK.jar -T ClusterReads -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta [-filter INPUT.filter] | grep -v INFO | sort -k1 > OUTPUT + * @author shermanjia + */ +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class ClusterReadsWalker extends ReadWalker { + @Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false) + public String filterFile = ""; + + @Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false) + public int MINALLOWEDMISMATCHES = 5; + + String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles"; + + String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites"; + + boolean DatabaseLoaded = false; + boolean DEBUG = false; + + Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites; + double[] SingleAlleleFrequencies; + ArrayList ReadsToDiscard = new ArrayList(); + ArrayList AlignedReads = new ArrayList(); + + Hashtable MaxNumMatches, MaxConcordance, NumCompared = new Hashtable(); + + double[] nummatched, concordance, numcompared; + + SAMRecord A2, A3, B2, B3, C2, C3; + int MaxMatchesA2 = 0, MaxMatchesA3 = 0, MaxMatchesB2 = 0, MaxMatchesB3 = 0, MaxMatchesC2 = 0, MaxMatchesC3 = 0; + int A2start = 30018513, A2stop = 30018781; + int A3start = 30019024, A3stop = 30019300; + + int C2start = 31347355, C2stop = 31347623; + int C3start = 31346829, C3stop = 31347104; + + int B2start = 31432444, B2stop = 31432714; + int B3start = 31431923, B3stop = 31432198; + + int minstartpos = 0; + int maxstoppos = 0; + + + Hashtable AlleleFrequencies = new Hashtable(); + int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1; + CigarParser formatter = new CigarParser(); + + public Integer reduceInit() { + if (!DatabaseLoaded){ + DatabaseLoaded = true; + + PolymorphicSitesFileReader siteFileReader = new PolymorphicSitesFileReader(); + siteFileReader.ReadFile(PolymorphicSitesFile); + PolymorphicSites = siteFileReader.GetPolymorphicSites(); + NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites(); + + + out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length); + + if (!filterFile.equals("")){ + out.printf("INFO Reading properties file ... "); + SimilarityFileReader similarityReader = new SimilarityFileReader(); + similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES); + ReadsToDiscard = similarityReader.GetReadsToDiscard(); + MaxNumMatches = similarityReader.GetNumMatches(); + MaxConcordance = similarityReader.GetConcordance(); + + out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size()); + for (int i = 0; i < ReadsToDiscard.size(); i++){ + out.printf("MISALIGNED %s\n", ReadsToDiscard.get(i).toString()); + } + } + + out.printf("INFO Comparing reads ...\n"); + + if (DEBUG){ + //out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles); + } + } + return 0; + } + + public Integer map(char[] ref, SAMRecord read, ReadMetaDataTracker tracker) { + //Calculate concordance for this read and all overlapping reads + if (!ReadsToDiscard.contains(read.getReadName())){ + AlignedReads.add(read); + + int readstart = read.getAlignmentStart(); + int readstop = read.getAlignmentEnd(); + int length = readstop - readstart + 1; + if (MaxNumMatches.containsKey(read.getReadName())){ + int maxMatches = Integer.parseInt(MaxNumMatches.get(read.getReadName()).toString()); + double concordance = Double.parseDouble((String) MaxConcordance.get(read.getReadName()).toString()); + + if (readstart < A2stop && readstop > A2start){ + if (maxMatches > MaxMatchesA2 && concordance > 0.95){ + MaxMatchesA2 = maxMatches; + A2 = read; + } + } else if (readstart < A3stop && readstop > A3start){ + if (maxMatches > MaxMatchesA3){ + MaxMatchesA3 = maxMatches; + A3 = read; + } + } else if (readstart < B2stop && readstop > B2start){ + if (maxMatches > MaxMatchesB2){ + MaxMatchesB2 = maxMatches; + B2 = read; + } + } else if (readstart < B3stop && readstop > B3start){ + if (maxMatches > MaxMatchesB3){ + MaxMatchesB3 = maxMatches; + B3 = read; + } + } else if (readstart < C2stop && readstop > C2start){ + if (maxMatches > MaxMatchesC2){ + MaxMatchesC2 = maxMatches; + C2 = read; + } + } else if (readstart < C3stop && readstop > C3start){ + if (maxMatches > MaxMatchesC3){ + MaxMatchesC3 = maxMatches; + C3 = read; + } + } + }else{ + out.printf("Data for %s not found\n",read.getReadName()); + } + } + return 1; + } + + public Integer reduce(Integer value, Integer sum) { + + return value + sum; + } + + public void onTraversalDone(Integer numreads) { + SAMRecord read; String name, name2; String locus = ""; + int A2a = 0, A2b = 0, A2c = 0; + int A3a = 0, A3b = 0, A3c = 0; + int B2a = 0, B2b = 0, B2c = 0; + int B3a = 0, B3b = 0, B3c = 0; + int C2a = 0, C2b = 0, C2c = 0; + int C3a = 0, C3b = 0, C3c = 0; + double minA2 = -1, minA3 = -1, minB2 = -1, minB3 = -1, minC2 = -1, minC3 = -1; + double maxA2 = 0, maxA3 = 0, maxB2 = 0, maxB3 = 0, maxC2 = 0, maxC3 = 0; + double ratioA2 = 0, ratioA3 = 0, ratioB2 = 0, ratioB3 = 0, ratioC2 = 0, ratioC3 = 0; + double maxA2l = 0, maxA3l = 0, maxB2l = 0, maxB3l = 0, maxC2l = 0, maxC3l = 0; + double maxA2d = 0, maxA3d = 0, maxB2d = 0, maxB3d = 0, maxC2d = 0, maxC3d = 0; + double a2, a3, b2, b3, c2, c3, normalized = 0; + int readstart, readstop; + double matches, compared, concordance; + STATS stats; + + for (int i = 0; i < AlignedReads.size(); i++){ + read = AlignedReads.get(i); + readstart = read.getAlignmentStart(); + readstop = read.getAlignmentEnd(); + if (readstart < A2stop && readstop > A2start){ + stats = CalculateConcordance(read, A2, "A2", A2start, A2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared(); + if (stats.getNumCompared() > 40){if (minA2 < 0 || minA2 > concordance){minA2 = concordance;}; if (concordance > maxA2){maxA2 = concordance;}; if (matches > maxA2l){maxA2l = matches;}; if (compared-matches > maxA2d){maxA2d = compared-matches;}}else{A2c++;} + } else if (readstart < A3stop && readstop > A3start){ + stats = CalculateConcordance(read, A3, "A3", A3start, A3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared(); + if (stats.getNumCompared() > 40){if (minA3 < 0 || minA3 > concordance){minA3 = concordance;}; if (concordance > maxA3){maxA3 = concordance;}; if (matches > maxA3l){maxA3l = matches;}; if (compared-matches > maxA3d){maxA3d = compared-matches;}}else{A3c++;} + } else if (readstart < B2stop && readstop > B2start){ + stats = CalculateConcordance(read, B2, "B2", B2start, B2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared(); + if (stats.getNumCompared() > 40){if (minB2 < 0 || minB2 > concordance){minB2 = concordance;}; if (concordance > maxB2){maxB2 = concordance;}; if (matches > maxB2l){maxB2l = matches;}; if (compared-matches > maxB2d){maxB2d = compared-matches;}}else{B2c++;} + } else if (readstart < B3stop && readstop > B3start){ + stats = CalculateConcordance(read, B3, "B3", B3start, B3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared(); + if (stats.getNumCompared() > 40){if (minB3 < 0 || minB3 > concordance){minB3 = concordance;}; if (concordance > maxB3){maxB3 = concordance;}; if (matches > maxB3l){maxB3l = matches;}; if (compared-matches > maxB3d){maxB3d = compared-matches;}}else{B3c++;} + } else if (readstart < C2stop && readstop > C2start){ + stats = CalculateConcordance(read, C2, "C2", C2start, C2stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared(); + if (stats.getNumCompared() > 40){if (minC2 < 0 || minC2 > concordance){minC2 = concordance;}; if (concordance > maxC2){maxC2 = concordance;}; if (matches > maxC2l){maxC2l = matches;}; if (compared-matches > maxC2d){maxC2d = compared-matches;}}else{C2c++;} + } else if (readstart < C3stop && readstop > C3start){ + stats = CalculateConcordance(read, C3, "C3", C3start, C3stop); concordance = stats.getConcordance(); matches = stats.getNumMatches(); compared = stats.getNumCompared(); + if (stats.getNumCompared() > 40){if (minC3 < 0 || minC3 > concordance){minC3 = concordance;}; if (concordance > maxC3){maxC3 = concordance;}; if (matches > maxC3l){maxC3l = matches;}; if (compared-matches > maxC3d){maxC3d = compared-matches;}}else{C3c++;} + } + } + + + for (int i = 0; i < AlignedReads.size(); i++){ + read = AlignedReads.get(i); + readstart = read.getAlignmentStart(); + readstop = read.getAlignmentEnd(); + name = read.getReadName(); name2 = ""; + if (NumCompared.containsKey(name)){ + compared = Double.parseDouble((String) NumCompared.get(name).toString()); + matches = Double.parseDouble((String) MaxNumMatches.get(name).toString()); + concordance = Double.parseDouble((String) MaxConcordance.get(name).toString()); + if (matches > 40){ + if (readstart < A2stop && readstop > A2start){ + locus = "A2"; name2 = A2.getReadName(); + a2 = (concordance - minA2)/(maxA2-minA2); if (a2 >= .5){A2a++;}else{A2b++;}; normalized = a2; + } else if (readstart < A3stop && readstop > A3start){ + locus = "A3"; name2 = A3.getReadName(); + a3 = (concordance - minA3)/(maxA3-minA3); if (a3 >= .5){A3a++;}else{A3b++;}; normalized = a3; + } else if (readstart < B2stop && readstop > B2start){ + locus = "B2"; name2 = B2.getReadName(); + b2 = (concordance - minB2)/(maxB2-minB2); if (b2 >= .5){B2a++;}else{B2b++;}; normalized = b2; + } else if (readstart < B3stop && readstop > B3start){ + locus = "B3"; name2 = B3.getReadName(); + b3 = (concordance - minB3)/(maxB3-minB3); if (b3 >= .5){B3a++;}else{B3b++;}; normalized = b3; + } else if (readstart < C2stop && readstop > C2start){ + locus = "C2"; name2 = C2.getReadName(); + c2 = (concordance - minC2)/(maxC2-minC2); if (c2 >= .5){C2a++;}else{C2b++;}; normalized = c2; + } else if (readstart < C3stop && readstop > C3start){ + locus = "C3"; name2 = C3.getReadName(); + c3 = (concordance - minC3)/(maxC3-minC3); if (c3 >= .5){C3a++;}else{C3b++;}; normalized = c3; + } + out.printf("%s\t%s\t%s\t%.0f\t%.0f\t%.3f\t%.3f\n",locus,name,name2,matches,compared,concordance,normalized); + }else{ + out.printf("%s (compared at %s sites) is too short\n",name,matches); + } + }else{ + out.printf("%s [%s to %s] not found\n",name,readstart,readstop); + } + } + + if (A2a > 0 && A2b > 0){if (A2a > A2b){ratioA2 = (double)A2b/(A2a+A2b);}else{ratioA2 = (double)A2a/(A2a+A2b);}}else{ratioA2 = -1;} + if (A3a > 0 && A3b > 0){if (A3a > A3b){ratioA3 = (double)A3b/(A3a+A3b);}else{ratioA3 = (double)A3a/(A3a+A3b);}}else{ratioA3 = -1;} + if (B2a > 0 && B2b > 0){if (B2a > B2b){ratioB2 = (double)B2b/(B2a+B2b);}else{ratioB2 = (double)B2a/(B2a+B2b);}}else{ratioB2 = -1;} + if (B3a > 0 && B3b > 0){if (B3a > B3b){ratioB3 = (double)B3b/(B3a+B3b);}else{ratioB3 = (double)B3a/(B3a+B3b);}}else{ratioB3 = -1;} + if (C2a > 0 && C2b > 0){if (C2a > C2b){ratioC2 = (double)C2b/(C2a+C2b);}else{ratioC2 = (double)C2a/(C2a+C2b);}}else{ratioC2 = -1;} + if (C3a > 0 && C3b > 0){if (C3a > C3b){ratioC3 = (double)C3b/(C3a+C3b);}else{ratioC3 = (double)C3a/(C3a+C3b);}}else{ratioC3 = -1;} + + out.printf("RATIO_A2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioA2,A2a,A2b,A2c,maxA2-minA2,maxA2l,maxA2d); + out.printf("RATIO_A3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioA3,A3a,A3b,A3c,maxA3-minA3,maxA3l,maxA3d); + out.printf("RATIO_B2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioB2,B2a,B2b,B2c,maxB2-minB2,maxB2l,maxB2d); + out.printf("RATIO_B3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioB3,B3a,B3b,B3c,maxB3-minB3,maxB3l,maxB3d); + out.printf("RATIO_C2\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioC2,C2a,C2b,C2c,maxC2-minC2,maxC2l,maxC2d); + out.printf("RATIO_C3\t%.2f\t%s\t%s\t%s\t%.3f\t%.0f\t%.0f\n",ratioC3,C3a,C3b,C3c,maxC3-minC3,maxC3l,maxC3d); + + } + + public class STATS { + protected double concordance = 0.0; + protected double numcompared = 0; + protected double nummatches = 0; + + public STATS(double d, double i, double m) { + concordance = d; + numcompared = i; + nummatches = m; + } + + public double getConcordance() { + return concordance; + } + + public double getNumCompared() { + return numcompared; + } + + public double getNumMatches() { + return nummatches; + } + } + + private STATS CalculateConcordance(SAMRecord read1, SAMRecord read2, String locus, int start, int stop){ + int start1 = read1.getAlignmentStart(), stop1 = read1.getAlignmentEnd(); + int start2 = read2.getAlignmentStart(), stop2 = read2.getAlignmentEnd(); + + int pos; + double numcompared = 0, nummatched = 0, concordance; + char c1, c2; + String s1 = formatter.FormatRead(read1.getCigarString(), read1.getReadString()); + String s2 = formatter.FormatRead(read2.getCigarString(), read2.getReadString()); + + + //Polymorphic sites: always increment denominator, increment numerator when bases are concordant + for (int j = 0; j < PolymorphicSites.length; j++){ + pos = PolymorphicSites[j]; + if (pos >= start1 && pos <= stop1 && pos >= start2 && pos <= stop2 && pos >= start && pos <= stop){ + c1 = s1.charAt(pos-start1); + c2 = s2.charAt(pos-start2); + if (c1 != 'D'){//allow for deletions (sequencing errors) + numcompared++; + if (c1 == c2){ + nummatched++; + } + } + } + } + + //Non-polymorphic sites: increment denominator only when bases are discordant + if (false){ + for (int j = 0; j < NonPolymorphicSites.length; j++){ + pos = NonPolymorphicSites[j]; + if (pos >= start1 && pos <= stop1 && pos >= start2 && pos <= stop2){ + c1 = s1.charAt(pos-start1); + c2 = s2.charAt(pos-start2); + if (c1 != c2 && c1 != 'D'){//allow for deletions (sequencing errors) + numcompared++; + } + } + } + } + + //Update concordance array + concordance=nummatched/numcompared; + + MaxNumMatches.put(read1.getReadName(), nummatched); + NumCompared.put(read1.getReadName(), numcompared); + MaxConcordance.put(read1.getReadName(), concordance); + //out.printf("%s\t%s\t%s\t%.0f\t%.0f\t%.3f\n",locus,read1.getReadName(),read2.getReadName(),nummatched,numcompared,concordance); + + return new STATS(concordance, numcompared, nummatched); + } + +} + + diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreatePedFileWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreatePedFileWalker.java index c39041941..e84de332d 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreatePedFileWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreatePedFileWalker.java @@ -52,6 +52,12 @@ public class CreatePedFileWalker extends ReadWalker { @Argument(fullName = "DNAcode", shortName = "DNAcode", doc = "Amino acid codes", required = false) public String dnaCodesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/DNA_CODE.txt"; + @Argument(fullName = "PrintDNA", shortName = "PrintDNA", doc = "Print DNA sequences", required = false) + public boolean PrintDNA = false; + + @Argument(fullName = "PrintAA", shortName = "PrintAA", doc = "Print Amino Acid sequences", required = false) + public boolean PrintAA = true; + String[] HLAnames, HLAreads, inputFileContents; Integer[] HLAstartpos, HLAstoppos; ArrayList HLAnamesAL, HLAreadsAL; @@ -408,23 +414,26 @@ private String PrintAminoAcids(String ID, String alleleName1, String alleleName2 if (true) { - error = error + PrintGenotypes(s[1], HLA_A_1,HLA_A_2, HLA_A_start,HLA_A_end); - error = error + PrintGenotypes(s[1], HLA_C_1,HLA_C_2, HLA_C_start,HLA_C_end); - error = error + PrintGenotypes(s[1], HLA_B_1,HLA_B_2, HLA_B_start,HLA_B_end); - error = error + PrintGenotypes(s[1], HLA_DRB1_1,HLA_DRB1_2, HLA_DRB1_start,HLA_DRB1_end); - error = error + PrintGenotypes(s[1], HLA_DQA1_1,HLA_DQA1_2, HLA_DQA1_start,HLA_DQA1_end); - error = error + PrintGenotypes(s[1], HLA_DQB1_1,HLA_DQB1_2, HLA_DQB1_start,HLA_DQB1_end); - error = error + PrintGenotypes(s[1], HLA_DPA1_1,HLA_DPA1_2, HLA_DPA1_start,HLA_DPA1_end); - error = error + PrintGenotypes(s[1], HLA_DPB1_1,HLA_DPB1_2, HLA_DPB1_start,HLA_DPB1_end); - - error = error + PrintAminoAcids(s[1], HLA_A_1,HLA_A_2, A_exons); - error = error + PrintAminoAcids(s[1], HLA_C_1,HLA_C_2, C_exons); - error = error + PrintAminoAcids(s[1], HLA_B_1,HLA_B_2, B_exons); - error = error + PrintAminoAcids(s[1], HLA_DRB1_1,HLA_DRB1_2, DRB1_exons); - error = error + PrintAminoAcids(s[1], HLA_DQA1_1,HLA_DQA1_2, DQA1_exons); - error = error + PrintAminoAcids(s[1], HLA_DQB1_1,HLA_DQB1_2, DQB1_exons); - error = error + PrintAminoAcids(s[1], HLA_DPA1_1,HLA_DPA1_2, DPA1_exons); - error = error + PrintAminoAcids(s[1], HLA_DPB1_1,HLA_DPB1_2, DPB1_exons); + if (PrintDNA){ + error = error + PrintGenotypes(s[1], HLA_A_1,HLA_A_2, HLA_A_start,HLA_A_end); + error = error + PrintGenotypes(s[1], HLA_C_1,HLA_C_2, HLA_C_start,HLA_C_end); + error = error + PrintGenotypes(s[1], HLA_B_1,HLA_B_2, HLA_B_start,HLA_B_end); + error = error + PrintGenotypes(s[1], HLA_DRB1_1,HLA_DRB1_2, HLA_DRB1_start,HLA_DRB1_end); + error = error + PrintGenotypes(s[1], HLA_DQA1_1,HLA_DQA1_2, HLA_DQA1_start,HLA_DQA1_end); + error = error + PrintGenotypes(s[1], HLA_DQB1_1,HLA_DQB1_2, HLA_DQB1_start,HLA_DQB1_end); + error = error + PrintGenotypes(s[1], HLA_DPA1_1,HLA_DPA1_2, HLA_DPA1_start,HLA_DPA1_end); + error = error + PrintGenotypes(s[1], HLA_DPB1_1,HLA_DPB1_2, HLA_DPB1_start,HLA_DPB1_end); + } + if (PrintAA){ + error = error + PrintAminoAcids(s[1], HLA_A_1,HLA_A_2, A_exons); + error = error + PrintAminoAcids(s[1], HLA_C_1,HLA_C_2, C_exons); + error = error + PrintAminoAcids(s[1], HLA_B_1,HLA_B_2, B_exons); + error = error + PrintAminoAcids(s[1], HLA_DRB1_1,HLA_DRB1_2, DRB1_exons); + error = error + PrintAminoAcids(s[1], HLA_DQA1_1,HLA_DQA1_2, DQA1_exons); + error = error + PrintAminoAcids(s[1], HLA_DQB1_1,HLA_DQB1_2, DQB1_exons); + error = error + PrintAminoAcids(s[1], HLA_DPA1_1,HLA_DPA1_2, DPA1_exons); + error = error + PrintAminoAcids(s[1], HLA_DPB1_1,HLA_DPB1_2, DPB1_exons); + } out.printf("\n"); out.printf("%s",error); } @@ -433,23 +442,27 @@ private String PrintAminoAcids(String ID, String alleleName1, String alleleName2 //Prints SNP names for each site if (true){ - PrintSNPS(HLA_A_start,HLA_A_end); - PrintSNPS(HLA_C_start,HLA_C_end); - PrintSNPS(HLA_B_start,HLA_B_end); - PrintSNPS(HLA_DRB1_start,HLA_DRB1_end); - PrintSNPS(HLA_DQA1_start,HLA_DQA1_end); - PrintSNPS(HLA_DQB1_start,HLA_DQB1_end); - PrintSNPS(HLA_DPA1_start,HLA_DPA1_end); - PrintSNPS(HLA_DPB1_start,HLA_DPB1_end); + if (PrintDNA){ + PrintSNPS(HLA_A_start,HLA_A_end); + PrintSNPS(HLA_C_start,HLA_C_end); + PrintSNPS(HLA_B_start,HLA_B_end); + PrintSNPS(HLA_DRB1_start,HLA_DRB1_end); + PrintSNPS(HLA_DQA1_start,HLA_DQA1_end); + PrintSNPS(HLA_DQB1_start,HLA_DQB1_end); + PrintSNPS(HLA_DPA1_start,HLA_DPA1_end); + PrintSNPS(HLA_DPB1_start,HLA_DPB1_end); + } - PrintAminoAcidSites(A_exons,"A",true); - PrintAminoAcidSites(C_exons,"C",false); - PrintAminoAcidSites(B_exons,"B",false); - PrintAminoAcidSites(DRB1_exons,"DRB1",false); - PrintAminoAcidSites(DQA1_exons,"DQA1",true); - PrintAminoAcidSites(DQB1_exons,"DQB1",false); - PrintAminoAcidSites(DPA1_exons,"DPA1",false); - PrintAminoAcidSites(DPB1_exons,"DPB1",true); + if (PrintAA){ + PrintAminoAcidSites(A_exons,"A",true); + PrintAminoAcidSites(C_exons,"C",false); + PrintAminoAcidSites(B_exons,"B",false); + PrintAminoAcidSites(DRB1_exons,"DRB1",false); + PrintAminoAcidSites(DQA1_exons,"DQA1",true); + PrintAminoAcidSites(DQB1_exons,"DQB1",false); + PrintAminoAcidSites(DPA1_exons,"DPA1",false); + PrintAminoAcidSites(DPB1_exons,"DPB1",true); + } } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestAlleleWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestAlleleWalker.java index 5f45cf504..d5771e4ae 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestAlleleWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestAlleleWalker.java @@ -43,6 +43,9 @@ public class FindClosestAlleleWalker extends ReadWalker { @Argument(fullName = "findFirst", shortName = "findFirst", doc = "For each read, stop when first HLA allele is found with concordance = 1", required = false) public boolean findFirst = false; + + @Argument(fullName = "DEBUG", shortName = "DEBUG", doc = "Debug walker", required = false) + public boolean debug = false; @Argument(fullName = "debugAllele", shortName = "debugAllele", doc = "Print match score for allele", required = false) public String debugAllele = ""; @@ -50,16 +53,18 @@ public class FindClosestAlleleWalker extends ReadWalker { @Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false) public String ethnicity = "Caucasian"; + @Argument(fullName = "dictionary", shortName = "dictionary", doc = "bam file of HLA ditionary", required = false) + public String HLAdictionaryFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.sam"; + @Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false) public boolean ONLYFREQUENT = false; - String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam"; SAMFileReader HLADictionaryReader = new SAMFileReader(); - String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq"; + String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq"; String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq"; String AlleleFrequencyFile; - String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles"; + String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles"; String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites"; @@ -74,6 +79,7 @@ public class FindClosestAlleleWalker extends ReadWalker { int numHLAlleles = 0; int minstartpos = 0; int maxstoppos = 0; + int numpolymorphicsites = 0, numnonpolymorphicsites = 0, pos =0; int HLA_A_start = 30018310; int HLA_A_end = 30021211; @@ -89,7 +95,7 @@ public class FindClosestAlleleWalker extends ReadWalker { //Load HLA dictionary out.printf("INFO Loading HLA dictionary ... "); - HLADictionaryReader.ReadFile(HLAdatabaseFile); + HLADictionaryReader.ReadFile(HLAdictionaryFile); HLAreads = HLADictionaryReader.GetReads(); HLAnames = HLADictionaryReader.GetReadNames(); HLAstartpos = HLADictionaryReader.GetStartPositions(); @@ -121,7 +127,8 @@ public class FindClosestAlleleWalker extends ReadWalker { siteFileReader.ReadFile(PolymorphicSitesFile); PolymorphicSites = siteFileReader.GetPolymorphicSites(); NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites(); - + numpolymorphicsites = PolymorphicSites.length; + numnonpolymorphicsites = NonPolymorphicSites.length; out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length); out.printf("INFO Comparing reads to database ...\n"); @@ -133,53 +140,15 @@ public class FindClosestAlleleWalker extends ReadWalker { return 0; } - private void FindPolymorphicSites(int start, int stop){ - boolean initialized, polymorphic, examined; - char c = ' '; - ArrayList polymorphicsites = new ArrayList(); - ArrayList nonpolymorphicsites = new ArrayList(); - //Find polymorphic sites in dictionary - for (int pos = start; pos <= stop; pos++){ - initialized = false; polymorphic = false; examined = false; - //look across all alleles at specific position to see if it is polymorphic - for (int i = 0; i < HLAreads.length; i++){ - if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){ - if (!initialized){ - c = HLAreads[i].charAt(pos-HLAstartpos[i]); - initialized = true; - examined = true; - } - if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){ - polymorphicsites.add(pos); - out.printf("POLYMORPHIC\t6\t%s\n", pos); - polymorphic = true; - break; - } - } - - } - if (!polymorphic && examined){ - nonpolymorphicsites.add(pos); - out.printf("CONSERVED\t6\t%s\n", pos); - } - } - PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]); - NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]); - } - private double CalculateConcordance(SAMRecord read){ int readstart = read.getAlignmentStart(); int readstop = read.getAlignmentEnd(); - int numpolymorphicsites, numnonpolymorphicsites, pos; char c1, c2; double maxConcordance = 0.0, freq = 0.0, minFreq = 0.0; String s1 = formatter.FormatRead(read.getCigarString(), read.getReadString()); String s2; int allelestart, allelestop; - numpolymorphicsites = PolymorphicSites.length; - numnonpolymorphicsites = NonPolymorphicSites.length; - if (ONLYFREQUENT){ minFreq = 0.0001; } @@ -200,7 +169,7 @@ public class FindClosestAlleleWalker extends ReadWalker { if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){ c1 = s1.charAt(pos-readstart); c2 = s2.charAt(pos-allelestart); - if (c1 != 'D'){//allow for deletions (sequencing errors) + if (c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors) numcompared[i]++; if (c1 == c2){ nummatched[i]++; @@ -214,16 +183,17 @@ public class FindClosestAlleleWalker extends ReadWalker { } //Non-polymorphic sites: increment denominator only when bases are discordant - - for (int j = 0; j < numnonpolymorphicsites; j++){ - pos = NonPolymorphicSites[j]; - if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){ - c1 = s1.charAt(pos-readstart); - c2 = s2.charAt(pos-allelestart); - if (c1 != c2 && c1 != 'D'){//allow for deletions (sequencing errors) - numcompared[i]++; - if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){ - out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2); + if (numcompared[i] > 0){ + for (int j = 0; j < numnonpolymorphicsites; j++){ + pos = NonPolymorphicSites[j]; + if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){ + c1 = s1.charAt(pos-readstart); + c2 = s2.charAt(pos-allelestart); + if (c1 != c2 && c1 != 'D' && c2 != 'D'){//allow for deletions (sequencing errors) + numcompared[i]++; + if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){ + out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2); + } } } } @@ -259,26 +229,32 @@ public class FindClosestAlleleWalker extends ReadWalker { public Integer map(char[] ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { //Calculate concordance for this read and all overlapping reads - double maxConcordance = CalculateConcordance(read); + if (read.getMappingQuality() > 0){ + double maxConcordance = CalculateConcordance(read); - String readname = read.getReadName(), allelename = ""; double freq; - //For input bam files that contain HLA alleles, find and print allele frequency - freq = GetAlleleFrequency(readname); - out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd()); + if (maxConcordance > 0){ + String readname = read.getReadName(), allelename = ""; double freq; + //For input bam files that contain HLA alleles, find and print allele frequency + //freq = GetAlleleFrequency(readname); + out.printf("%s\t%s-%s", readname,read.getAlignmentStart(),read.getAlignmentEnd()); - //Find the maximum frequency of the alleles most concordant with the read - double maxFreq = FindMaxAlleleFrequency(maxConcordance); - - //Print concordance statistics between this read and the most similar HLA allele(s) - for (int i = 0; i < HLAreads.length; i++){ - if (concordance[i] == maxConcordance && maxConcordance > 0){ - freq = GetAlleleFrequency(HLAnames[i]); - if (freq == maxFreq){ - out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]); + //Find the maximum frequency of the alleles most concordant with the read + //double maxFreq = FindMaxAlleleFrequency(maxConcordance); + + //Print concordance statistics between this read and the most similar HLA allele(s) + + for (int i = 0; i < HLAreads.length; i++){ + if (concordance[i] == maxConcordance){ + freq = GetAlleleFrequency(HLAnames[i]); + //if (freq == maxFreq){ + out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]); + //} + break; + } } + out.print("\n"); } } - out.print("\n"); return 1; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindPolymorphicSitesWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindPolymorphicSitesWalker.java new file mode 100644 index 000000000..9f8a16cd8 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindPolymorphicSitesWalker.java @@ -0,0 +1,152 @@ +package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.commandline.Argument; + +import java.util.ArrayList; +import java.util.Hashtable; +/** + * Finds polymorphic sites in the HLA dictionary. Usage: java -jar GenomeAnalysisTK.jar -T FindPolymorphicSites -I HLA_DICTIONARY.bam -R /broad/1KG/reference/human_b36_both.fasta -L INPUT.interval -findFirst | grep -v INFO | sort -k1 > OUTPUT + * @author shermanjia + */ +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class FindPolymorphicSitesWalker extends ReadWalker { + @Argument(fullName = "debugRead", shortName = "debugRead", doc = "Print match score for read", required = false) + public String debugRead = ""; + + @Argument(fullName = "findFirst", shortName = "findFirst", doc = "For each read, stop when first HLA allele is found with concordance = 1", required = false) + public boolean findFirst = false; + + @Argument(fullName = "debugAllele", shortName = "debugAllele", doc = "Print match score for allele", required = false) + public String debugAllele = ""; + + @Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false) + public String ethnicity = "Caucasian"; + + @Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false) + public boolean ONLYFREQUENT = false; + + //String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam"; + String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam"; + + SAMFileReader HLADictionaryReader = new SAMFileReader(); + + //String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq"; + String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq"; + String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq"; + String AlleleFrequencyFile; + String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles"; + + String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites"; + + boolean DatabaseLoaded = false; + boolean DEBUG = false; + + String[] HLAnames, HLAreads; + Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites; + double[] SingleAlleleFrequencies; + + double[] nummatched, concordance, numcompared; + int numHLAlleles = 0; + int minstartpos = 0; + int maxstoppos = 0; + + int HLA_A_start = 30018310; + int HLA_A_end = 30021211; + + Hashtable AlleleFrequencies = new Hashtable(); + int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1; + CigarParser formatter = new CigarParser(); + + public Integer reduceInit() { + if (!DatabaseLoaded){ + DatabaseLoaded = true; + + //Load HLA dictionary + out.printf("INFO Loading HLA dictionary ... "); + + HLADictionaryReader.ReadFile(HLAdatabaseFile); + HLAreads = HLADictionaryReader.GetReads(); + HLAnames = HLADictionaryReader.GetReadNames(); + HLAstartpos = HLADictionaryReader.GetStartPositions(); + HLAstoppos = HLADictionaryReader.GetStopPositions(); + minstartpos = HLADictionaryReader.GetMinStartPos(); + maxstoppos = HLADictionaryReader.GetMaxStopPos(); + + out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length); + + nummatched = new double[HLAreads.length]; + concordance = new double[HLAreads.length]; + numcompared = new double[HLAreads.length]; + + //Read allele frequencies + if (ethnicity.equals("Black")){ + AlleleFrequencyFile = BlackAlleleFrequencyFile; + }else{ + AlleleFrequencyFile = CaucasianAlleleFrequencyFile; + } + out.printf("INFO Reading HLA allele frequencies ... "); + FrequencyFileReader HLAfreqReader = new FrequencyFileReader(); + HLAfreqReader.ReadFile(AlleleFrequencyFile,UniqueAllelesFile); + AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies(); + out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size()); + + FindPolymorphicSites(minstartpos,maxstoppos); + + out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length); + out.printf("INFO Comparing reads to database ...\n"); + + if (DEBUG){ + //out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles); + } + } + return 0; + } + + private void FindPolymorphicSites(int start, int stop){ + boolean initialized, polymorphic, examined; + char c = ' '; + ArrayList polymorphicsites = new ArrayList(); + ArrayList nonpolymorphicsites = new ArrayList(); + //Find polymorphic sites in dictionary + for (int pos = start; pos <= stop; pos++){ + initialized = false; polymorphic = false; examined = false; + //look across all alleles at specific position to see if it is polymorphic + for (int i = 0; i < HLAreads.length; i++){ + if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){ + if (!initialized){ + c = HLAreads[i].charAt(pos-HLAstartpos[i]); + initialized = true; + examined = true; + } + if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){ + polymorphicsites.add(pos); + out.printf("POLYMORPHIC\t6\t%s\n", pos); + polymorphic = true; + break; + } + } + + } + if (!polymorphic && examined){ + nonpolymorphicsites.add(pos); + out.printf("CONSERVED\t6\t%s\n", pos); + } + } + PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]); + NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]); + } + + public Integer map(char[] ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + //Calculate concordance for this read and all overlapping reads + return 1; + } + + + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } +} + diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ImputeAllelesWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ImputeAllelesWalker.java index dacacfb3c..3af3bf0ac 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ImputeAllelesWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ImputeAllelesWalker.java @@ -16,8 +16,9 @@ import java.util.Hashtable; */ @Requires({DataSource.READS, DataSource.REFERENCE}) public class ImputeAllelesWalker extends ReadWalker { - String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.sam"; - String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.closest"; + String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam"; +// String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.CLASS1.closest"; + String ClosestAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.CLASS2.closest"; boolean DatabaseLoaded = false; boolean DEBUG = false; @@ -40,14 +41,24 @@ public class ImputeAllelesWalker extends ReadWalker { int HLA_B_end = 31432914; int HLA_C_start = 31344925; int HLA_C_end = 31347827; + int HLA_DQA1_start = 32713161; + int HLA_DQA1_end = 32719407; + int HLA_DQB1_start = 32735635; + int HLA_DQB1_end = 32742444; + int HLA_DPA1_start = 33140772; + int HLA_DPA1_end = 33149356; + int HLA_DPB1_start = 33151738; + int HLA_DPB1_end = 33162954; + int HLA_DRB1_start = 32654525; + int HLA_DRB1_end = 32665540; ArrayList PolymorphicSites = new ArrayList(); Hashtable ClosestAllele = new Hashtable(); - int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1; + int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1, iDRBstart = -1, iDRBstop = -1, iDQAstart = -1, iDQAstop = -1, iDQBstart = -1, iDQBstop = -1, iDPAstart = -1, iDPAstop = -1, iDPBstart = -1, iDPBstop = -1; CigarParser formatter = new CigarParser(); - + public Integer reduceInit() { if (!DatabaseLoaded){ try{ @@ -77,6 +88,21 @@ public class ImputeAllelesWalker extends ReadWalker { }else if (s[0].indexOf("HLA_C") > -1){ if (iCstart < 0){iCstart=i;} iCstop = i; i++; + }else if (s[0].indexOf("HLA_DRB1") > -1){ + if (iDRBstart < 0){iDRBstart=i;} + iDRBstop = i; i++; + }else if (s[0].indexOf("HLA_DQA1") > -1){ + if (iDQAstart < 0){iDQAstart=i;} + iDQAstop = i; i++; + }else if (s[0].indexOf("HLA_DQB1") > -1){ + if (iDQBstart < 0){iDQBstart=i;} + iDQBstop = i; i++; + }else if (s[0].indexOf("HLA_DPA1") > -1){ + if (iDPAstart < 0){iDPAstart=i;} + iDPAstop = i; i++; + }else if (s[0].indexOf("HLA_DPB1") > -1){ + if (iDPBstart < 0){iDPBstart=i;} + iDPBstop = i; i++; } } } @@ -146,10 +172,12 @@ public class ImputeAllelesWalker extends ReadWalker { int numM = 0, numI = 0, numD = 0; name = read.getReadName(); + String matchedAllele = (String) ClosestAllele.get(name); //out.printf("%s\t%s\n",name,matchedAllele); int index = HLAnames.indexOf(matchedAllele); + String matchedRead = HLAreads.get(index); if (name.indexOf("HLA_A") > -1){ @@ -161,8 +189,24 @@ public class ImputeAllelesWalker extends ReadWalker { } else if (name.indexOf("HLA_C") > -1){ startimputation = HLA_C_start; stopimputation = HLA_C_end; + } else if (name.indexOf("HLA_DRB1") > -1){ + startimputation = HLA_DRB1_start; + stopimputation = HLA_DRB1_end; + } else if (name.indexOf("HLA_DQA1") > -1){ + startimputation = HLA_DQA1_start; + stopimputation = HLA_DQA1_end; + } else if (name.indexOf("HLA_DQB1") > -1){ + startimputation = HLA_DQB1_start; + stopimputation = HLA_DQB1_end; + } else if (name.indexOf("HLA_DPA1") > -1){ + startimputation = HLA_DPA1_start; + stopimputation = HLA_DPA1_end; + } else if (name.indexOf("HLA_DPB1") > -1){ + startimputation = HLA_DPB1_start; + stopimputation = HLA_DPB1_end; } + //out.printf("DEBUG %s\t%s\t%s\t%s\t%s\n",name,matchedAllele,index,startimputation,stopimputation); for (int i = startimputation; i <= stopimputation; i++){ //if position is within read if (i >= readstart && i <= readstop){ diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java index 30086532b..2c69660e3 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java @@ -7,12 +7,16 @@ package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; import java.io.*; import java.util.ArrayList; +import java.util.Hashtable; /** * * @author shermanjia */ public class SimilarityFileReader { ArrayList ReadsToDiscard = new ArrayList(); + Hashtable Concordance = new Hashtable(); + Hashtable NumMatches = new Hashtable(); + Hashtable NumMismatches = new Hashtable(); public ArrayList GetReadsToDiscard(){ return ReadsToDiscard; @@ -22,6 +26,18 @@ public class SimilarityFileReader { return ReadsToDiscard.toArray(new String[ReadsToDiscard.size()]); } + public Hashtable GetConcordance(){ + return Concordance; + } + + public Hashtable GetNumMatches(){ + return NumMatches; + } + + public Hashtable GetNumMismatches(){ + return NumMismatches; + } + public void ReadFile(String filename, int minAllowedMismatches){ try{ FileInputStream fstream = new FileInputStream(filename); @@ -35,6 +51,10 @@ public class SimilarityFileReader { if (s.length >= 6){ Double matchFraction = Double.valueOf(s[4]); int numMismatches = Integer.valueOf(s[6]); + + Concordance.put(s[0],matchFraction); + NumMatches.put(s[0], s[5]); + NumMismatches.put(s[0], numMismatches); if ((matchFraction < 0.9 && numMismatches > 3) || (numMismatches > minAllowedMismatches)){ ReadsToDiscard.add(s[0]); }