package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;

import java.io.*;
import java.util.ArrayList;

/**
 * Reads a per-site base-likelihoods file produced by the HLA caller.
 *
 * Expected format: one tab-separated line per site with "chr:position" in
 * column 0, the reference base in column 1, and the 10 diploid genotype
 * likelihoods (AA,AC,AG,AT,CC,CG,CT,GG,GT,TT) in columns 3-12.  Lines
 * containing "INFO" are treated as headers and skipped.
 *
 * @author shermanjia
 */
public class BaseLikelihoodsFileReader {
    double[][] baseLikelihoods;   // [site][10 genotype log likelihoods]
    int[] positions;              // genomic position of each site, parallel to baseLikelihoods
    ArrayList<Integer> polymorphicSites = new ArrayList<Integer>();

    /** Positions where some non-hom-ref genotype beats hom-ref (filled only when requested). */
    public Integer[] GetPolymorphicSites(){
        return polymorphicSites.toArray(new Integer[polymorphicSites.size()]);
    }

    /** @return the parsed likelihood matrix, or null if ReadFile was never called. */
    public double[][] GetBaseLikelihoods(){
        return baseLikelihoods;
    }

    /** @return the genomic position of each parsed site, parallel to GetBaseLikelihoods(). */
    public int[] GetPositions(){
        return positions;
    }

    /**
     * Parses the file, filling positions/baseLikelihoods; when
     * findPolymorphicSites is set, also records sites where some genotype is
     * more likely than homozygous-reference.  Errors are reported on stderr
     * rather than thrown, matching the original contract.
     */
    public void ReadFile(String filename, boolean findPolymorphicSites){
        try{
            // Single pass: buffer the data lines, then parse.  (The original
            // implementation read the file twice just to size the arrays and
            // leaked the second reader.)
            ArrayList<String> dataLines = new ArrayList<String>();
            BufferedReader br = new BufferedReader(new FileReader(filename));
            try {
                String strLine;
                while ((strLine = br.readLine()) != null) {
                    if (strLine.indexOf("INFO") == -1){
                        dataLines.add(strLine);
                    }
                }
            } finally {
                br.close();  // always release the handle, even on I/O errors
            }

            int n = dataLines.size();
            baseLikelihoods = new double[n][10];
            positions = new int[n];

            for (int line = 0; line < n; line++){
                String[] s = dataLines.get(line).split("\\t");
                String[] pos = s[0].split(":");       // "chr:position"
                char ref = s[1].charAt(0);
                positions[line] = Integer.valueOf(pos[1]);
                for (int i = 3; i <= 12; i++){        // columns 3-12 hold the 10 likelihoods
                    baseLikelihoods[line][i-3] = Double.valueOf(s[i]);
                }
                if (findPolymorphicSites && IsPolymorphicSite(baseLikelihoods[line], ref)){
                    polymorphicSites.add(positions[line]);
                }
            }
        }catch (Exception e){ // broad catch preserved from the original contract
            System.err.println("BaseLikelihoodsFileReader Error: " + e.getMessage());
        }
    }

    // Index of the homozygous genotype (AA,CC,GG,TT) for a reference base in
    // the 10-genotype ordering; -1 for a non-ACGT base.
    private int IndexOf(char c){
        switch(c){
            case 'A': return 0;
            case 'C': return 4;
            case 'G': return 7;
            case 'T': return 9;
            default: return -1;
        }
    }

    // A site is polymorphic when any genotype is strictly more likely than
    // homozygous-reference.  (A non-ACGT ref yields index -1 and an
    // ArrayIndexOutOfBoundsException, caught by ReadFile — as the original did.)
    private boolean IsPolymorphicSite(double[] likelihoods, char ref){
        double homreflikelihood = likelihoods[IndexOf(ref)];
        for (int i = 0; i < 10; i++){
            if (likelihoods[i] > homreflikelihood){
                return true;
            }
        }
        return false;
    }
}
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;

import java.util.ArrayList;
import java.util.Hashtable;

/**
 * Scores every overlapping pair of HLA dictionary alleles against the
 * per-site diploid genotype likelihoods produced by
 * CalculateBaseLikelihoodsWalker, printing one log-likelihood per pair.
 *
 * @author shermanjia
 */
@Requires({DataSource.READS, DataSource.REFERENCE})
public class CalculateAlleleLikelihoodsWalker extends ReadWalker<Integer, Integer> {
    @Argument(fullName = "baseLikelihoods", shortName = "bl", doc = "Base likelihoods file", required = true)
    public String baseLikelihoodsFile = "";

    @Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false)
    public boolean DEBUG = false;

    @Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
    public boolean FREQUENT = false;

    @Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
    public String ethnicity = "Caucasian";

    String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
    String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
    String AlleleFrequencyFile;          // chosen from the two above based on -ethnicity
    Hashtable AlleleFrequencies;         // allele name -> population frequency (via FrequencyFileReader)

    CigarParser formatter = new CigarParser();
    double[][] baseLikelihoods;          // per-site 10-genotype likelihoods, parallel to 'positions'
    int[] positions;                     // genomic position of each likelihood row
    boolean loaded = false;              // guards one-time initialization in reduceInit

    String[] HLAnames, HLAreads;         // dictionary allele names / sequences, filled in onTraversalDone
    Integer[] HLAstartpos, HLAstoppos;   // alignment bounds of each dictionary allele
    ArrayList<String> HLAnamesAL, HLAreadsAL;
    ArrayList<Integer> HLAstartposAL, HLAstopposAL;

    /** One-time setup: loads the base-likelihoods file and the allele frequency table. */
    public Integer reduceInit() {
        if (!loaded){
            loaded = true;
            BaseLikelihoodsFileReader baseLikelihoodsReader = new BaseLikelihoodsFileReader();
            baseLikelihoodsReader.ReadFile(baseLikelihoodsFile, false);
            baseLikelihoods = baseLikelihoodsReader.GetBaseLikelihoods();
            positions = baseLikelihoodsReader.GetPositions();

            HLAnamesAL = new ArrayList<String>();
            HLAreadsAL = new ArrayList<String>();
            HLAstartposAL = new ArrayList<Integer>();
            HLAstopposAL = new ArrayList<Integer>();

            // Pick the population-specific allele frequency table.
            if (ethnicity.equals("Black")){
                AlleleFrequencyFile = BlackAlleleFrequencyFile;
            }else{
                AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
            }
            out.printf("INFO Reading HLA allele frequencies ... ");
            FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
            HLAfreqReader.ReadFile(AlleleFrequencyFile);
            AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
            out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());

            out.printf("INFO Reading HLA dictionary ...");
        }
        return 0;
    }

    /** Each "read" is one HLA dictionary allele; cache its name, sequence, and bounds. */
    public Integer map(char[] ref, SAMRecord read) {
        HLAnamesAL.add(read.getReadName());
        HLAreadsAL.add(formatter.FormatRead(read.getCigarString(), read.getReadString()));
        HLAstartposAL.add(read.getAlignmentStart());
        HLAstopposAL.add(read.getAlignmentEnd());
        return 1;
    }

    // Index of a base in {A,C,G,T}; -1 for anything else.
    private int BaseIndex(char c){
        switch(c){
            case 'A': return 0;
            case 'C': return 1;
            case 'G': return 2;
            case 'T': return 3;
            default: return -1;
        }
    }

    /**
     * Index of the unordered genotype {a,b} in the canonical 10-genotype
     * ordering AA,AC,AG,AT,CC,CG,CT,GG,GT,TT; -1 if either base is not ACGT.
     * (The original nested switch returned -1 for invalid bases only via
     * accidental case fall-through; this triangular-index form is equivalent
     * and explicit.)
     */
    private int GenotypeIndex(char a, char b){
        int i = BaseIndex(a);
        int j = BaseIndex(b);
        if (i < 0 || j < 0){
            return -1;
        }
        if (i > j){ int tmp = i; i = j; j = tmp; }   // order the pair: row <= column
        return i * (7 - i) / 2 + j;                  // row offset into the upper triangle + column
    }

    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    /**
     * Scores every overlapping pair of sufficiently frequent alleles and
     * prints one line per pair (NUM, allele names, summed log likelihood).
     */
    public void onTraversalDone(Integer numreads) {
        out.printf("Done! %s alleles found\n", numreads);
        HLAnames = HLAnamesAL.toArray(new String[numreads]);
        HLAreads = HLAreadsAL.toArray(new String[numreads]);
        HLAstartpos = HLAstartposAL.toArray(new Integer[numreads]);
        HLAstoppos = HLAstopposAL.toArray(new Integer[numreads]);

        double[][] AlleleLikelihoods = new double[numreads][numreads];

        String name1, name2;
        double frq1, frq2;

        double minfrq = 0;
        if (FREQUENT){
            minfrq = 0.0001;   // drop very rare alleles when -onlyfrequent is set
        }
        int numcombinations = 0;
        out.printf("NUM\tAllele1\tAllele2\tSSG\n");

        for (int i = 0; i < numreads; i++){
            name1 = HLAnames[i].substring(4);   // strip the "HLA_" prefix
            if (!AlleleFrequencies.containsKey(name1)){
                if (DEBUG){out.printf("%s not found in allele frequency file\n",name1);}
                continue;
            }
            frq1 = Double.parseDouble(AlleleFrequencies.get(name1).toString());
            if (frq1 <= minfrq){
                if (DEBUG){out.printf("%s has allele frequency%.5f\n",name1,frq1);}
                continue;
            }
            for (int j = i; j < numreads; j++){
                name2 = HLAnames[j].substring(4);
                if (!AlleleFrequencies.containsKey(name2)){
                    if (DEBUG){out.printf("%s not found in allele frequency file\n",name2);}
                    continue;
                }
                frq2 = Double.parseDouble(AlleleFrequencies.get(name2).toString());
                if (frq2 <= minfrq){
                    if (DEBUG){out.printf("%s has allele frequency%.5f\n",name2,frq2);}
                    continue;
                }
                // Only score pairs whose dictionary sequences actually overlap.
                if ((HLAstartpos[i] < HLAstoppos[j]) && (HLAstartpos[j] < HLAstoppos[i])){
                    numcombinations++;
                    AlleleLikelihoods[i][j] = CalculateLikelihood(i,j);
                    out.printf("%s\t%s\t%s\t%.2f\n",numcombinations,name1,name2,AlleleLikelihoods[i][j]);
                }
            }
        }
    }

    /**
     * Sums, over every likelihood-file position strictly inside both allele
     * alignments, the precomputed likelihood of the genotype formed by the
     * two allele bases at that position.  Positions where either allele has
     * a non-ACGT character are skipped.
     */
    private double CalculateLikelihood(int a1, int a2){
        String read1 = HLAreads[a1];
        String read2 = HLAreads[a2];
        int start1 = HLAstartpos[a1], stop1 = HLAstoppos[a1];
        int start2 = HLAstartpos[a2], stop2 = HLAstoppos[a2];
        double likelihood = 0;

        for (int i = 0; i < positions.length; i++){
            int pos = positions[i];
            if (pos > start1 && pos < stop1 && pos > start2 && pos < stop2){
                int index = GenotypeIndex(read1.charAt(pos-start1), read2.charAt(pos-start2));
                if (index > -1){
                    likelihood += baseLikelihoods[i][index];
                }
            }
        }
        return likelihood;
    }
}
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */

package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.cmdLine.Argument;

import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
/**
 * Locus walker that prints, for every covered position, the ten diploid
 * genotype likelihoods computed from the read pileup (output later consumed
 * by BaseLikelihoodsFileReader / CalculateAlleleLikelihoodsWalker).
 *
 * NOTE(review): this class was recovered from a mangled patch; the generic
 * parameters and the map() signature below were reconstructed from the
 * reduceInit/reduce bodies and the standard LocusWalker contract — confirm
 * against the original source.
 *
 * @author shermanjia
 */
public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Long, Long>>{
    @Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false)
    public boolean DEBUG = false;
    @Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false)
    public String inputAlleles = "";
    @Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
    public String ethnicity = "Caucasian";
    @Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
    public String filterFile = "";

    // Hard-coded cluster paths to the HLA dictionary and frequency tables.
    String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
    String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
    String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
    String AlleleFrequencyFile;                 // chosen from the two above based on -ethnicity

    SAMFileReader HLADictionaryReader = new SAMFileReader();  // project-local reader (same package), not net.sf.samtools
    String[] HLAreads, HLAnames;                // HLA dictionary sequences and allele names
    Integer[] HLAstartpos, HLAstoppos;          // alignment bounds of each dictionary allele
    Hashtable AlleleFrequencies;                // allele name -> population frequency

    int[][] LOD, LikelihoodScores;              // allocated in InitializeVariables; not updated in this walker
    ArrayList ReadsToDiscard = new ArrayList(); // read names flagged as misaligned by the -filter file

    ArrayList AllReads = new ArrayList();       // NOTE(review): never populated in this walker
    ArrayList AllReadNames = new ArrayList();

    boolean HLAdataLoaded = false;              // guards one-time initialization in reduceInit

    //Loads HLA dictionary, allele frequencies, and reads to filter
    public Pair<Long, Long> reduceInit() {
        if (!HLAdataLoaded){
            HLAdataLoaded = true;

            out.printf("INFO Reading HLA database ... ");
            HLADictionaryReader.ReadFile(HLAdatabaseFile);
            HLAreads = HLADictionaryReader.GetReads();
            HLAnames = HLADictionaryReader.GetReadNames();
            HLAstartpos = HLADictionaryReader.GetStartPositions();
            HLAstoppos = HLADictionaryReader.GetStopPositions();
            InitializeVariables(HLAreads.length);
            out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);

            // Pick the population-specific allele frequency table.
            if (ethnicity.equals("Black")){
                AlleleFrequencyFile = BlackAlleleFrequencyFile;
            }else{
                AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
            }
            out.printf("INFO Reading HLA allele frequencies ... ");
            FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
            HLAfreqReader.ReadFile(AlleleFrequencyFile);
            AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
            out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());

            // Optional list of misaligned reads to exclude from the pileup.
            if (!filterFile.equals("")){
                out.printf("INFO Reading properties file ... ");
                SimilarityFileReader similarityReader = new SimilarityFileReader();
                similarityReader.ReadFile(filterFile);
                ReadsToDiscard = similarityReader.GetReadsToDiscard();
                out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size());
            }
        }
        return new Pair(0l,0l);
    }

    // Allocates and zeroes the n x n score matrices.
    private void InitializeVariables(int n){
        LOD = new int[n][n];
        LikelihoodScores = new int[n][n];
        for (int i = 0; i < n; i++){
            // NOTE(review): the inner-loop body was lost in the mangled patch;
            // presumed to zero both matrices — confirm against the original.
            for (int j = 0; j < n; j++){
                LOD[i][j] = 0;
                LikelihoodScores[i][j] = 0;
            }
        }
    }

    // NOTE(review): signature reconstructed per the standard LocusWalker
    // contract (tracker/ref/context) — confirm against the original.
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        List<SAMRecord> reads = context.getReads();
        if(reads.size() > 0 ) {
            List<Integer> offsets = context.getOffsets();

            int numAs = 0, numCs = 0, numGs = 0, numTs = 0;
            //if (DEBUG){
            // Locus prefix of the output row.  The DEBUG guard was left
            // commented out, so this prints at every covered locus.
            out.printf("%s\t%s\t", context.getLocation(),ref.getBase());
            //}

            //Calculate posterior probabilities
            GenotypeLikelihoods G = new ThreeStateErrorGenotypeLikelihoods();
            SAMRecord read; int offset; char base; byte qual; int mapquality; String readname;

            //Check for bad bases and ensure mapping quality
            for (int i = 0; i < reads.size(); i++) {
                read = reads.get(i);
                offset = offsets.get(i);
                base = read.getReadString().charAt(offset);
                qual = read.getBaseQualities()[offset];
                //mapquality = read.getMappingQuality();
                // Skip filtered (misaligned) reads and non-ACGT bases.
                if (!ReadsToDiscard.contains(read.getReadName()) && BaseUtils.simpleBaseToBaseIndex(base) != -1) {

                    //consider base in likelihood calculations if it looks good and has high mapping score
                    G.add(base, qual, read, offset);
                    if (DEBUG){
                        if (base == 'A'){numAs++;}
                        else if (base == 'C'){numCs++;}
                        else if (base == 'T'){numTs++;}
                        else if (base == 'G'){numGs++;}
                    }

                }
            }
            //if (DEBUG) {
            // Base counts are only accumulated when -debugHLA is set; the
            // print itself is unconditional (guard commented out above).
            out.printf("A[%s]C[%s]T[%s]G[%s]",numAs,numCs,numTs,numGs);
            for ( DiploidGenotype g : DiploidGenotype.values() ) {
                out.printf("\t%.2f",G.getLikelihood(g));
            }
            out.printf("\n");
            //}

        }
        return context.getReads().size();
    }

    // Accumulates (total reads seen, loci visited).
    public Pair<Long, Long> reduce(Integer value, Pair<Long, Long> sum) {
        long left = value.longValue() + sum.getFirst();
        long right = sum.getSecond() + 1l;
        return new Pair(left, right);
    }

    // All output is emitted in map(); nothing to finalize.
    public void onTraversalDone(Pair<Long, Long> result) {

    }
}
package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;

import java.util.ArrayList;
import java.util.Hashtable;

/**
 * Scores pairs of HLA dictionary alleles by how well they explain the phase
 * (co-occurrence of bases at pairs of polymorphic sites) observed in the
 * sequencing reads: map() accumulates per-read co-occurrence counts, and
 * onTraversalDone() prints a phase log-likelihood for each allele pair.
 *
 * @author shermanjia
 */
@Requires({DataSource.READS, DataSource.REFERENCE})
public class CalculatePhaseLikelihoodsWalker extends ReadWalker<Integer, Integer> {
    @Argument(fullName = "baseLikelihoods", shortName = "bl", doc = "Base likelihoods file", required = true)
    public String baseLikelihoodsFile = "";

    @Argument(fullName = "debugHLA", shortName = "debugHLA", doc = "Print debug", required = false)
    public boolean DEBUG = false;

    @Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
    public String filterFile = "";

    @Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false)
    public String ethnicity = "Caucasian";

    @Argument(fullName = "debugAlleles", shortName = "debugAlleles", doc = "Print likelihood scores for these alleles", required = false)
    public String debugAlleles = "";

    @Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
    public boolean ONLYFREQUENT = false;

    String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq";
    String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq";
    String AlleleFrequencyFile;          // chosen from the two above based on -ethnicity

    String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam";
    SAMFileReader HLADictionaryReader = new SAMFileReader();   // project-local reader, not net.sf.samtools
    boolean HLAdataLoaded = false;       // guards one-time initialization in reduceInit
    String[] HLAnames, HLAreads;         // dictionary allele names / sequences
    // BUG FIX: initialized to an empty list so map() does not throw a
    // NullPointerException when no -filter file is supplied.  (The original
    // only assigned this field inside the filterFile branch of reduceInit;
    // the sibling CalculateBaseLikelihoodsWalker already initializes it.)
    ArrayList<String> ReadsToDiscard = new ArrayList<String>();
    Integer[] HLAstartpos, HLAstoppos, PolymorphicSites;

    // Phase statistics, indexed by polymorphic-site number:
    //   numObservations[i*5 + base(c1)][j*5 + base(c2)] = reads with base c1 at site i AND c2 at site j
    //   totalObservations[i][j] = reads covering both sites with valid bases at each
    int[][] numObservations, totalObservations;
    int[] SNPnumInRead, SNPposInRead;    // scratch arrays, recomputed per read / per allele pair
    CigarParser cigarparser = new CigarParser();
    Hashtable AlleleFrequencies;         // allele name -> population frequency

    /** One-time setup: dictionary, filter list, polymorphic sites, and frequencies. */
    public Integer reduceInit() {

        if (!HLAdataLoaded){
            HLAdataLoaded = true;
            //Load HLA dictionary
            out.printf("INFO Loading HLA dictionary ... ");

            HLADictionaryReader.ReadFile(HLAdatabaseFile);
            HLAreads = HLADictionaryReader.GetReads();
            HLAnames = HLADictionaryReader.GetReadNames();
            HLAstartpos = HLADictionaryReader.GetStartPositions();
            HLAstoppos = HLADictionaryReader.GetStopPositions();
            out.printf("Done! %s HLA alleles loaded.\n",HLAreads.length);

            // Optional list of misaligned reads to exclude.
            if (!filterFile.equals("")){
                out.printf("INFO Reading properties file ... ");
                SimilarityFileReader similarityReader = new SimilarityFileReader();
                similarityReader.ReadFile(filterFile);
                ReadsToDiscard = similarityReader.GetReadsToDiscard();
                out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size());
            }

            // Polymorphic sites: either from the base-likelihoods file, or
            // every position in the classical class I genes as a fallback.
            if (!baseLikelihoodsFile.equals("")){
                //Get only sites found to be different from reference
                out.printf("INFO Reading base likelihoods file ... ");
                BaseLikelihoodsFileReader baseLikelihoodsReader = new BaseLikelihoodsFileReader();
                baseLikelihoodsReader.ReadFile(baseLikelihoodsFile, true);
                PolymorphicSites = baseLikelihoodsReader.GetPolymorphicSites();
                out.printf("%s polymorphic sites found\n",PolymorphicSites.length);
            }else{
                //use all sites in class 1 HLA
                out.printf("INFO Using all positions in the classical HLA genes ... ");
                PolymorphicSites = InitializePolymorphicSites();
            }
            int l = PolymorphicSites.length;
            SNPnumInRead = new int[l];
            SNPposInRead = new int[l];
            numObservations = new int[l*5][l*5];   // 5 slots per site (4 bases + 1 spare)
            totalObservations = new int[l][l];

            //Read allele frequencies
            if (ethnicity.equals("Black")){
                AlleleFrequencyFile = BlackAlleleFrequencyFile;
            }else{
                AlleleFrequencyFile = CaucasianAlleleFrequencyFile;
            }
            out.printf("INFO Reading HLA allele frequencies ... ");
            FrequencyFileReader HLAfreqReader = new FrequencyFileReader();
            HLAfreqReader.ReadFile(AlleleFrequencyFile);
            AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies();
            out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size());
        }
        return 0;
    }

    /** Misaligned reads contribute no phase information; all others update the counts. */
    public Integer map(char[] ref, SAMRecord read) {
        if (!ReadsToDiscard.contains(read.getReadName())){
            UpdateCorrelation(read);
        }
        return 1;
    }

    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    /**
     * Prints a phase likelihood for every same-gene pair of sufficiently
     * frequent alleles (optionally dumping a debug trace for -debugAlleles).
     */
    public void onTraversalDone(Integer numreads) {
        String name1, name2;
        Double frq1, frq2, likelihood, minfrq = 0.0;
        int numCombinations = 0;

        if (!debugAlleles.equals("")){
            String s[] = debugAlleles.split(",");
            int index1 = HLADictionaryReader.GetReadIndex(s[0]);
            int index2 = HLADictionaryReader.GetReadIndex(s[1]);
            if (index1 > -1 && index2 > -1){
                likelihood = CalculatePhaseLikelihood(index1,index2,true);
            }
        }

        if (ONLYFREQUENT){
            minfrq = 0.0001;   // drop very rare alleles when requested
        }

        out.printf("NUM\tAllele1\tAllele2\tPhase\tFrq1\tFrq2\n");
        for (int i = 0; i < HLAnames.length; i++){
            name1 = HLAnames[i].substring(4);   // strip the "HLA_" prefix
            if (!AlleleFrequencies.containsKey(name1)){ continue; }
            frq1 = Double.parseDouble(AlleleFrequencies.get(name1).toString());
            if (frq1 <= minfrq){ continue; }
            for (int j = i; j < HLAnames.length; j++){
                name2 = HLAnames[j].substring(4);
                // only pair alleles from the same HLA gene (matching first letter)
                if (!name1.substring(0,1).equals(name2.substring(0,1))){ continue; }
                if (!AlleleFrequencies.containsKey(name2)){ continue; }
                frq2 = Double.parseDouble(AlleleFrequencies.get(name2).toString());
                if (frq2 <= minfrq){ continue; }
                likelihood = CalculatePhaseLikelihood(i,j,false);
                numCombinations++;
                out.printf("%s\t%s\t%s\t%.2f\t%.2f\t%.2f\n",numCombinations,name1,name2,likelihood,Math.log10(frq1),Math.log10(frq2));
            }
        }
    }

    // Fallback site list: every position in HLA-A, -C, -B (in that order).
    // Coordinates are chr6 positions of the classical class I genes —
    // NOTE(review): reference build not stated in source; confirm before reuse.
    private Integer[] InitializePolymorphicSites(){
        int HLA_A_start = 30018310, HLA_A_end = 30021211, num_A_positions = HLA_A_end - HLA_A_start + 1;
        int HLA_B_start = 31430239, HLA_B_end = 31432914, num_B_positions = HLA_B_end - HLA_B_start + 1;
        int HLA_C_start = 31344925, HLA_C_end = 31347827, num_C_positions = HLA_C_end - HLA_C_start + 1;
        Integer[] polymorphicSites = new Integer[num_A_positions+num_B_positions+num_C_positions];
        for (int i = 0; i < num_A_positions; i++){
            polymorphicSites[i]=HLA_A_start + i;
        }
        for (int i = 0; i < num_C_positions; i++){
            polymorphicSites[i+num_A_positions]=HLA_C_start + i;
        }
        for (int i = 0; i < num_B_positions; i++){
            polymorphicSites[i+num_A_positions+num_C_positions]=HLA_B_start + i;
        }
        return polymorphicSites;
    }

    // Index of a base in {A,C,G,T}; -1 for anything else (deletions disabled).
    private int IndexOf(char c){
        switch(c){
            case 'A': return 0;
            case 'C': return 1;
            case 'G': return 2;
            case 'T': return 3;
            //case 'D': return 4;
            default: return -1;
        }
    }

    /**
     * Updates the phase-correlation tables with the SNPs carried by one read:
     * for every pair of polymorphic sites the read covers with valid bases,
     * increments the count for that specific base pair and the total for the
     * site pair.
     */
    private void UpdateCorrelation(SAMRecord read){
        String s = cigarparser.FormatRead(read.getCigarString(), read.getReadString());

        int readstart = read.getAlignmentStart();
        int readend = read.getAlignmentEnd();
        int numPositions = PolymorphicSites.length;
        char c1, c2;
        int a, b, i, j, SNPcount = 0;

        //Find all polymorphic sites strictly inside this read
        for (i = 0; i < numPositions; i++){
            if (PolymorphicSites[i] > readstart && PolymorphicSites[i] < readend){
                SNPnumInRead[i] = SNPcount;
                SNPposInRead[i] = PolymorphicSites[i]-readstart;
                SNPcount++;
            }else{
                SNPnumInRead[i] = -1;
                SNPposInRead[i] = -1;
            }
        }

        //Update correlation table for each combination of covered SNP positions
        for (i = 0; i < numPositions; i++){
            if (SNPnumInRead[i] > -1){
                c1 = s.charAt(SNPposInRead[i]);
                if (IndexOf(c1) > -1){
                    for (j = i+1; j < numPositions; j ++){
                        if (SNPnumInRead[j] > -1){
                            c2 = s.charAt(SNPposInRead[j]);
                            if (IndexOf(c2) > -1){
                                a = i*5 + IndexOf(c1);
                                b = j*5 + IndexOf(c2);

                                numObservations[a][b]++;
                                totalObservations[i][j]++;
                                if (DEBUG){
                                    out.printf("DEBUG %s\t%s %s\t[i=%s,j=%s]\t[%s,%s]\t[%s,%s]\n",read.getReadName(),PolymorphicSites[i],PolymorphicSites[j],i,j,c1,c2,a,b);
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    /**
     * Log10 likelihood that the ordered allele pair is consistent with the
     * observed phase counts: for each pair of polymorphic sites covered by
     * both alleles, the fraction of reads supporting the pair's base
     * combination (floored at 0.0001 to avoid log of zero) is accumulated.
     */
    private double CalculatePhaseLikelihood(int alleleIndex1, int alleleIndex2, boolean PRINTDEBUG){
        double likelihood = 0, prob = 0;
        int readstart1 = HLAstartpos[alleleIndex1]; int readend1 = HLAstoppos[alleleIndex1];
        int readstart2 = HLAstartpos[alleleIndex2]; int readend2 = HLAstoppos[alleleIndex2];
        int combinedstart = Math.max(readstart1,readstart2);
        int combinedstop = Math.min(readend1,readend2);

        int numPositions = PolymorphicSites.length, SNPcount = 0;
        int i, j, a1, a2, b1, b2;
        char c11, c12, c21, c22;
        int numInPhase = 0;

        //Find all polymorphic sites strictly inside the alleles' overlap
        for (i = 0; i < numPositions; i++){
            if (PolymorphicSites[i] > combinedstart && PolymorphicSites[i] < combinedstop){
                SNPnumInRead[i] = SNPcount;
                SNPposInRead[i] = PolymorphicSites[i]-combinedstart;
                SNPcount++;
            }else{
                SNPnumInRead[i] = -1;
                SNPposInRead[i] = -1;
            }
        }

        String s1 = HLAreads[alleleIndex1];
        String s2 = HLAreads[alleleIndex2];
        //Iterate through every pairwise combination of SNPs, updating the likelihood
        for (i = 0; i < numPositions; i++){
            if (SNPnumInRead[i] > -1){
                c11 = s1.charAt(SNPposInRead[i]);
                c21 = s2.charAt(SNPposInRead[i]);
                if (IndexOf(c11) > -1 && IndexOf(c21) > -1){
                    for (j = i+1; j < numPositions; j ++){
                        // only site pairs with at least one observed read
                        if (SNPnumInRead[j] > -1 && totalObservations[i][j] > 0){
                            c12 = s1.charAt(SNPposInRead[j]);
                            c22 = s2.charAt(SNPposInRead[j]);
                            if (IndexOf(c12) > -1 && IndexOf(c22) > -1){
                                a1 = i*5 + IndexOf(c11);
                                b1 = j*5 + IndexOf(c12);
                                a2 = i*5 + IndexOf(c21);
                                b2 = j*5 + IndexOf(c22);
                                //when both alleles agree at both sites, count the shared
                                //combination once; otherwise sum the two haplotypes' support
                                if ((c11 == c21) && (c12 == c22)){
                                    numInPhase = numObservations[a1][b1];
                                }else{
                                    numInPhase = numObservations[a1][b1] + numObservations[a2][b2];
                                }
                                prob = Math.max((double) numInPhase / (double) totalObservations[i][j], 0.0001);
                                likelihood += Math.log10(prob);

                                if (PRINTDEBUG){
                                    out.printf("DEBUG %s %s %s[%s%s] %s[%s%s]\t[%s,%s]\t[%s,%s] [%s,%s]\t%s / %s\t%.4f\t%.2f\n",HLAnames[alleleIndex1],HLAnames[alleleIndex2],PolymorphicSites[i],c11,c21,PolymorphicSites[j],c12,c22, i,j,a1,b1,a2,b2,numInPhase,totalObservations[i][j],prob,likelihood);
                                }
                            }
                        }
                    }
                }
            }
        }
        return likelihood;
    }
}
String al1 = "", al2 = "", al3 = "", al4 = ""; + + //String HLAdatabaseFile = "/Users/shermanjia/Work/HLA.sam"; - String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.4digitUnique.sam"; + //String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.4digitUnique.sam"; + String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam"; + //String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.sam"; //String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.4digitUnique.sam"; //String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/CEU_Founders_HLA.freq"; String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq"; String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq"; - + String AlleleFrequencyFile; ArrayList HLAreads = new ArrayList(); ArrayList HLAcigars = new ArrayList(); ArrayList HLAnames = new ArrayList(); ArrayList HLApositions = new ArrayList(); + ArrayList ReadsToFilter = new ArrayList(); ArrayList AllReads = new ArrayList(); ArrayList AllReadNames = new ArrayList(); @@ -82,13 +101,27 @@ public class CallHLAWalker extends LocusWalker>{ boolean DatabaseLoaded = false; boolean PrintedOutput = false; - boolean DEBUG = false; public Pair reduceInit() { - //Load sequences corresponding to HLA alleles from sam file if (!DatabaseLoaded){ try{ + //Load sequences corresponding to HLA alleles from sam file + if (!inputAlleles.equals("")){ + String[] str = inputAlleles.split(","); + al1 = str[0]; + al2 = str[1]; + al3 = str[2]; + al4 = str[3]; + } + + //set ethnic group to look up allele frequencies + if (ethnicity.equals("Black")){ + AlleleFrequencyFile = BlackAlleleFrequencyFile; + }else{ + AlleleFrequencyFile = CaucasianAlleleFrequencyFile; + } + out.printf("Reading HLA database ..."); FileInputStream fstream = new FileInputStream(HLAdatabaseFile); DataInputStream in = new 
DataInputStream(fstream); @@ -140,23 +173,23 @@ public class CallHLAWalker extends LocusWalker>{ CombinedAlleleFrequencies[i][j]=0.0; } //For debugging: get index for specific alleles - if (HLAnames.get(i).equals("HLA_A*3101")) + if (HLAnames.get(i).equals("HLA_" + al1)) j1 = i; - if (HLAnames.get(i).equals("HLA_A*3201")) + if (HLAnames.get(i).equals("HLA_" + al2)) k1 = i; - if (HLAnames.get(i).equals("HLA_A*3121")) + if (HLAnames.get(i).equals("HLA_" + al3)) j2 = i; - if (HLAnames.get(i).equals("HLA_A*3207")) + if (HLAnames.get(i).equals("HLA_" + al4)) k2 = i; } out.printf("DONE! Read %s alleles\n",HLAreads.size()); }catch (Exception e){//Catch exception if any - System.err.println("Error: " + e.getMessage()); + System.err.println("CallHLAWalker Error: " + e.getMessage()); } try{ out.printf("Reading allele frequences ..."); - FileInputStream fstream = new FileInputStream(CaucasianAlleleFrequencyFile); + FileInputStream fstream = new FileInputStream(AlleleFrequencyFile); DataInputStream in = new DataInputStream(fstream); BufferedReader br = new BufferedReader(new InputStreamReader(in)); String strLine; String [] s = null; @@ -170,9 +203,31 @@ public class CallHLAWalker extends LocusWalker>{ in.close(); out.printf("Done! Read %s alleles\n",count); }catch (Exception e){//Catch exception if any - System.err.println("Error: " + e.getMessage()); + System.err.println("CallHLAWalker Error: " + e.getMessage()); } + if (!filterFile.equals("")){ + try{ + out.printf("Reading reads to filter ..."); + FileInputStream fstream = new FileInputStream(filterFile); + DataInputStream in = new DataInputStream(fstream); + BufferedReader br = new BufferedReader(new InputStreamReader(in)); + String strLine; String [] s = null; + //Read File Line By Line + int count = 0; + while ((strLine = br.readLine()) != null){ + s = strLine.split("\\t"); + if (Integer.valueOf(s[6]) > 10){ + ReadsToFilter.add(s[0]); + } + count++; + } + in.close(); + out.printf("Done! 
%s reads to exclude\n",count); + }catch (Exception e){//Catch exception if any + System.err.println("CallHLAWalker Error: " + e.getMessage()); + } + } DatabaseLoaded = true; out.printf("Comparing reads to database ...\n"); @@ -283,21 +338,27 @@ public class CallHLAWalker extends LocusWalker>{ SAMRecord read; int offset; char base; byte qual; int mapquality; String readname; //Check for bad bases and ensure mapping quality myself. This works. - for (int i = 0; i < context.getReads().size(); i++) { - read = context.getReads().get(i); - offset = context.getOffsets().get(i); + for (int i = 0; i < reads.size(); i++) { + read = reads.get(i); + offset = offsets.get(i); base = read.getReadString().charAt(offset); qual = read.getBaseQualities()[offset]; mapquality = read.getMappingQuality(); if (mapquality >= 5 && BaseUtils.simpleBaseToBaseIndex(base) != -1) { - //consider base in likelihood calculations if it looks good and has high mapping score - G.add(base, qual, read, offset); - readname = read.getReadName(); - if (!AllReadNames.contains(readname)){AllReadNames.add(readname); AllReads.add(read);} - if (base == 'A'){numAs++; depth++;} - else if (base == 'C'){numCs++; depth++;} - else if (base == 'T'){numTs++; depth++;} - else if (base == 'G'){numGs++; depth++;} + if (ReadsToFilter.contains(read.getReadName())){ + if (DEBUG){ + out.printf("\n%s %s %s %s\n",read.getReadName(),read.getAlignmentStart(),read.getAlignmentEnd(),base); + } + }else{ + //consider base in likelihood calculations if it looks good and has high mapping score + G.add(base, qual, read, offset); + readname = read.getReadName(); + if (!AllReadNames.contains(readname)){AllReadNames.add(readname); AllReads.add(read);} + if (base == 'A'){numAs++; depth++;} + else if (base == 'C'){numCs++; depth++;} + else if (base == 'T'){numTs++; depth++;} + else if (base == 'G'){numGs++; depth++;} + } } } @@ -402,9 +463,9 @@ public class CallHLAWalker extends LocusWalker>{ } if ( DEBUG ){ //Debugging: print updated 
likelihoods for 2 sets of HLA alleles, as well as normalized likelihoods for all 10 genotypes - out.printf("LOD[%s%s]:%5.1fL:%5.1fLOD[%s%s]:%5.1fL:%5.1f\t",r1,r2,LOD[j1][k1],LikelihoodScores[j1][k1],s1,s2,LOD[j2][k2],LikelihoodScores[j2][k2]); + out.printf("Likelihoods %s%s[%5.1f] %s%s[%5.1f]\t",r1,r2,LikelihoodScores[j1][k1],s1,s2,LikelihoodScores[j2][k2]); for ( DiploidGenotype g : DiploidGenotype.values() ) { - out.printf("%s %5.1f\t",g.toString(),Scores.get(g.toString())); + out.printf("%s[%5.1f] ",g.toString(),Scores.get(g.toString())); } out.printf("\n"); } @@ -596,10 +657,6 @@ public class CallHLAWalker extends LocusWalker>{ return prob; } - public boolean isReduceByInterval() { - return true; - } - public Pair reduce(Integer value, Pair sum) { long left = value.longValue() + sum.getFirst(); long right = sum.getSecond() + 1l; @@ -653,7 +710,7 @@ public class CallHLAWalker extends LocusWalker>{ for (Integer i = 0; i < numHLAlleles; i++){ for (Integer j = i; j < numHLAlleles; j++){ if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1 && maxA > 0){ - if (maxA - LOD[i][j] <= 5 && maxA >= LOD[i][j]){ + if (maxA - LOD[i][j] <= 10 && maxA >= LOD[i][j]){ inverseMaxProbA = inverseMaxProbA + java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodA); if (!TopAlleles.contains(i)){TopAlleles.add(i);} if (!TopAlleles.contains(j)){TopAlleles.add(j);} @@ -662,7 +719,7 @@ public class CallHLAWalker extends LocusWalker>{ } } } else if (HLAnames.get(i).indexOf("HLA_B") > -1 && HLAnames.get(j).indexOf("HLA_B") > -1 && maxB > 0){ - if (maxB - LOD[i][j] <= 5 && maxB - LOD[i][j] >= 0){ + if (maxB - LOD[i][j] <= 10 && maxB - LOD[i][j] >= 0){ inverseMaxProbB = inverseMaxProbB + java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodB); if (!TopAlleles.contains(i)){TopAlleles.add(i);} if (!TopAlleles.contains(j)){TopAlleles.add(j);} @@ -671,7 +728,7 @@ public class CallHLAWalker extends LocusWalker>{ } } } else if (HLAnames.get(i).indexOf("HLA_C") 
> -1 && HLAnames.get(j).indexOf("HLA_C") > -1 && maxC > 0){ - if (maxC - LOD[i][j] <= 5 && maxC - LOD[i][j] >= 0){ + if (maxC - LOD[i][j] <= 10 && maxC - LOD[i][j] >= 0){ inverseMaxProbC = inverseMaxProbC + java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodC); if (!TopAlleles.contains(i)){TopAlleles.add(i);} if (!TopAlleles.contains(j)){TopAlleles.add(j);} @@ -754,11 +811,11 @@ public class CallHLAWalker extends LocusWalker>{ for (Integer i = 0; i < numHLAlleles; i++){ for (Integer j = i; j < numHLAlleles; j++){ if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1){ - if ((LOD[i][j] >= maxA - 5) && LOD[i][j] > 0){ + if ((LOD[i][j] >= maxA - 10) && LOD[i][j] > 0){ PhasingScores[i][j]= SinglePhaseScores[TopAlleles.indexOf(i)] * SinglePhaseScores[TopAlleles.indexOf(j)]; if (PhasingScores[i][j] > maxAphase){maxAphase = PhasingScores[i][j];} - alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.0001;} - alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.0001;} + alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.00001;} + alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.00001;} SingleAlleleFrequencies[i]=freq1; SingleAlleleFrequencies[j]=freq2; CombinedAlleleFrequencies[i][j]=freq1*freq2; if (CombinedAlleleFrequencies[i][j] > maxAfreq){maxAfreq = CombinedAlleleFrequencies[i][j];} likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodA)/inverseMaxProbA; @@ -767,11 +824,11 @@ public class CallHLAWalker extends LocusWalker>{ 
if (likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] > maxProbA){maxProbA = likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j];} } } else if (HLAnames.get(i).indexOf("HLA_B") > -1 && HLAnames.get(j).indexOf("HLA_B") > -1){ - if ((LOD[i][j] >= maxB - 5) && LOD[i][j] > 0){ + if ((LOD[i][j] >= maxB - 10) && LOD[i][j] > 0){ PhasingScores[i][j]= SinglePhaseScores[TopAlleles.indexOf(i)] * SinglePhaseScores[TopAlleles.indexOf(j)]; if (PhasingScores[i][j] > maxBphase){maxBphase = PhasingScores[i][j];} - alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.0001;} - alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.0001;} + alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.00001;} + alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.00001;} SingleAlleleFrequencies[i]=freq1; SingleAlleleFrequencies[j]=freq2; CombinedAlleleFrequencies[i][j]=freq1*freq2; if (freq1*freq2 > maxBfreq){maxBfreq = freq1*freq2;} likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodB)/inverseMaxProbB; @@ -780,12 +837,11 @@ public class CallHLAWalker extends LocusWalker>{ if (likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] > maxProbB){maxProbB = likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j];} } } else if (HLAnames.get(i).indexOf("HLA_C") > -1 && HLAnames.get(j).indexOf("HLA_C") > -1){ - if ((LOD[i][j] >= maxC - 5)&& LOD[i][j] > 0){ - + if ((LOD[i][j] >= maxC - 10)&& LOD[i][j] > 0){ PhasingScores[i][j]= 
SinglePhaseScores[TopAlleles.indexOf(i)] * SinglePhaseScores[TopAlleles.indexOf(j)]; if (PhasingScores[i][j] > maxCphase){maxCphase = PhasingScores[i][j];} - alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.0001;} - alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.0001;} + alleleA=HLAnames.get(i).substring(4); if (AlleleFrequencies.containsKey(alleleA)){freq1 = Double.parseDouble((String) AlleleFrequencies.get(alleleA).toString());}else{freq1=0.00001;} + alleleB=HLAnames.get(j).substring(4); if (AlleleFrequencies.containsKey(alleleB)){freq2 = Double.parseDouble((String) AlleleFrequencies.get(alleleB).toString());}else{freq2=0.00001;} SingleAlleleFrequencies[i]=freq1; SingleAlleleFrequencies[j]=freq2; CombinedAlleleFrequencies[i][j]=freq1*freq2; if (freq1*freq2 > maxCfreq){maxCfreq = freq1*freq2;} likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodC)/inverseMaxProbC; @@ -808,7 +864,7 @@ public class CallHLAWalker extends LocusWalker>{ for (Integer i = 0; i < numHLAlleles; i++){ for (Integer j = i; j < numHLAlleles; j++){ if (HLAnames.get(i).indexOf("HLA_A") > -1 && HLAnames.get(j).indexOf("HLA_A") > -1){ - if((LOD[i][j] >= maxA - 5) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxAphase + if((LOD[i][j] >= maxA - 10) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxAphase likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodA)/inverseMaxProbA; 
//out.printf("%s\t%s\tloglikelihood=%5.0f\tmax=%5.0f\tinvP=%.2f\tPrior=%.3f\tPhase=%.3f\tfreq1=%s\tfreq2=%s\tf1*f2=%.8f\tProb=%.3f",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],maxlikelihoodA,inverseMaxProbA,likelihoodPrior,PhasingScores[i][j],SingleAlleleFrequencies[i],SingleAlleleFrequencies[j],CombinedAlleleFrequencies[i][j],likelihoodPrior*CombinedAlleleFrequencies[i][j]/ProbSumA); out.printf("%s\t%s\tloglikelihood=%5.0f\tP(SSG)=%.3f\tP(Phase)=%.3f\tF1=%s\tF2=%s\tF1*F2=%.8f\tProb=%.3f",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],likelihoodPrior,PhasingScores[i][j]/PhaseSumA,SingleAlleleFrequencies[i],SingleAlleleFrequencies[j],CombinedAlleleFrequencies[i][j],likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j]/ProbSumA); @@ -816,14 +872,14 @@ public class CallHLAWalker extends LocusWalker>{ out.printf("\n"); } } else if (HLAnames.get(i).indexOf("HLA_B") > -1 && HLAnames.get(j).indexOf("HLA_B") > -1){ - if((LOD[i][j] >= maxB - 5) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxBphase + if((LOD[i][j] >= maxB - 10) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxBphase likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodB)/inverseMaxProbB; out.printf("%s\t%s\tloglikelihood=%5.0f\tP(SSG)=%.3f\tP(Phase)=%.3f\tF1=%s\tF2=%s\tF1*F2=%.8f\tProb=%.3f",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],likelihoodPrior,PhasingScores[i][j]/PhaseSumB,SingleAlleleFrequencies[i],SingleAlleleFrequencies[j],CombinedAlleleFrequencies[i][j],likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j]/ProbSumB); if (likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] == maxProbB){out.printf("\tBEST");} out.printf("\n"); } } else if (HLAnames.get(i).indexOf("HLA_C") > -1 && HLAnames.get(j).indexOf("HLA_C") > -1){ - if((LOD[i][j] >= maxC - 5) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxCphase + if((LOD[i][j] >= maxC - 10) && LOD[i][j] > 0){ // && PhasingScores[i][j] == maxCphase 
likelihoodPrior = java.lang.Math.pow(10,LikelihoodScores[i][j]-maxlikelihoodC)/inverseMaxProbC; out.printf("%s\t%s\tloglikelihood=%5.0f\tP(SSG)=%.3f\tP(Phase)=%.3f\tF1=%s\tF2=%s\tF1*F2=%.8f\tProb=%.3f",HLAnames.get(i),HLAnames.get(j),LikelihoodScores[i][j],likelihoodPrior,PhasingScores[i][j]/PhaseSumC,SingleAlleleFrequencies[i],SingleAlleleFrequencies[j],CombinedAlleleFrequencies[i][j],likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j]/ProbSumC); if (likelihoodPrior*CombinedAlleleFrequencies[i][j]*PhasingScores[i][j] == maxProbC){out.printf("\tBEST");} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ReadCigarFormatter.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CigarParser.java similarity index 76% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ReadCigarFormatter.java rename to java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CigarParser.java index e04a70fe3..5dc0e22fa 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ReadCigarFormatter.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CigarParser.java @@ -12,10 +12,19 @@ package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; * * @author shermanjia */ -public class ReadCigarFormatter { +public class CigarParser { + String formattedRead; + + public String GetFormattedRead(){ + return formattedRead; + } + + public String FormatRead(String cigar, String read){ // returns a cigar-formatted sequence (removes insertions, inserts 'D' to where deletions occur - String formattedRead = ""; char c; String count; + formattedRead = ""; + + char c; String count; int cigarPlaceholder = 0; int subcigarLength = 0; int readPlaceholder = 0; int subreadLength = 0; @@ -43,15 +52,25 @@ public class ReadCigarFormatter { //increment placeholders without adding inserted bases to sequence (effectively removes insertion). 
cigarPlaceholder = i+1; readPlaceholder = readPlaceholder + subreadLength; - } else if (c == 'H' || c == 'S'){ - //(H = Headers or S = Soft clipped removed here)*** + } else if (c == 'H'){ + //(H = hard clip) - //If reaches H for insertion, get number before 'H' and skip that many characters in sequence + //If reaches H for hard clip, simply carry on count = cigar.substring(cigarPlaceholder, i); subreadLength = Integer.parseInt(count); //increment cigar placeholder without adding inserted bases to sequence (effectively removes insertion). cigarPlaceholder = i+1; + } else if (c == 'S'){ + //(S = Soft clipped bases discarded here)*** + + //If reaches S for soft clip, get number before 'S' and skip that many characters in sequence + count = cigar.substring(cigarPlaceholder, i); + subreadLength = Integer.parseInt(count); + + //increment cigar placeholder without adding inserted bases to sequence (effectively removes insertion). + cigarPlaceholder = i+1; + readPlaceholder = readPlaceholder + subreadLength; } else if (c == 'D'){ //If reaches D for deletion, insert 'D' into sequence as placeholder count = cigar.substring(cigarPlaceholder, i); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreateHaplotypesWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreateHaplotypesWalker.java index 4732cbc98..ea8bf8575 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreateHaplotypesWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CreateHaplotypesWalker.java @@ -4,25 +4,15 @@ package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.playground.gatk.walkers.HLAcaller.ReadCigarFormatter; import org.broadinstitute.sting.gatk.walkers.*; - -import java.io.FileInputStream; -import java.io.BufferedReader; -import 
java.io.DataInputStream; -import java.io.InputStreamReader; -import java.util.ArrayList; import java.util.Hashtable; -import java.util.List; -import java.lang.Math; /** * * @author shermanjia */ @Requires({DataSource.READS, DataSource.REFERENCE}) public class CreateHaplotypesWalker extends ReadWalker { - ReadCigarFormatter formatter = new ReadCigarFormatter(); + CigarParser formatter = new CigarParser(); char c; boolean DEBUG = false; int HLA_A_start = 30018310; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestAlleleWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestAlleleWalker.java index e894335c7..17d644be4 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestAlleleWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestAlleleWalker.java @@ -5,142 +5,103 @@ package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.playground.gatk.walkers.HLAcaller.ReadCigarFormatter; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.cmdLine.Argument; -import java.io.FileInputStream; -import java.io.BufferedReader; -import java.io.DataInputStream; -import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Hashtable; -import java.util.List; -import java.lang.Math; /** * * @author shermanjia */ @Requires({DataSource.READS, DataSource.REFERENCE}) public class FindClosestAlleleWalker extends ReadWalker { - String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.sam"; - String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq"; + @Argument(fullName = "debugRead", shortName = "debugRead", doc = "Print match score for read", required = false) + public String debugRead = ""; + 
+ @Argument(fullName = "findFirst", shortName = "findFirst", doc = "For each read, stop when first HLA allele is found with concordance = 1", required = false) + public boolean findFirst = false; + @Argument(fullName = "debugAllele", shortName = "debugAllele", doc = "Print match score for allele", required = false) + public String debugAllele = ""; + + @Argument(fullName = "ethnicity", shortName = "ethnicity", doc = "Use allele frequencies for this ethnic group", required = false) + public String ethnicity = "Caucasian"; + + @Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false) + public boolean ONLYFREQUENT = false; + + String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA.nuc.imputed.4digit.sam"; + SAMFileReader HLADictionaryReader = new SAMFileReader(); + + String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_CaucasiansUSA.freq"; + String BlackAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_BlackUSA.freq"; + String AlleleFrequencyFile; + + String PolymorphicSitesFile = "/humgen/gsa-scr1/GSA/sjia/Sting/HLA.polymorphic.sites"; + boolean DatabaseLoaded = false; boolean DEBUG = false; - ArrayList HLAreads = new ArrayList(); - ArrayList HLAcigars = new ArrayList(); - ArrayList HLAnames = new ArrayList(); - ArrayList HLApositions = new ArrayList(); + String[] HLAnames, HLAreads; + Integer[] HLAstartpos, HLAstoppos, PolymorphicSites,NonPolymorphicSites; double[] SingleAlleleFrequencies; + double[] nummatched, concordance, numcompared; int numHLAlleles = 0; - int[] HLAstartpos; - int[] HLAstoppos; int minstartpos = 0; int maxstoppos = 0; int HLA_A_start = 30018310; int HLA_A_end = 30021211; - ArrayList PolymorphicSites = new ArrayList(); - Hashtable AlleleFrequencies = new Hashtable(); int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1; - ReadCigarFormatter formatter = new ReadCigarFormatter(); + 
CigarParser formatter = new CigarParser(); public Integer reduceInit() { - if (!DatabaseLoaded){ - try{ - out.printf("Reading HLA database ...\n"); - FileInputStream fstream = new FileInputStream(HLAdatabaseFile); - DataInputStream in = new DataInputStream(fstream); - BufferedReader br = new BufferedReader(new InputStreamReader(in)); - String strLine; String [] s = null; - //Read File Line By Line - int i = 0; - while ((strLine = br.readLine()) != null) { - s = strLine.split("\\t"); - - if (s.length>=10){ - //Parse the reads with cigar parser - HLAreads.add(formatter.FormatRead(s[5],s[9])); - HLAcigars.add(s[5]); - HLAnames.add(s[0]); - - HLApositions.add(s[3]); - if (s[0].indexOf("HLA_A") > -1){ - if (iAstart < 0){iAstart=i;} - iAstop = i; i++; - }else if (s[0].indexOf("HLA_B") > -1){ - if (iBstart < 0){iBstart=i;} - iBstop = i; i++; - }else if (s[0].indexOf("HLA_C") > -1){ - if (iCstart < 0){iCstart=i;} - iCstop = i; i++; - } - } - } - in.close(); - int n = HLApositions.size(); numHLAlleles = n; - HLAstartpos = new int[n]; HLAstoppos = new int[n]; - SingleAlleleFrequencies = new double[n]; - - - for (i = 0; i < n; i++){ - //Find start and stop positions for each allele - HLAstartpos[i]=Integer.parseInt(HLApositions.get(i)); - HLAstoppos[i]=HLAstartpos[i]+HLAreads.get(i).length()-1; - if (minstartpos == 0){minstartpos = HLAstartpos[i];} - minstartpos = Math.min(minstartpos, HLAstartpos[i]); - maxstoppos = Math.max(maxstoppos, HLAstoppos[i]); - SingleAlleleFrequencies[i]=0.0; - //Initialize matrix of probabilities / likelihoods - - } - out.printf("DONE! 
Read %s alleles\n",HLAreads.size()); - }catch (Exception e){//Catch exception if any - System.err.println("Error: " + e.getMessage()); - } - - try{ - out.printf("Reading allele frequences ..."); - FileInputStream fstream = new FileInputStream(CaucasianAlleleFrequencyFile); - DataInputStream in = new DataInputStream(fstream); - BufferedReader br = new BufferedReader(new InputStreamReader(in)); - String strLine; String [] s = null; - //Read File Line By Line - int count = 0; - while ((strLine = br.readLine()) != null) { - s = strLine.split("\\t"); - AlleleFrequencies.put(s[0].substring(0, 6), s[1]); - count++; - } - in.close(); - out.printf("Done! Read %s alleles\n",count); - }catch (Exception e){//Catch exception if any - System.err.println("Error: " + e.getMessage()); - } - - char c; + if (!DatabaseLoaded){ DatabaseLoaded = true; - //Find polymorphic sites in dictionary - for (int pos = minstartpos; pos <= maxstoppos; pos++){ - c = '0'; - for (int i = 0; i < HLAreads.size(); i++){ - if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){ - if (c == '0'){c = HLAreads.get(i).charAt(pos-HLAstartpos[i]);} - if (HLAreads.get(i).charAt(pos-HLAstartpos[i]) != c){ - PolymorphicSites.add(String.valueOf(pos)); - break; - } - } - } - } - out.printf("%s polymorphic sites found in HLA dictionary\n",PolymorphicSites.size()); - out.printf("Comparing reads to database ...\n"); + //Load HLA dictionary + out.printf("INFO Loading HLA dictionary ... "); + + HLADictionaryReader.ReadFile(HLAdatabaseFile); + HLAreads = HLADictionaryReader.GetReads(); + HLAnames = HLADictionaryReader.GetReadNames(); + HLAstartpos = HLADictionaryReader.GetStartPositions(); + HLAstoppos = HLADictionaryReader.GetStopPositions(); + minstartpos = HLADictionaryReader.GetMinStartPos(); + maxstoppos = HLADictionaryReader.GetMaxStopPos(); + + out.printf("Done! 
%s HLA alleles loaded.\n",HLAreads.length); + + nummatched = new double[HLAreads.length]; + concordance = new double[HLAreads.length]; + numcompared = new double[HLAreads.length]; + + //Read allele frequencies + if (ethnicity.equals("Black")){ + AlleleFrequencyFile = BlackAlleleFrequencyFile; + }else{ + AlleleFrequencyFile = CaucasianAlleleFrequencyFile; + } + out.printf("INFO Reading HLA allele frequencies ... "); + FrequencyFileReader HLAfreqReader = new FrequencyFileReader(); + HLAfreqReader.ReadFile(AlleleFrequencyFile); + AlleleFrequencies = HLAfreqReader.GetAlleleFrequencies(); + out.printf("Done! Frequencies for %s HLA alleles loaded.\n",AlleleFrequencies.size()); + + //FindPolymorphicSites(minstartpos,maxstoppos); + + PolymorphicSitesFileReader siteFileReader = new PolymorphicSitesFileReader(); + siteFileReader.ReadFile(PolymorphicSitesFile); + PolymorphicSites = siteFileReader.GetPolymorphicSites(); + NonPolymorphicSites = siteFileReader.GetNonPolymorphicSites(); + + + out.printf("INFO %s polymorphic and %s non-polymorphic sites found in HLA dictionary\n",PolymorphicSites.length,NonPolymorphicSites.length); + out.printf("INFO Comparing reads to database ...\n"); if (DEBUG){ //out.printf("Astart[%s]\tAstop[%s]\tBstart[%s]\tBstop[%s]\tCstart[%s]\tCstop[%s]\tnumAlleles[%s]\n",iAstart,iAstop,iBstart,iBstop,iCstart,iCstop,numHLAlleles); @@ -149,97 +110,148 @@ public class FindClosestAlleleWalker extends ReadWalker { return 0; } + private void FindPolymorphicSites(int start, int stop){ + boolean initialized, polymorphic, examined; + char c = ' '; + ArrayList polymorphicsites = new ArrayList(); + ArrayList nonpolymorphicsites = new ArrayList(); + //Find polymorphic sites in dictionary + for (int pos = start; pos <= stop; pos++){ + initialized = false; polymorphic = false; examined = false; + //look across all alleles at specific position to see if it is polymorphic + for (int i = 0; i < HLAreads.length; i++){ + if (pos >= HLAstartpos[i] && pos <= HLAstoppos[i]){ 
+ if (!initialized){ + c = HLAreads[i].charAt(pos-HLAstartpos[i]); + initialized = true; + examined = true; + } + if (HLAreads[i].charAt(pos-HLAstartpos[i]) != c){ + polymorphicsites.add(pos); + out.printf("POLYMORPHIC\t6\t%s\n", pos); + polymorphic = true; + break; + } + } - public Integer map(char[] ref, SAMRecord read) { + } + if (!polymorphic && examined){ + nonpolymorphicsites.add(pos); + out.printf("CONSERVED\t6\t%s\n", pos); + } + } + PolymorphicSites = polymorphicsites.toArray(new Integer[polymorphicsites.size()]); + NonPolymorphicSites = nonpolymorphicsites.toArray(new Integer[nonpolymorphicsites.size()]); + } + + private double CalculateConcordance(SAMRecord read){ int readstart = read.getAlignmentStart(); int readstop = read.getAlignmentEnd(); - double[] nummatched = new double[HLAreads.size()]; - double[] concordance = new double[HLAreads.size()]; - double[] numcompared = new double[HLAreads.size()]; - double maxConcordance = 0; - String s1 = formatter.FormatRead(read.getCigarString(), read.getReadString()); + int numpolymorphicsites, numnonpolymorphicsites, pos; char c1, c2; - String s2 = "", name = ""; + double maxConcordance = 0.0, freq = 0.0, minFreq = 0.0; + String s1 = formatter.FormatRead(read.getCigarString(), read.getReadString()); + String s2; + int allelestart, allelestop; - //For every allele that overlaps with current read - for (int i = 0; i < HLAreads.size(); i++){ - nummatched[i] = 0; + numpolymorphicsites = PolymorphicSites.length; + numnonpolymorphicsites = NonPolymorphicSites.length; + + if (ONLYFREQUENT){ + minFreq = 0.0001; + } + + for (int i = 0; i < HLAreads.length; i++){ + nummatched[i] = 0; concordance[i] = 0; numcompared[i] = 0; + freq = GetAlleleFrequency(HLAnames[i]); //Get concordance between read and specific allele - if (DEBUG){ - //out.printf("%s\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames.get(i),readstart,readstop,HLAstartpos[i],HLAstoppos[i]); - } - if (readstart <= HLAstoppos[i] && readstop >= HLAstartpos[i]){ 
- s2 = HLAreads.get(i); - for (int j = read.getAlignmentStart(); j <= read.getAlignmentEnd(); j++){ - c1 = s1.charAt(j-readstart); - c2 = s2.charAt(j-HLAstartpos[i]); - if (DEBUG){ - //out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames.get(i),j,j-readstart,j-HLAstartpos[i],c1,c2); - } - if (c1 != 'D'){ - if (PolymorphicSites.contains(String.valueOf(j))){ + if (readstart <= HLAstoppos[i] && readstop >= HLAstartpos[i] && freq > minFreq){ + s2 = HLAreads[i]; + + allelestart = HLAstartpos[i]; + allelestop = HLAstoppos[i]; + + //Polymorphic sites: always increment denominator, increment numerator when bases are concordant + for (int j = 0; j < numpolymorphicsites; j++){ + pos = PolymorphicSites[j]; + if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){ + c1 = s1.charAt(pos-readstart); + c2 = s2.charAt(pos-allelestart); + if (c1 != 'D'){ numcompared[i]++; if (c1 == c2){ nummatched[i]++; + }else{ + if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){ + out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2); + } } - }else if (c1 != c2){ + } + } + } + + //Non-polymorphic sites: increment denominator only when bases are discordant + + for (int j = 0; j < numnonpolymorphicsites; j++){ + pos = NonPolymorphicSites[j]; + if (pos >= readstart && pos <= readstop && pos >= allelestart && pos <= allelestop){ + c1 = s1.charAt(pos-readstart); + c2 = s2.charAt(pos-allelestart); + if (c1 != c2){ numcompared[i]++; + if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){ + out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(), HLAnames[i], j, c1,c2); + } } } } } + + //Update concordance array concordance[i]=nummatched[i]/numcompared[i]; - if (concordance[i] > maxConcordance){maxConcordance = concordance[i];} + if (debugRead.equals(read.getReadName()) && debugAllele.equals(HLAnames[i])){ + 
out.printf("%s\t%s\t%s\t%s\t%s\n",read.getReadName(),HLAnames[i],concordance[i],numcompared[i],numcompared[i]-nummatched[i]); + } + if (findFirst && (concordance[i] == 1)){ + break; + } + } - double freq, freq2, maxFreq = 0.0; - for (int i = 0; i < HLAreads.size(); i++){ + + return maxConcordance; + } + + private double FindMaxAlleleFrequency(double maxConcordance){ + //finds the max frequency of the alleles that share the maximum concordance with the read of interest + double freq, maxFreq = 0.0; + for (int i = 0; i < HLAreads.length; i++){ if (concordance[i] == maxConcordance && maxConcordance > 0){ - if (HLAnames.get(i).length() >= 10){ - name=HLAnames.get(i).substring(4,10); - }else{ - name=HLAnames.get(i).substring(4); - } - if (AlleleFrequencies.containsKey(name)){ - freq = Double.parseDouble((String) AlleleFrequencies.get(name).toString()); - if (DEBUG){ - out.printf("%s\t%s\t%s\t%s\t%s\t%.4f\n",read.getReadName(),name,nummatched[i],numcompared[i],concordance[i],freq); - } - }else{ - freq=0.0001; - } + freq = GetAlleleFrequency(HLAnames[i]); if (freq > maxFreq){maxFreq = freq;} } } + return maxFreq; + } - if (read.getReadName().length() >= 10){ - name=read.getReadName().substring(4,10); - }else{ - name=read.getReadName().substring(4); - } + public Integer map(char[] ref, SAMRecord read) { + //Calculate concordance for this read and all overlapping reads + double maxConcordance = CalculateConcordance(read); + + String readname = read.getReadName(), allelename = ""; double freq; + //For input bam files that contain HLA alleles, find and print allele frequency + freq = GetAlleleFrequency(readname); + out.printf("%s\t%.4f", readname,freq); + + //Find the maximum frequency of the alleles most concordant with the read + double maxFreq = FindMaxAlleleFrequency(maxConcordance); - if (AlleleFrequencies.containsKey(name)){ - freq = Double.parseDouble((String) AlleleFrequencies.get(name).toString()); - }else{ - freq=0.0001; - } - - out.printf("%s\t%.4f", 
read.getReadName(),freq); - for (int i = 0; i < HLAreads.size(); i++){ + //Print concordance statistics between this read and the most similar HLA allele(s) + for (int i = 0; i < HLAreads.length; i++){ if (concordance[i] == maxConcordance && maxConcordance > 0){ - if (HLAnames.get(i).length() >= 10){ - name=HLAnames.get(i).substring(4,10); - }else{ - name=HLAnames.get(i).substring(4); - } - if (AlleleFrequencies.containsKey(name)){ - freq = Double.parseDouble((String) AlleleFrequencies.get(name).toString()); - }else{ - freq=0.0001; - } + freq = GetAlleleFrequency(HLAnames[i]); if (freq == maxFreq){ - - out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames.get(i),freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]); + out.printf("\t%s\t%.4f\t%.3f\t%.0f\t%.0f",HLAnames[i],freq,concordance[i],numcompared[i],numcompared[i]-nummatched[i]); } } } @@ -247,6 +259,22 @@ public class FindClosestAlleleWalker extends ReadWalker { return 1; } + private double GetAlleleFrequency(String allelename){ + double frequency = 0.0; + //Truncate names to 4-digit "A*0101" format + if (allelename.length() >= 10){ + allelename=allelename.substring(4,10); + }else{ + allelename=allelename.substring(4); + } + if (AlleleFrequencies.containsKey(allelename)){ + frequency = Double.parseDouble((String) AlleleFrequencies.get(allelename).toString()); + }else{ + frequency=0.0001; + } + return frequency; + } + public Integer reduce(Integer value, Integer sum) { return value + sum; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FrequencyFileReader.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FrequencyFileReader.java new file mode 100644 index 000000000..bd585a2ac --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FrequencyFileReader.java @@ -0,0 +1,39 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ + +package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; + +import java.io.*; +import java.util.Hashtable; +/** + * + * @author shermanjia + */ +public class FrequencyFileReader { + Hashtable AlleleFrequencies = new Hashtable(); + + public Hashtable GetAlleleFrequencies(){ + return AlleleFrequencies; + } + + public void ReadFile(String filename){ + try{ + FileInputStream fstream = new FileInputStream(filename); + DataInputStream in = new DataInputStream(fstream); + BufferedReader br = new BufferedReader(new InputStreamReader(in)); + String strLine; String [] s = null; + //Read File Line By Line + while ((strLine = br.readLine()) != null) { + s = strLine.split("\\t"); + AlleleFrequencies.put(s[0], s[1]); + //System.out.printf("Loaded: %s\t%s\n",s[0],AlleleFrequencies.get(s[0]).toString()); + } + in.close(); + }catch (Exception e){//Catch exception if any + System.err.println("FrequencyFileReader Error: " + e.getMessage()); + } + } +} + diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ImputeAllelesWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ImputeAllelesWalker.java index aeba3145a..1ba81b049 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ImputeAllelesWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/ImputeAllelesWalker.java @@ -5,8 +5,6 @@ package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.playground.gatk.walkers.HLAcaller.ReadCigarFormatter; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.walkers.*; import java.io.FileInputStream; @@ -15,8 +13,6 @@ import java.io.DataInputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Hashtable; -import java.util.List; -import java.lang.Math; /** * * @author shermanjia @@ -53,7 +49,7 @@ public class 
ImputeAllelesWalker extends ReadWalker { Hashtable ClosestAllele = new Hashtable(); int iAstart = -1, iAstop = -1, iBstart = -1, iBstop = -1, iCstart = -1, iCstop = -1; - ReadCigarFormatter formatter = new ReadCigarFormatter(); + CigarParser formatter = new CigarParser(); public Integer reduceInit() { if (!DatabaseLoaded){ @@ -106,7 +102,7 @@ public class ImputeAllelesWalker extends ReadWalker { } out.printf("DONE! Read %s alleles\n",HLAreads.size()); }catch (Exception e){//Catch exception if any - System.err.println("Error: " + e.getMessage()); + System.err.println("ImputeAllelesWalker Error: " + e.getMessage()); } try{ @@ -126,7 +122,7 @@ public class ImputeAllelesWalker extends ReadWalker { in.close(); out.printf("Done! Read %s alleles\n",count); }catch (Exception e){//Catch exception if any - System.err.println("Error: " + e.getMessage()); + System.err.println("ImputeAllelesWalker Error: " + e.getMessage()); } char c; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/PolymorphicSitesFileReader.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/PolymorphicSitesFileReader.java new file mode 100644 index 000000000..a6d840821 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/PolymorphicSitesFileReader.java @@ -0,0 +1,49 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ + +package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; + +import java.io.*; +import java.util.ArrayList; +/** + * + * @author shermanjia + */ +public class PolymorphicSitesFileReader { + ArrayList PolymorphicSites = new ArrayList(); + ArrayList NonPolymorphicSites = new ArrayList(); + + + public Integer[] GetPolymorphicSites(){ + return PolymorphicSites.toArray(new Integer[PolymorphicSites.size()]); + } + + public Integer[] GetNonPolymorphicSites(){ + return NonPolymorphicSites.toArray(new Integer[NonPolymorphicSites.size()]); + } + + public void ReadFile(String filename){ + try{ + FileInputStream fstream = new FileInputStream(filename); + DataInputStream in = new DataInputStream(fstream); + BufferedReader br = new BufferedReader(new InputStreamReader(in)); + String strLine; String [] s = null; + //Read File Line By Line + int i = 0; + while ((strLine = br.readLine()) != null) { + s = strLine.split("\\t"); + if (s[0].equals("POLYMORPHIC")){ + PolymorphicSites.add(Integer.valueOf(s[2])); + }else{ + NonPolymorphicSites.add(Integer.valueOf(s[2])); + } + } + in.close(); + }catch (Exception e){//Catch exception if any + System.err.println("PolymorphicSitesFileReader Error: " + e.getMessage()); + } + } +} + diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SAMFileReader.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SAMFileReader.java new file mode 100644 index 000000000..f52a075ac --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SAMFileReader.java @@ -0,0 +1,94 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ + +package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; + +import java.io.*; +import java.util.ArrayList; +/** + * + * @author shermanjia + */ +public class SAMFileReader { + ArrayList ReadStrings = new ArrayList(); + ArrayList CigarStrings = new ArrayList(); + ArrayList ReadNames = new ArrayList(); + ArrayList ReadStartPositions = new ArrayList(); + ArrayList ReadStopPositions = new ArrayList(); + int minstartpos; + int maxstoppos; + + CigarParser formatter = new CigarParser(); + + public String[] GetReads(){ + return ReadStrings.toArray(new String[ReadStrings.size()]); + } + + public String[] GetReadNames(){ + return ReadNames.toArray(new String[ReadNames.size()]); + } + + public String[] GetCigarStrings(){ + return CigarStrings.toArray(new String[CigarStrings.size()]); + } + + public Integer[] GetStartPositions(){ + return ReadStartPositions.toArray(new Integer[ReadStartPositions.size()]); + } + + public Integer[] GetStopPositions(){ + return ReadStopPositions.toArray(new Integer[ReadStopPositions.size()]); + } + + public Integer GetMinStartPos(){ + return minstartpos; + } + + public Integer GetMaxStopPos(){ + return maxstoppos; + } + + public int GetReadIndex(String readname){ + if (ReadNames.contains(readname)){ + return ReadNames.indexOf(readname); + }else{ + return -1; + } + } + + public void ReadFile(String filename){ + try{ + FileInputStream fstream = new FileInputStream(filename); + DataInputStream in = new DataInputStream(fstream); + BufferedReader br = new BufferedReader(new InputStreamReader(in)); + String strLine; String [] s = null; + //Read File Line By Line + int i = 0; + while ((strLine = br.readLine()) != null) { + s = strLine.split("\\t"); + if (s.length>=10){ + //Parse the reads with cigar parser + String read = formatter.FormatRead(s[5],s[9]); + ReadStrings.add(read); + CigarStrings.add(s[5]); + ReadNames.add(s[0]); + ReadStartPositions.add(Integer.valueOf(s[3])); + ReadStopPositions.add(Integer.valueOf(s[3]) + 
read.length() - 1); + if (i == 0){ + minstartpos = Integer.valueOf(s[3]); + maxstoppos = Integer.valueOf(Integer.valueOf(s[3]) + read.length() - 1); + } + minstartpos = Math.min(minstartpos, Integer.valueOf(s[3])); + maxstoppos = Math.max(maxstoppos, Integer.valueOf(Integer.valueOf(s[3]) + read.length() - 1)); + i++; + } + } + in.close(); + }catch (Exception e){//Catch exception if any + System.err.println("SAMFileReader Error: " + e.getMessage()); + } + } +} + diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java new file mode 100644 index 000000000..2b94d1c7c --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java @@ -0,0 +1,49 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.HLAcaller; + +import java.io.*; +import java.util.ArrayList; +/** + * + * @author shermanjia + */ +public class SimilarityFileReader { + ArrayList ReadsToDiscard = new ArrayList(); + + public ArrayList GetReadsToDiscard(){ + return ReadsToDiscard; + } + + public String[] GetReadsToDiscardArray(){ + return ReadsToDiscard.toArray(new String[ReadsToDiscard.size()]); + } + + public void ReadFile(String filename){ + try{ + FileInputStream fstream = new FileInputStream(filename); + DataInputStream in = new DataInputStream(fstream); + BufferedReader br = new BufferedReader(new InputStreamReader(in)); + String strLine; String [] s = null; + //Read File Line By Line + int i = 0; + while ((strLine = br.readLine()) != null) { + s = strLine.split("\\t"); + if (s.length >= 7){ + Double matchFraction = Double.valueOf(s[4]); + int numMismatches = Integer.valueOf(s[6]); + if ((matchFraction < 0.9 && numMismatches > 3) || (numMismatches >= 6)){ + ReadsToDiscard.add(s[0]); + } + } + } + 
in.close(); + }catch (Exception e){//Catch exception if any + System.err.println("SimilarityFileReader Error: " + e.getMessage()); + } + } +} +