diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index ffcaad54c..12f55e60f 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -63,7 +63,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit, UAC.GET_GAP_PENALTIES_FROM_DATA, UAC.INDEL_RECAL_FILE); alleleList = new ArrayList(); getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 0e2ecb378..5d26b9e89 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -27,6 +27,9 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.Input; + +import java.io.File; public class UnifiedArgumentCollection { @@ -101,6 +104,13 @@ public class UnifiedArgumentCollection { public boolean DO_CONTEXT_DEPENDENT_PENALTIES = true; //gdebug+ @Hidden + // experimental arguments, NOT TO BE USED BY ANYONE WHOSE INITIALS AREN'T GDA!!! + @Argument(fullName = "getGapPenaltiesFromData", shortName = "dataGP", doc = "Vary gap penalties by context - EXPERIMENTAL, DO NO USE", required = false) + public boolean GET_GAP_PENALTIES_FROM_DATA = false; + + @Argument(fullName="indel_recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NO USE") + public File INDEL_RECAL_FILE = new File("indel.recal_data.csv"); + @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false) public boolean OUTPUT_DEBUG_INDEL_INFO = false; @Hidden @@ -145,7 +155,9 @@ public class UnifiedArgumentCollection { uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO; uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE; uac.DO_CONTEXT_DEPENDENT_PENALTIES = DO_CONTEXT_DEPENDENT_PENALTIES; - + + uac.GET_GAP_PENALTIES_FROM_DATA = GET_GAP_PENALTIES_FROM_DATA; + uac.INDEL_RECAL_FILE = INDEL_RECAL_FILE; // todo- arguments to remove uac.COVERAGE_AT_WHICH_TO_ABORT = COVERAGE_AT_WHICH_TO_ABORT; uac.dovit = dovit; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index dd81570a1..4e328ded3 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -30,17 +30,28 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.Covariate; +import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDataManager; +import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDatum; +import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalibrationArgumentCollection; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.collections.NestedHashMap; +import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.genotype.Haplotype; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.text.XReadLines; +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.regex.Pattern; public class PairHMMIndelErrorModel { @@ -103,10 +114,30 @@ public class PairHMMIndelErrorModel { private final double[] GAP_OPEN_PROB_TABLE; private final double[] GAP_CONT_PROB_TABLE; + private boolean getGapPenaltiesFromFile = false; - static { + private int SMOOTHING = 1; + private int MAX_QUALITY_SCORE = 50; + private int PRESERVE_QSCORES_LESS_THAN = 5; + + ///////////////////////////// + // Private Member Variables + ///////////////////////////// +//copy+ + private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps + private final ArrayList requestedCovariates = new ArrayList(); // List of covariates to be used in this calculation + private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); + private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*"); + private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); + protected static final String EOF_MARKER = "EOF"; + private long numReadsWithMalformedColorSpace = 0; + private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values. + +//copy- + static { LOG_ONE_HALF= -Math.log10(2.0); - END_GAP_COST = LOG_ONE_HALF; + END_GAP_COST = LOG_ONE_HALF; baseMatchArray = new double[MAX_CACHED_QUAL+1]; baseMismatchArray = new double[MAX_CACHED_QUAL+1]; @@ -119,6 +150,130 @@ public class PairHMMIndelErrorModel { } } + public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP, boolean dovit,boolean gpf, File RECAL_FILE) { + + this(indelGOP, indelGCP, deb, doCDP, dovit); + this.getGapPenaltiesFromFile = gpf; + + // read data from recal file + // gdebug - start copy from TableRecalibrationWalker + if (gpf) { + boolean sawEOF = false; + boolean REQUIRE_EOF = false; + + int lineNumber = 0; + boolean foundAllCovariates = false; + // Get a list of all available covariates + final List> classes = new PluginManager(Covariate.class).getPlugins(); + + try { + for ( String line : new XReadLines(RECAL_FILE) ) { + lineNumber++; + if ( EOF_MARKER.equals(line) ) { + sawEOF = true; + } else if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) { + ; // Skip over the comment lines, (which start with '#') + } + // Read in the covariates that were used from the input file + else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data + if( foundAllCovariates ) { + throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE ); + } else { // Found the covariate list in input file, loop through all of them and instantiate them + String[] vals = line.split(","); + for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical + boolean foundClass = false; + for( Class covClass : classes ) { + if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) { + foundClass = true; + try { + Covariate covariate = (Covariate)covClass.newInstance(); + requestedCovariates.add( covariate ); + } catch (Exception e) { + throw new DynamicClassResolutionException(covClass, e); + } + + } + } + + if( !foundClass ) { + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." ); + } + } + } + + } else { // Found a line of data + if( !foundAllCovariates ) { + foundAllCovariates = true; + + // At this point all the covariates should have been found and initialized + if( requestedCovariates.size() < 2 ) { + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE ); + } + + final boolean createCollapsedTables = true; + + // Initialize any covariate member variables using the shared argument collection + for( Covariate cov : requestedCovariates ) { + cov.initialize( RAC ); + } + // Initialize the data hashMaps + dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() ); + + } + addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap + } + } + + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e); + } catch ( NumberFormatException e ) { + throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker."); + } + + if ( !sawEOF ) { + final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool."; + if ( REQUIRE_EOF ) + throw new UserException.MalformedFile(RECAL_FILE, errorMessage); + } + + if( dataManager == null ) { + throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?"); + } + + // Create the tables of empirical quality scores that will be used in the sequential calculation + dataManager.generateEmpiricalQualities( SMOOTHING, MAX_QUALITY_SCORE ); + } + // debug end copy + + } + /** + * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches) + * @param line A line of CSV data read from the recalibration table data file + */ + private void addCSVData(final File file, final String line) { + final String[] vals = line.split(","); + + // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly + if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical + throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line + + " --Perhaps the read group string contains a comma and isn't being parsed correctly."); + } + + final Object[] key = new Object[requestedCovariates.size()]; + Covariate cov; + int iii; + for( iii = 0; iii < requestedCovariates.size(); iii++ ) { + cov = requestedCovariates.get( iii ); + key[iii] = cov.getValue( vals[iii] ); + } + + // Create a new datum using the number of observations, number of mismatches, and reported quality score + final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 ); + // Add that datum to all the collapsed tables which will be used in the sequential calculation + dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN ); + } + + public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP, boolean dovit) { this(indelGOP, indelGCP, deb, doCDP); this.doViterbi = dovit; @@ -152,11 +307,11 @@ public class PairHMMIndelErrorModel { for (int i=START_HRUN_GAP_IDX; i < MAX_HRUN_GAP_IDX; i++) { gop += step; if (gop > maxGOP) - gop = maxGOP; + gop = maxGOP; gcp += step; if(gcp > maxGCP) - gcp = maxGCP; + gcp = maxGCP; GAP_OPEN_PROB_TABLE[i] = gop; GAP_CONT_PROB_TABLE[i] = gcp; } @@ -399,14 +554,20 @@ public class PairHMMIndelErrorModel { bestActionArrayM[indI][indJ] = ACTIONS_M[bestMetricIdx]; // update X array - // State X(i,j): X(1:i) aligned to a gap in Y(1:j). + // State X(i,j): X(1:i) aligned to a gap in Y(1:j). // When in last column of X, ie X(1:i) aligned to full Y, we don't want to penalize gaps //c = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: currentGOP[jm1]); //d = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: currentGCP[jm1]); - c = currentGOP[jm1]; - d = currentGCP[jm1]; - if (indJ == Y_METRIC_LENGTH-1) + if (getGapPenaltiesFromFile) { + c = currentGOP[im1]; + d = logGapContinuationProbability; + + } else { + c = currentGOP[jm1]; + d = currentGCP[jm1]; + } + if (indJ == Y_METRIC_LENGTH-1) c = d = END_GAP_COST; if (doViterbi) { @@ -426,8 +587,14 @@ public class PairHMMIndelErrorModel { // update Y array //c = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentGOP[jm1]); //d = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentGCP[jm1]); - c = currentGOP[jm1]; - d = currentGCP[jm1]; + if (getGapPenaltiesFromFile) { + c = currentGOP[im1]; + d = logGapContinuationProbability; + } + else { + c = currentGOP[jm1]; + d = currentGCP[jm1]; + } if (indI == X_METRIC_LENGTH-1) c = d = END_GAP_COST; @@ -511,7 +678,7 @@ public class PairHMMIndelErrorModel { i--; j--; } - } + } @@ -541,7 +708,7 @@ public class PairHMMIndelErrorModel { else { contextLogGapOpenProbabilities[hIndex][i] = GAP_OPEN_PROB_TABLE[hrunProfile[i]]; contextLogGapContinuationProbabilities[hIndex][i] = GAP_CONT_PROB_TABLE[hrunProfile[i]]; - } + } } } public synchronized double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List haplotypesInVC, @@ -559,7 +726,7 @@ public class PairHMMIndelErrorModel { System.out.println(new String(ref.getBases())); } - if (doContextDependentPenalties) { + if (doContextDependentPenalties && !getGapPenaltiesFromFile) { // will context dependent probabilities based on homopolymet run. Probabilities are filled based on total complete haplotypes. for (int j=0; j < haplotypesInVC.size(); j++) { @@ -584,23 +751,59 @@ public class PairHMMIndelErrorModel { } } for (SAMRecord pread : pileup.getReads()) { - SAMRecord read = ReadUtils.hardClipAdaptorSequence(pread); + SAMRecord read = ReadUtils.hardClipAdaptorSequence(pread); if (read == null) continue; - if(ReadUtils.is454Read(read)) { + if(ReadUtils.is454Read(read) && !getGapPenaltiesFromFile) { continue; } - // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi)) - // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent - if (DEBUG) - System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(), - read.getAlignmentStart(), - read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(), - read.getCigarString()); + double[] recalQuals = null; + if (getGapPenaltiesFromFile) { + RecalDataManager.parseSAMRecord( read, RAC ); + recalQuals = new double[read.getReadLength()]; + + //compute all covariate values for this read + final Comparable[][] covariateValues_offset_x_covar = + RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates); + // For each base in the read + for( int offset = 0; offset < read.getReadLength(); offset++ ) { + + final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset]; + + Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey); + if(qualityScore == null) + { + qualityScore = performSequentialQualityCalculation( fullCovariateKey ); + qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey); + } + + recalQuals[offset] = (double)qualityScore; + } + + // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi)) + // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent + if (DEBUG) { + System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(), + read.getAlignmentStart(), + read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(), + read.getCigarString()); + + byte[] bases = read.getReadBases(); + for (int k = 0; k < recalQuals.length; k++) { + System.out.format("%c",bases[k]); + } + System.out.println(); + + for (int k = 0; k < recalQuals.length; k++) { + System.out.format("%.0f ",recalQuals[k]); + } + System.out.println(); + } + } // get bases of candidate haplotypes that overlap with reads final int trailingBases = 3; @@ -641,12 +844,12 @@ public class PairHMMIndelErrorModel { */ // remove soft clips if necessary if ((read.getAlignmentStart()>=eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || - (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) { + (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) { numStartSoftClippedBases = 0; numEndSoftClippedBases = 0; } - + byte[] unclippedReadBases, unclippedReadQuals; @@ -715,6 +918,12 @@ public class PairHMMIndelErrorModel { byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases, unclippedReadBases.length-numEndClippedBases); + double[] recalCDP = null; + if (getGapPenaltiesFromFile) { + recalCDP = Arrays.copyOfRange(recalQuals,numStartClippedBases, + unclippedReadBases.length-numEndClippedBases); + + } if (DEBUG) { System.out.println("Read bases:"); @@ -751,11 +960,17 @@ public class PairHMMIndelErrorModel { double[] currentContextGCP = null; if (doContextDependentPenalties) { - currentContextGOP = Arrays.copyOfRange(contextLogGapOpenProbabilities[j], (int)indStart, (int)indStop); - currentContextGCP = Arrays.copyOfRange(contextLogGapContinuationProbabilities[j], (int)indStart, (int)indStop); - } - readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP); + if (getGapPenaltiesFromFile) { + readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, recalCDP, null); + + } else { + currentContextGOP = Arrays.copyOfRange(contextLogGapOpenProbabilities[j], (int)indStart, (int)indStop); + currentContextGCP = Arrays.copyOfRange(contextLogGapContinuationProbabilities[j], (int)indStart, (int)indStop); + readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP); + } + } + } else readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals); @@ -824,5 +1039,76 @@ public class PairHMMIndelErrorModel { } - + /** + * Implements a serial recalibration of the reads using the combinational table. + * First, we perform a positional recalibration, and then a subsequent dinuc correction. + * + * Given the full recalibration table, we perform the following preprocessing steps: + * + * - calculate the global quality score shift across all data [DeltaQ] + * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift + * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual + * - The final shift equation is: + * + * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) + * @param key The list of Comparables that were calculated from the covariates + * @return A recalibrated quality score as a byte + */ + private byte performSequentialQualityCalculation( final Object... key ) { + + final byte qualFromRead = (byte)Integer.parseInt(key[1].toString()); + final Object[] readGroupCollapsedKey = new Object[1]; + final Object[] qualityScoreCollapsedKey = new Object[2]; + final Object[] covariateCollapsedKey = new Object[3]; + + // The global quality shift (over the read group only) + readGroupCollapsedKey[0] = key[0]; + final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey )); + double globalDeltaQ = 0.0; + if( globalRecalDatum != null ) { + final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality(); + final double aggregrateQReported = globalRecalDatum.getEstimatedQReported(); + globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported; + } + + // The shift in quality between reported and empirical + qualityScoreCollapsedKey[0] = key[0]; + qualityScoreCollapsedKey[1] = key[1]; + final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey )); + double deltaQReported = 0.0; + if( qReportedRecalDatum != null ) { + final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality(); + deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; + } + + // The shift in quality due to each covariate by itself in turn + double deltaQCovariates = 0.0; + double deltaQCovariateEmpirical; + covariateCollapsedKey[0] = key[0]; + covariateCollapsedKey[1] = key[1]; + for( int iii = 2; iii < key.length; iii++ ) { + covariateCollapsedKey[2] = key[iii]; // The given covariate + final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey )); + if( covariateRecalDatum != null ) { + deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality(); + deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) ); + } + } + + final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; + return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE ); + + // Verbose printouts used to validate with old recalibrator + //if(key.contains(null)) { + // System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d", + // qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte)); + //} + //else { + // System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d", + // key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) ); + //} + + //return newQualityByte; + } + } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/CountCovariatesGatherer.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/CountCovariatesGatherer.java new file mode 100755 index 000000000..3a50fbe4d --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/CountCovariatesGatherer.java @@ -0,0 +1,106 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; + +import org.broadinstitute.sting.commandline.Gatherer; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.HashMap; +import java.util.List; +import java.util.regex.Pattern; + +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 3/29/11 + * Time: 3:54 PM + * To change this template use File | Settings | File Templates. + */ + + +public class CountCovariatesGatherer extends Gatherer { + + ///////////////////////////// + // Private Member Variables + ///////////////////////////// + private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); + private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); + private static final String EOF_MARKER = "EOF"; + + private HashMap dataMap; + + + private void addCSVData (String line) { + String[] covariates = line.split(","); + String key = ""; + RecalDatumOptimized values; + + for (int i = 0; i < covariates.length-3; i++) { + key += covariates[i] + ","; + } + + values = new RecalDatumOptimized(Integer.parseInt(covariates[covariates.length-3]), + Integer.parseInt(covariates[covariates.length-2])); + + if (dataMap.get(key) != null) { + RecalDatumOptimized currentValues = dataMap.get(key); + values.increment(currentValues); + } + + dataMap.put(key, values); + } + + @Override + public void gather(List inputs, File output) { + dataMap = new HashMap(); + PrintStream o; + try { + o = new PrintStream(output); + } catch ( FileNotFoundException e) { + throw new UserException("File to be output by CountCovariates Gather function was not found"); + } + + boolean sawEOF = false; + boolean printedHeader = false; + + // Read input files + for ( File RECAL_FILE : inputs) { + try { + for ( String line : new XReadLines(RECAL_FILE) ) { + if ( EOF_MARKER.equals(line) ) { + sawEOF = true; // sanity check + } + else if(COMMENT_PATTERN.matcher(line).matches()) { + ; // It doesn't make any sense to print intermediate comments, unless we merge them somehow (would require strict definition for the header) + } + else if (COVARIATE_PATTERN.matcher(line).matches()) { + if (!printedHeader) + o.println(line); + } + else { // Found a line of data + addCSVData(line); // Parse the line and add the data to the HashMap + } + } + + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e); + } + + if ( !sawEOF ) { + final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted!"; + throw new UserException.MalformedFile(RECAL_FILE, errorMessage); + } + printedHeader = true; + } + + // Write output file from dataMap + for(String key : dataMap.keySet()) { + RecalDatumOptimized values = dataMap.get(key); + String v = values.getNumObservations() + "," + values.getNumMismatches() + "," + values.empiricalQualByte(); + o.println(key + v); + } + o.println("EOF"); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/Covariate.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/Covariate.java new file mode 100755 index 000000000..c84494908 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/Covariate.java @@ -0,0 +1,56 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; + +import net.sf.samtools.SAMRecord; +//g import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Oct 30, 2009 + * + * The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read, offset, and corresponding reference bases + * In general most error checking and adjustments to the data are done before the call to the covariates getValue methods in order to speed up the code. + * This unfortunately muddies the code, but most of these corrections can be done per read while the covariates get called per base, resulting in a big speed up. + */ + +public interface Covariate { + public void initialize( RecalibrationArgumentCollection RAC ); // Initialize any member variables using the command-line arguments passed to the walkers + public Comparable getValue( String str ); // Used to get the covariate's value from input csv file in TableRecalibrationWalker + public void getValues( SAMRecord read, Comparable[] comparable ); //Takes an array of size (at least) read.getReadLength() and fills it with covariate + //values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows + //read-specific calculations to be done just once rather than for each offset. +} + +interface RequiredCovariate extends Covariate { +} + +interface StandardCovariate extends Covariate { +} + +interface ExperimentalCovariate extends Covariate { +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/CycleCovariate.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/CycleCovariate.java new file mode 100755 index 000000000..4e8565b6a --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/CycleCovariate.java @@ -0,0 +1,280 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; + +import net.sf.samtools.SAMRecord; + +//g import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.util.Arrays; +import java.util.List; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Oct 30, 2009 + * + * The Cycle covariate. + * For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read) + * For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle + * For example, for the read: AAACCCCGAAATTTTTACTG + * the cycle would be 11111111222333333344 + * For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round + */ + +public class CycleCovariate implements StandardCovariate { + // Initialize any member variables using the command-line arguments passed to the walkers + public void initialize( final RecalibrationArgumentCollection RAC ) { + if( RAC.DEFAULT_PLATFORM != null ) { + if( RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "SLX" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "ILLUMINA" ) || + RAC.DEFAULT_PLATFORM.contains( "454" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "SOLID" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "ABI_SOLID" ) ) { + // nothing to do + } else { + throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM +") is not a recognized platform. Implemented options are illumina, 454, and solid"); + } + } + } + + /* + // Used to pick out the covariate's value from attributes of the read + public final Comparable getValue( final SAMRecord read, final int offset ) { + + int cycle = 1; + + //----------------------------- + // ILLUMINA and SOLID + //----------------------------- + + if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SLX" ) || // Some bams have "illumina" and others have "SLX" + read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "ABI_SOLID" )) { // Some bams have "solid" and others have "ABI_SOLID" + cycle = offset + 1; + if( read.getReadNegativeStrandFlag() ) { + cycle = read.getReadLength() - offset; + } + } + + //----------------------------- + // 454 + //----------------------------- + + else if( read.getReadGroup().getPlatform().contains( "454" ) ) { // Some bams have "LS454" and others have just "454" + final byte[] bases = read.getReadBases(); + + // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change + // For example, AAAAAAA was probably read in two flow cycles but here we count it as one + if( !read.getReadNegativeStrandFlag() ) { // Forward direction + int iii = 0; + while( iii <= offset ) + { + while( iii <= offset && bases[iii] == (byte)'T' ) { iii++; } + while( iii <= offset && bases[iii] == (byte)'A' ) { iii++; } + while( iii <= offset && bases[iii] == (byte)'C' ) { iii++; } + while( iii <= offset && bases[iii] == (byte)'G' ) { iii++; } + if( iii <= offset ) { cycle++; } + if( iii <= offset && !BaseUtils.isRegularBase(bases[iii]) ) { iii++; } + + } + } else { // Negative direction + int iii = bases.length-1; + while( iii >= offset ) + { + while( iii >= offset && bases[iii] == (byte)'T' ) { iii--; } + while( iii >= offset && bases[iii] == (byte)'A' ) { iii--; } + while( iii >= offset && bases[iii] == (byte)'C' ) { iii--; } + while( iii >= offset && bases[iii] == (byte)'G' ) { iii--; } + if( iii >= offset ) { cycle++; } + if( iii >= offset && !BaseUtils.isRegularBase(bases[iii]) ) { iii--; } + } + } + } + + //----------------------------- + // SOLID (unused), only to be used in conjunction with PrimerRoundCovariate + //----------------------------- + + //else if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) { + // // The ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf + // int pos = offset + 1; + // if( read.getReadNegativeStrandFlag() ) { + // pos = read.getReadLength() - offset; + // } + // cycle = pos / 5; // integer division + //} + + //----------------------------- + // UNRECOGNIZED PLATFORM + //----------------------------- + + else { // Platform is unrecognized so revert to the default platform but warn the user first + if( defaultPlatform != null) { // The user set a default platform + if( !warnedUserBadPlatform ) { + Utils.warnUser( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " + + "Defaulting to platform = " + defaultPlatform + "." ); + } + warnedUserBadPlatform = true; + + read.getReadGroup().setPlatform( defaultPlatform ); + return getValue( read, offset ); // A recursive call + } else { // The user did not set a default platform + throw new StingException( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " + + "No default platform specified. Users must set the default platform using the --default_platform argument." ); + } + } + + // Differentiate between first and second of pair. + // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group + // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. + // Therefore the cycle covariate must differentiate between first and second of pair reads. + // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because + // the current sequential model would consider the effects independently instead of jointly. + if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { + cycle *= -1; + } + + return cycle; + } + */ + + // todo -- this should be put into a common place in the code base + private static List PACBIO_NAMES = Arrays.asList("PACBIO"); + private static List ILLUMINA_NAMES = Arrays.asList("ILLUMINA", "SLX", "SOLEXA"); + private static List SOLID_NAMES = Arrays.asList("SOLID"); + private static List LS454_NAMES = Arrays.asList("454"); + + private static boolean isPlatform(SAMRecord read, List names) { + String pl = read.getReadGroup().getPlatform().toUpperCase(); + for ( String name : names ) + if ( pl.contains( name ) ) + return true; + return false; + } + + // Used to pick out the covariate's value from attributes of the read + public void getValues(SAMRecord read, Comparable[] comparable) { + + //----------------------------- + // ILLUMINA and SOLID + //----------------------------- + + + if( isPlatform(read, ILLUMINA_NAMES) || isPlatform(read, SOLID_NAMES) || isPlatform(read, PACBIO_NAMES)) { + final int init; + final int increment; + if( !read.getReadNegativeStrandFlag() ) { + // Differentiate between first and second of pair. + // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group + // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. + // Therefore the cycle covariate must differentiate between first and second of pair reads. + // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because + // the current sequential model would consider the effects independently instead of jointly. + if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { + //second of pair, positive strand + init = -1; + increment = -1; + } + else + { + //first of pair, positive strand + init = 1; + increment = 1; + } + + } else { + if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { + //second of pair, negative strand + init = -read.getReadLength(); + increment = 1; + } + else + { + //first of pair, negative strand + init = read.getReadLength(); + increment = -1; + } + } + + int cycle = init; + for(int i = 0; i < read.getReadLength(); i++) { + comparable[i] = cycle; + cycle += increment; + } + } + else if ( isPlatform(read, LS454_NAMES) ) { // Some bams have "LS454" and others have just "454" + + final int readLength = read.getReadLength(); + final byte[] bases = read.getReadBases(); + + // Differentiate between first and second of pair. + // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group + // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. + // Therefore the cycle covariate must differentiate between first and second of pair reads. + // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because + // the current sequential model would consider the effects independently instead of jointly. + final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag(); + + int cycle = multiplyByNegative1 ? -1 : 1; + + // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change + // For example, AAAAAAA was probably read in two flow cycles but here we count it as one + if( !read.getReadNegativeStrandFlag() ) { // Forward direction + int iii = 0; + while( iii < readLength ) + { + while( iii < readLength && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii++; } + while( iii < readLength && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii++; } + while( iii < readLength && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii++; } + while( iii < readLength && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii++; } + if( iii < readLength ) { if (multiplyByNegative1) cycle--; else cycle++; } + if( iii < readLength && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii++; } + + } + } else { // Negative direction + int iii = readLength-1; + while( iii >= 0 ) + { + while( iii >= 0 && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii--; } + while( iii >= 0 && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii--; } + while( iii >= 0 && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii--; } + while( iii >= 0 && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii--; } + if( iii >= 0 ) { if (multiplyByNegative1) cycle--; else cycle++; } + if( iii >= 0 && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii--; } + } + } + } + else { + throw new IllegalStateException("This method hasn't been implemented yet for " + read.getReadGroup().getPlatform()); + } + + + } + + // Used to get the covariate's value from input csv file in TableRecalibrationWalker + public final Comparable getValue( final String str ) { + return Integer.parseInt( str ); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/Dinuc.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/Dinuc.java new file mode 100755 index 000000000..2d1fc592e --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/Dinuc.java @@ -0,0 +1,72 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 16, 2009 + */ +//public class Dinuc implements Comparable{ +public class Dinuc implements Comparable{ + private byte first; + private byte second; + + public Dinuc() { + first = 0; + second = 0; + } + + public Dinuc(final byte _first, final byte _second) { + first = _first; + second = _second; + } + + public final void setValues(final byte _first, final byte _second) { + first = _first; + second = _second; + } + + public int compareTo(final org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.Dinuc that) { + if( this.first > that.first ) { return 1; } + else if( this.first < that.first ) { return -1; } + else { //this.first equals that.first + if( this.second > that.second ) { return 1; } + else if( this.second < that.second ) { return -1; } + else { return 0; } + } + + } + + public static int hashBytes(final byte byte1, final byte byte2) { + return byte1 << 8 + byte2; + } + + public String toString() { // This method call is how the Dinuc will get written out to the table recalibration file + byte[] byteArray = {first,second}; + return new String(byteArray); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/HomopolymerCovariate.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/HomopolymerCovariate.java new file mode 100755 index 000000000..3b821b0e1 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/HomopolymerCovariate.java @@ -0,0 +1,141 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; + +import net.sf.samtools.SAMRecord; +//g import org.broadinstitute.sting.gatk.walkers.recalibration.ExperimentalCovariate; +//g import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; + + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Dec 4, 2009 + * + * The Homopolymer Run Covariate. This is the number of consecutive bases in the previous N that match the current base. + * For example, if N = 10: + * ATTGCCCCGTAAAAAAAAATA + * 001001230001234567800 + */ + +public class HomopolymerCovariate implements ExperimentalCovariate { + + int numBack = 7; + + // Initialize any member variables using the command-line arguments passed to the walkers + public void initialize( final RecalibrationArgumentCollection RAC ) { + numBack = RAC.HOMOPOLYMER_NBACK; + } + + // Used to pick out the covariate's value from attributes of the read + public final Comparable getValue( final SAMRecord read, final int offset ) { + + // This block of code is for if you don't want to only count consecutive bases + // ATTGCCCCGTAAAAAAAAATA + // 001001231211234567819 + /* + int numAgree = 0; // The number of bases that agree with you in the previous numBack bases of the read + int startPos = 0; + int stopPos = 0; + byte[] bases = read.getReadBases(); + byte thisBase = bases[offset]; + if( !read.getReadNegativeStrandFlag() ) { // Forward direction + startPos = Math.max(offset - numBack, 0); + stopPos = Math.max(offset - 1, 0); + } else { // Negative direction + startPos = Math.min(offset + 2, bases.length); + stopPos = Math.min(offset + numBack + 1, bases.length); + } + + for( int iii = startPos; iii < stopPos; iii++ ) { + if( bases[iii] == thisBase ) { numAgree++; } + } + */ + + int numAgree = 0; // The number of consecutive bases that agree with you in the previous numBack bases of the read + final byte[] bases = read.getReadBases(); + int iii = offset; + if( !read.getReadNegativeStrandFlag() ) { // Forward direction + while( iii <= bases.length-2 && bases[iii] == bases[iii+1] && numAgree < numBack ) { + numAgree++; + iii++; + } + } else { // Negative direction + while( iii >= 1 && bases[iii] == bases[iii-1] && numAgree < numBack ) { + numAgree++; + iii--; + } + } + + return numAgree; + } + + private void getContextHomopolymerLength(final byte[] refBytes, Comparable[] hrunArray) { + // compute forward hrun length, example: + // AGGTGACCCCCCTGAGAG + // 001000012345000000 + int runCount = 0; + hrunArray[0] = 0; + int[] hforward = new int[hrunArray.length]; + int[] hreverse = new int[hrunArray.length]; + + for (int i = 1; i < refBytes.length; i++) { + if (refBytes[i] == refBytes[i-1]) + hforward[i] = hforward[i-1]+1; + else + hforward[i] = 0; + } + + // do similar thing for reverse length, example: + // AGGTGACCCCCCTGAGAG + // 021000543210000000 + // and then accumulate with forward values. + // Total: + // AGGTGACCCCCCTGAGAG + // 022000555555000000 + for (int i=refBytes.length-1; i > 0; i--) { + if (refBytes[i-1] == refBytes[i]) + hreverse[i-1] += hreverse[i]+1; + } + + for (int i = 1; i < refBytes.length; i++) + hrunArray[i] = hforward[i]+hreverse[i]; + } + + public void getValues(SAMRecord read, Comparable[] comparable) { + + // getContextHomopolymerLength(read.getReadBases(), comparable); + for(int iii = 0; iii < read.getReadLength(); iii++) { + comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized + } + } + + // Used to get the covariate's value from input csv file in TableRecalibrationWalker + public final Comparable getValue( final String str ) { + return Integer.parseInt( str ); + } + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelCountCovariate.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelCountCovariate.java new file mode 100755 index 000000000..4e7bc06e4 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelCountCovariate.java @@ -0,0 +1,113 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; + + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMRecord; +//g import org.broadinstitute.sting.gatk.walkers.recalibration.ExperimentalCovariate; +//g import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.BaseUtils; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 3, 2009 + * + * The Reported Quality Score covariate. + */ + +public class IndelCountCovariate implements ExperimentalCovariate { + + // Initialize any member variables using the command-line arguments passed to the walkers + public void initialize( final RecalibrationArgumentCollection RAC ) { + } + + + // Used to pick out the covariate's value from attributes of the read + public final Comparable getValue( final SAMRecord read ) { + + Cigar c = read.getCigar(); + + int indelCount = 0; + + for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { + CigarElement ce = c.getCigarElement(i); + switch( ce.getOperator() ) { + case D: + indelCount++; + break; + case I: + indelCount++; + break; + case M: + case N: + case S: + default: + break; + } + } + + return indelCount; + } + + public void getValues(SAMRecord read, Comparable[] comparable) { +/* Comparable numIndels = getValue(read); + for(int iii = 0; iii < read.getReadLength(); iii++) { + comparable[iii] = numIndels; // BUGBUG: this can be optimized + } */ + int ind = 0; + Cigar c = read.getCigar(); + System.out.println(c.toString()); + for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { + CigarElement ce = c.getCigarElement(i); + + switch( ce.getOperator() ) { + case D: + case I: + for (int k=0; k < ce.getLength(); k++) + comparable[ind++] = ce.getLength(); + break; + case M: + case N: + case S: + for (int k=0; k < ce.getLength(); k++) + comparable[ind++] = 0; + default: + break; + } + } + } + + + // Used to get the covariate's value from input csv file in TableRecalibrationWalker + public final Comparable getValue( final String str ) { + return Integer.parseInt( str ); + } + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelCountCovariatesWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelCountCovariatesWalker.java new file mode 100755 index 000000000..3e983bb3c --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelCountCovariatesWalker.java @@ -0,0 +1,637 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; + +import org.broad.tribble.bed.BEDCodec; +import org.broad.tribble.dbsnp.DbSNPCodec; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFCodec; +import org.broadinstitute.sting.commandline.Gather; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +//import org.broadinstitute.sting.gatk.walkers.recalibration.*; +//import org.broadinstitute.sting.gatk.walkers.recalibration.CountCovariatesGatherer; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.collections.NestedHashMap; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +/** + * This walker is designed to work as the first pass in a two-pass processing step. + * It does a by-locus traversal operating only at sites that are not in dbSNP. + * We assume that all reference mismatches we see are therefore errors and indicative of poor base quality. + * This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinucleotide) + * Since there is a large amount of data one can then calculate an empirical probability of error + * given the particular covariates seen at this site, where p(error) = num mismatches / num observations + * The output file is a CSV list of (the several covariate values, num observations, num mismatches, empirical quality score) + * The first non-comment line of the output file gives the name of the covariates that were used for this calculation. + * + * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified + * Note: This walker is designed to be used in conjunction with TableRecalibrationWalker. + * + * @author rpoplin + * @since Nov 3, 2009 + * @help.summary First pass of the recalibration. Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide). + */ + +@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) +@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file +@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality +@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta +@PartitionBy(PartitionType.LOCUS) +// todo - merge with CountCovariates, all work is done, just need to port over +public class IndelCountCovariatesWalker extends LocusWalker implements TreeReducible { + + ///////////////////////////// + // Constants + ///////////////////////////// + private static final String SKIP_RECORD_ATTRIBUTE = "SKIP"; //used to label GATKSAMRecords that should be skipped. + private static final String SEEN_ATTRIBUTE = "SEEN"; //used to label GATKSAMRecords as processed. + private static final String COVARS_ATTRIBUTE = "COVARS"; //used to store covariates array as a temporary attribute inside GATKSAMRecord. + + ///////////////////////////// + // Shared Arguments + ///////////////////////////// + @ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + + @Output + PrintStream out; + + ///////////////////////////// + // Command Line Arguments + ///////////////////////////// + @Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the outputted covariates table recalibration file") + @Gather(CountCovariatesGatherer.class) + public PrintStream RECAL_FILE; + + @Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false) + private boolean LIST_ONLY = false; + @Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required=false) + private String[] COVARIATES = null; + @Argument(fullName="standard_covs", shortName="standard", doc="Use the standard set of covariates in addition to the ones listed using the -cov argument", required=false) + private boolean USE_STANDARD_COVARIATES = false; + + @Argument(fullName="count_indels", shortName="indels", doc="Count covariates at indel sites", required=false) + private boolean COUNT_INDELS = false; + + ///////////////////////////// + // Debugging-only Arguments + ///////////////////////////// + @Argument(fullName="dont_sort_output", shortName="unsorted", required=false, doc="If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.") + private boolean DONT_SORT_OUTPUT = false; + @Argument(fullName="run_without_dbsnp_potentially_ruining_quality", shortName="run_without_dbsnp_potentially_ruining_quality", required=false, doc="If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.") + private boolean RUN_WITHOUT_DBSNP = false; + + ///////////////////////////// + // Private Member Variables + ///////////////////////////// + private final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps + private final ArrayList requestedCovariates = new ArrayList(); // A list to hold the covariate objects that were requested + private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) + private static int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen) + + public static class CountedData { + private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file + private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file + private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file + private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base + private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted. + + private long dbSNPCountsMM = 0, dbSNPCountsBases = 0; // mismatch/base counts for dbSNP loci + private long novelCountsMM = 0, novelCountsBases = 0; // mismatch/base counts for non-dbSNP loci + private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation + + /** + * Adds the values of other to this, returning this + * @param other + * @return this object + */ + public CountedData add(CountedData other) { + countedSites += other.countedSites; + countedBases += other.countedBases; + skippedSites += other.skippedSites; + solidInsertedReferenceBases += other.solidInsertedReferenceBases; + otherColorSpaceInconsistency += other.otherColorSpaceInconsistency; + dbSNPCountsMM += other.dbSNPCountsMM; + dbSNPCountsBases += other.dbSNPCountsBases; + novelCountsMM += other.novelCountsMM; + novelCountsBases += other.novelCountsBases; + lociSinceLastDbsnpCheck += other.lociSinceLastDbsnpCheck; + return this; + } + } + + // enable deletions in the pileup + public boolean includeReadsWithDeletionAtLoci() { return COUNT_INDELS; } + + // enable extended events for indels + public boolean generateExtendedEvents() { return COUNT_INDELS; } + + //--------------------------------------------------------------------------------------------------------------- + // + // initialize + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * Parse the -cov arguments and create a list of covariates to be used here + * Based on the covariates' estimates for initial capacity allocate the data hashmap + */ + public void initialize() { + + if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; } + if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; } + + // Get a list of all available covariates + final List> covariateClasses = new PluginManager( Covariate.class ).getPlugins(); + final List> requiredClasses = new PluginManager( RequiredCovariate.class ).getPlugins(); + final List> standardClasses = new PluginManager( StandardCovariate.class ).getPlugins(); + + // Print and exit if that's what was requested + if ( LIST_ONLY ) { + out.println( "Available covariates:" ); + for( Class covClass : covariateClasses ) { + out.println( covClass.getSimpleName() ); + } + out.println(); + + System.exit( 0 ); // Early exit here because user requested it + } + + // Warn the user if no dbSNP file or other variant mask was specified + boolean foundDBSNP = false; + for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) { + if( rod != null ) { + if( rod.getType().equals(DbSNPCodec.class) || + rod.getType().equals(VCFCodec.class) || + rod.getType().equals(BEDCodec.class) ) { + foundDBSNP = true; + break; + } + } + } + if( !foundDBSNP && !RUN_WITHOUT_DBSNP ) { + throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a dbSNP ROD or a VCF file containing known sites of genetic variation."); + } + + // Initialize the requested covariates by parsing the -cov argument + // First add the required covariates + if( requiredClasses.size() == 2) { // readGroup and reported quality score + requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here + requestedCovariates.add( new QualityScoreCovariate() ); + } else { + throw new UserException.CommandLineException("There are more required covariates than expected. The instantiation list needs to be updated with the new required covariate and in the correct order."); + } + // Next add the standard covariates if -standard was specified by the user + if( USE_STANDARD_COVARIATES ) { + // We want the standard covariates to appear in a consistent order but the packageUtils method gives a random order + // A list of Classes can't be sorted, but a list of Class names can be + final List standardClassNames = new ArrayList(); + for( Class covClass : standardClasses ) { + standardClassNames.add( covClass.getName() ); + } + Collections.sort(standardClassNames); // Sort the list of class names + for( String className : standardClassNames ) { + for( Class covClass : standardClasses ) { // Find the class that matches this class name + if( covClass.getName().equals( className ) ) { + try { + final Covariate covariate = (Covariate)covClass.newInstance(); + requestedCovariates.add( covariate ); + } catch (Exception e) { + throw new DynamicClassResolutionException(covClass, e); + } + } + } + } + } + // Finally parse the -cov arguments that were provided, skipping over the ones already specified + if( COVARIATES != null ) { + for( String requestedCovariateString : COVARIATES ) { + boolean foundClass = false; + for( Class covClass : covariateClasses ) { + if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class + foundClass = true; + if( !requiredClasses.contains( covClass ) && (!USE_STANDARD_COVARIATES || !standardClasses.contains( covClass )) ) { + try { + // Now that we've found a matching class, try to instantiate it + final Covariate covariate = (Covariate)covClass.newInstance(); + requestedCovariates.add( covariate ); + } catch (Exception e) { + throw new DynamicClassResolutionException(covClass, e); + } + } + } + } + + if( !foundClass ) { + throw new UserException.CommandLineException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." ); + } + } + } + + logger.info( "The covariates being used here: " ); + for( Covariate cov : requestedCovariates ) { + logger.info( "\t" + cov.getClass().getSimpleName() ); + cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection + } + +// try { +// stream = new PrintStream( RAC.RECAL_FILE ); +// } catch ( FileNotFoundException e ) { +// throw new RuntimeException( "Couldn't open output file: ", e ); +// } + } + + + //--------------------------------------------------------------------------------------------------------------- + // + // map + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * For each read at this locus get the various covariate values and increment that location in the map based on + * whether or not the base matches the reference at this particular location + * @param tracker The reference metadata tracker + * @param ref The reference context + * @param context The alignment context + * @return Returns 1, but this value isn't used in the reduce step + */ + public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + + // Pull out data for this locus for all the input RODs and check if this is a known variant site in any of them + boolean isKnownVariant = false; + for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) { + if( vc != null ) { + isKnownVariant = true; + break; + } + } + + // Only use data from non-dbsnp sites + // Assume every mismatch at a non-dbsnp site is indicative of poor quality + CountedData counter = new CountedData(); + if( !isKnownVariant ) { + + if (COUNT_INDELS && context.hasExtendedEventPileup()) + { + + for ( ExtendedEventPileupElement p : context.getExtendedEventPileup().toExtendedIterable() ) { + + GATKSAMRecord gatkRead = (GATKSAMRecord) p.getRead(); + parsePileupElement(gatkRead, p.getOffset(), ref, counter, RAC, p.getType(), true); + + } + } + else { + // For each read at this locus + for( PileupElement p : context.getBasePileup() ) { + GATKSAMRecord gatkRead = (GATKSAMRecord) p.getRead(); + parsePileupElement(gatkRead, p.getOffset(), ref, counter, RAC, ExtendedEventPileupElement.Type.NOEVENT, false); + } + } + counter.countedSites++; + } else { // We skipped over the dbSNP site, and we are only processing every Nth locus + counter.skippedSites++; + updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable + } + + return counter; + } + + private void parsePileupElement(GATKSAMRecord gatkRead, int offset, ReferenceContext ref, CountedData counter, + RecalibrationArgumentCollection RAC, ExtendedEventPileupElement.Type type, boolean inExtendedPileup) { + + + if( gatkRead.containsTemporaryAttribute( SKIP_RECORD_ATTRIBUTE ) ) { + return; + } + + if( !gatkRead.containsTemporaryAttribute( SEEN_ATTRIBUTE ) ) + { + gatkRead.setTemporaryAttribute( SEEN_ATTRIBUTE, true ); + RecalDataManager.parseSAMRecord( gatkRead, RAC ); + + // Skip over reads with no calls in the color space if the user requested it + if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace( gatkRead ) ) { + gatkRead.setTemporaryAttribute( SKIP_RECORD_ATTRIBUTE, true); + return; + } + + RecalDataManager.parseColorSpace( gatkRead ); + gatkRead.setTemporaryAttribute( COVARS_ATTRIBUTE, + RecalDataManager.computeCovariates( gatkRead, requestedCovariates )); + } + + // Skip this position if base quality is zero + boolean hasIndelAtThisPosition = false; + if (inExtendedPileup) { + // only count indel events in extended pileups + if (type.equals(ExtendedEventPileupElement.Type.NOEVENT)) + return; + + else + hasIndelAtThisPosition = true; + } else { + // in regular pileups we still get del bases: ignore now + if (offset < 0) + return; + } + + + if (COUNT_INDELS) { + if (offset < 0) { + // recompute offset in case of a deletion + offset = ref.getLocus().getStart() - gatkRead.getUnclippedStart(); + // further edge case: read starting w/insertion: ignore for now + if (offset < 0) return; + } + updateDataFromRead( counter, gatkRead, offset, ref.getBase(), hasIndelAtThisPosition ); + + } + else { + // Skip this position if base quality is zero + if( gatkRead.getBaseQualities()[offset] > 0 ) { + + byte[] bases = gatkRead.getReadBases(); + byte refBase = ref.getBase(); + + // Skip if this base is an 'N' or etc. + if( BaseUtils.isRegularBase( bases[offset] ) ) { + + // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it + if( !gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING || + !RecalDataManager.isInconsistentColorSpace( gatkRead, offset ) ) { + + // This base finally passed all the checks for a good base, so add it to the big data hashmap + updateDataFromRead( counter, gatkRead, offset, refBase, hasIndelAtThisPosition ); + + } else { // calculate SOLID reference insertion rate + if( refBase == bases[offset] ) { + counter.solidInsertedReferenceBases++; + } else { + counter.otherColorSpaceInconsistency++; + } + } + } + } + } + } + + + /** + * Update the mismatch / total_base counts for a given class of loci. + * + * @param counter The CountedData to be updated + * @param context The AlignmentContext which holds the reads covered by this locus + * @param refBase The reference base + */ + private static void updateMismatchCounts(CountedData counter, final AlignmentContext context, final byte refBase) { + + if (context.hasExtendedEventPileup()){ + for( PileupElement p : context.getExtendedEventPileup() ) { + counter.novelCountsBases++; + } + return; + + } + + for( PileupElement p : context.getBasePileup() ) { + final byte readBase = p.getBase(); + final int readBaseIndex = BaseUtils.simpleBaseToBaseIndex(readBase); + final int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(refBase); + + if( readBaseIndex != -1 && refBaseIndex != -1 ) { + if( readBaseIndex != refBaseIndex ) { + counter.novelCountsMM++; + } + counter.novelCountsBases++; + } + } + } + + /** + * Major workhorse routine for this walker. + * Loop through the list of requested covariates and pick out the value from the read, offset, and reference + * Using the list of covariate values as a key, pick out the RecalDatum and increment, + * adding one to the number of observations and potentially one to the number of mismatches + * Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls + * because pulling things out of the SAMRecord is an expensive operation. + * @param counter Data structure which holds the counted bases + * @param gatkRead The SAMRecord holding all the data for this read + * @param offset The offset in the read for this locus + * @param refBase The reference base at this locus + */ + private void updateDataFromRead(CountedData counter, final GATKSAMRecord gatkRead, final int offset, final byte refBase, + final boolean hasIndelAtThisPosition) { + final Object[][] covars = (Comparable[][]) gatkRead.getTemporaryAttribute(COVARS_ATTRIBUTE); + + if (offset < 0) { + int k=0; + } + final Object[] key = covars[offset]; + + // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap + final NestedHashMap data = dataManager.data; //optimization - create local reference + RecalDatumOptimized datum = (RecalDatumOptimized) data.get( key ); + if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it + // initialized with zeros, will be incremented at end of method + datum = (RecalDatumOptimized)data.put( new RecalDatumOptimized(), true, (Object[])key ); + } + + // Need the bases to determine whether or not we have a mismatch + final byte base = gatkRead.getReadBases()[offset]; + final long curMismatches = datum.getNumMismatches(); + + // Add one to the number of observations and potentially one to the number of mismatches + if (COUNT_INDELS) + datum.incrementBaseCounts(hasIndelAtThisPosition); + else + datum.incrementBaseCounts( base, refBase ); + + counter.countedBases++; + counter.novelCountsBases++; + counter.novelCountsMM += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable + } + + + //--------------------------------------------------------------------------------------------------------------- + // + // reduce + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * Initialize the reduce step by creating a PrintStream from the filename specified as an argument to the walker. + * @return returns A PrintStream created from the -recalFile filename argument specified to the walker + */ + public CountedData reduceInit() { + return new CountedData(); + } + + /** + * The Reduce method doesn't do anything for this walker. + * @param mapped Result of the map. This value is immediately ignored. + * @param sum The summing CountedData used to output the CSV data + * @return returns The sum used to output the CSV data + */ + public CountedData reduce( CountedData mapped, CountedData sum ) { + // Do a dbSNP sanity check every so often + return validatingDbsnpMismatchRate(sum.add(mapped)); + } + + /** + * Validate the dbSNP reference mismatch rates. + */ + private CountedData validatingDbsnpMismatchRate(CountedData counter) { + if( ++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY ) { + counter.lociSinceLastDbsnpCheck = 0; + + if( counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L ) { + final double fractionMM_novel = (double)counter.novelCountsMM / (double)counter.novelCountsBases; + final double fractionMM_dbsnp = (double)counter.dbSNPCountsMM / (double)counter.dbSNPCountsBases; + + if( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) { + Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + + String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) ); + DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file + } + } + } + + return counter; + } + + public CountedData treeReduce( CountedData sum1, CountedData sum2 ) { + return validatingDbsnpMismatchRate(sum1.add(sum2)); + } + + /** + * Write out the full data hashmap to disk in CSV format + * @param sum The CountedData to write out to RECAL_FILE + */ + public void onTraversalDone( CountedData sum ) { + logger.info( "Writing raw recalibration data..." ); + outputToCSV( sum, RECAL_FILE ); + logger.info( "...done!" ); + } + + /** + * For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format + * @param recalTableStream The PrintStream to write out to + */ + private void outputToCSV( CountedData sum, final PrintStream recalTableStream ) { + recalTableStream.printf("# Counted Sites %d%n", sum.countedSites); + recalTableStream.printf("# Counted Bases %d%n", sum.countedBases); + recalTableStream.printf("# Skipped Sites %d%n", sum.skippedSites); + recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)sum.countedSites / sum.skippedSites); + + if( sum.solidInsertedReferenceBases != 0 ) { + recalTableStream.printf("# Fraction SOLiD inserted reference 1 / %.0f bases%n", (double) sum.countedBases / sum.solidInsertedReferenceBases); + recalTableStream.printf("# Fraction other color space inconsistencies 1 / %.0f bases%n", (double) sum.countedBases / sum.otherColorSpaceInconsistency); + } + + // Output header saying which covariates were used and in what order + for( Covariate cov : requestedCovariates ) { + recalTableStream.print( cov.getClass().getSimpleName().split("Covariate")[0] + "," ); + } + + if (COUNT_INDELS) + recalTableStream.println("nObservations,nTrueIndels,Qempirical"); + else + recalTableStream.println("nObservations,nMismatches,Qempirical"); + + if( DONT_SORT_OUTPUT ) { + printMappings(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data); + } else { + printMappingsSorted(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data); + } + + // print out an EOF marker + recalTableStream.println(TableRecalibrationWalker.EOF_MARKER); + } + + private void printMappingsSorted( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) { + final ArrayList keyList = new ArrayList(); + for( Object comp : data.keySet() ) { + keyList.add((Comparable) comp); + } + + Collections.sort(keyList); + + for( Comparable comp : keyList ) { + key[curPos] = comp; + final Object val = data.get(comp); + if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps + // For each Covariate in the key + for( Object compToPrint : key ) { + // Output the Covariate's value + recalTableStream.print( compToPrint + "," ); + } + // Output the RecalDatum entry + recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() ); + } else { // Another layer in the nested hash map + printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val ); + } + } + } + + private void printMappings( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) { + for( Object comp : data.keySet() ) { + key[curPos] = comp; + final Object val = data.get(comp); + if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps + // For each Covariate in the key + for( Object compToPrint : key ) { + // Output the Covariate's value + recalTableStream.print( compToPrint + "," ); + } + // Output the RecalDatum entry + recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() ); + } else { // Another layer in the nested hash map + printMappings( recalTableStream, curPos + 1, key, (Map) val ); + } + } + } +} + diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelPositionCovariate.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelPositionCovariate.java new file mode 100755 index 000000000..571ff88e2 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelPositionCovariate.java @@ -0,0 +1,39 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; + +import net.sf.samtools.SAMRecord; +//g import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate; +//g import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: Jan 17, 2011 + * Time: 2:53:28 PM + * To change this template use File | Settings | File Templates. + */ +public class IndelPositionCovariate implements Covariate { + + // Initialize any member variables using the command-line arguments passed to the walkers + public void initialize( final RecalibrationArgumentCollection RAC ) { + } + + // Used to pick out the covariate's value from attributes of the read + public final Comparable getValue( final SAMRecord read, final int offset ) { + int cycle = offset; + if( read.getReadNegativeStrandFlag() ) { + cycle = read.getReadLength() - (offset + 1); + } + return cycle; + } + + // Used to get the covariate's value from input csv file in TableRecalibrationWalker + public final Comparable getValue( final String str ) { + return Integer.parseInt( str ); + } + + public void getValues(SAMRecord read, Comparable[] comparable) { + for(int iii = 0; iii < read.getReadLength(); iii++) { + comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelTableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelTableRecalibrationWalker.java new file mode 100755 index 000000000..d5d8de048 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/IndelTableRecalibrationWalker.java @@ -0,0 +1,528 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; +import java.util.regex.Pattern; + +import net.sf.samtools.*; +import net.sf.samtools.util.SequenceUtil; + +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.collections.NestedHashMap; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.TextFormattingUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.text.XReadLines; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. + + * For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc) + * Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read. + * This walker then outputs a new bam file with these updated (recalibrated) reads. + * + * Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker. + * Note: This walker is designed to be used in conjunction with CovariateCounterWalker. + * + * @author rpoplin + * @since Nov 3, 2009 + * @help.summary Second pass of the recalibration. Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate. + */ + +@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) +@WalkerName("TableRecalibration") +@Requires({ DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES }) // This walker requires -I input.bam, it also requires -R reference.fasta +public class IndelTableRecalibrationWalker extends ReadWalker { + + public static final String PROGRAM_RECORD_NAME = "GATK TableRecalibration"; + + ///////////////////////////// + // Shared Arguments + ///////////////////////////// + @ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + + @Input(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file") + public File RECAL_FILE = new File("output.recal_data.csv"); + + ///////////////////////////// + // Command Line Arguments + ///////////////////////////// + @Argument(fullName="output_bam", shortName="outputBam", doc="Please use --out instead", required=false) + @Deprecated + protected String outbam; + + @Output(doc="The output BAM file", required=true) + private StingSAMFileWriter OUTPUT_BAM = null; + @Argument(fullName="preserve_qscores_less_than", shortName="pQ", + doc="Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false) + private int PRESERVE_QSCORES_LESS_THAN = 5; + @Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1") + private int SMOOTHING = 1; + @Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default=50") + private int MAX_QUALITY_SCORE = 50; + @Argument(fullName="doNotWriteOriginalQuals", shortName="noOQs", required=false, doc="If true, we will not write the original quality (OQ) tag for each read") + private boolean DO_NOT_WRITE_OQ = false; + + ///////////////////////////// + // Debugging-only Arguments + ///////////////////////////// + @Hidden + @Argument(fullName="no_pg_tag", shortName="noPG", required=false, doc="Don't output the usual PG tag in the recalibrated bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.") + private boolean NO_PG_TAG = false; + @Hidden + @Argument(fullName="fail_with_no_eof_marker", shortName="requireEOF", required=false, doc="If no EOF marker is present in the covariates file, exit the program with an exception.") + private boolean REQUIRE_EOF = false; + @Hidden + @Argument(fullName="skipUQUpdate", shortName="skipUQUpdate", required=false, doc="If true, we will skip the UQ updating step for each read, speeding up the calculations") + private boolean skipUQUpdate = false; + + + ///////////////////////////// + // Private Member Variables + ///////////////////////////// + private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps + private final ArrayList requestedCovariates = new ArrayList(); // List of covariates to be used in this calculation + private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); + private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*"); + private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); + public static final String EOF_MARKER = "EOF"; + private long numReadsWithMalformedColorSpace = 0; + + ///////////////////////////// + // Optimization + ///////////////////////////// + private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values. + + + //--------------------------------------------------------------------------------------------------------------- + // + // initialize + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * Read in the recalibration table input file. + * Parse the list of covariate classes used during CovariateCounterWalker. + * Parse the CSV data and populate the hashmap. + */ + public void initialize() { + + if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; } + if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; } + + // Get a list of all available covariates + final List> classes = new PluginManager(Covariate.class).getPlugins(); + + int lineNumber = 0; + boolean foundAllCovariates = false; + + // Warn the user if a dbSNP file was specified since it isn't being used here + boolean foundDBSNP = false; + for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) { + if( rod.getName().equalsIgnoreCase(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { + foundDBSNP = true; + } + } + if( foundDBSNP ) { + Utils.warnUser("A dbSNP rod file was specified but TableRecalibrationWalker doesn't make use of it."); + } + + // Read in the data from the csv file and populate the data map and covariates list + logger.info( "Reading in the data from input csv file..." ); + + boolean sawEOF = false; + try { + for ( String line : new XReadLines(RECAL_FILE) ) { + lineNumber++; + if ( EOF_MARKER.equals(line) ) { + sawEOF = true; + } else if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) { + ; // Skip over the comment lines, (which start with '#') + } + // Read in the covariates that were used from the input file + else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data + if( foundAllCovariates ) { + throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE ); + } else { // Found the covariate list in input file, loop through all of them and instantiate them + String[] vals = line.split(","); + for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical + boolean foundClass = false; + for( Class covClass : classes ) { + if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) { + foundClass = true; + try { + Covariate covariate = (Covariate)covClass.newInstance(); + requestedCovariates.add( covariate ); + } catch (Exception e) { + throw new DynamicClassResolutionException(covClass, e); + } + + } + } + + if( !foundClass ) { + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." ); + } + } + } + + } else { // Found a line of data + if( !foundAllCovariates ) { + foundAllCovariates = true; + + // At this point all the covariates should have been found and initialized + if( requestedCovariates.size() < 2 ) { + throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE ); + } + + final boolean createCollapsedTables = true; + + // Initialize any covariate member variables using the shared argument collection + for( Covariate cov : requestedCovariates ) { + cov.initialize( RAC ); + } + // Initialize the data hashMaps + dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() ); + + } + addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap + } + } + + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e); + } catch ( NumberFormatException e ) { + throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker."); + } + logger.info( "...done!" ); + + if ( !sawEOF ) { + final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool."; + if ( REQUIRE_EOF ) + throw new UserException.MalformedFile(RECAL_FILE, errorMessage); + logger.warn(errorMessage); + } + + logger.info( "The covariates being used here: " ); + for( Covariate cov : requestedCovariates ) { + logger.info( "\t" + cov.getClass().getSimpleName() ); + } + + if( dataManager == null ) { + throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?"); + } + + // Create the tables of empirical quality scores that will be used in the sequential calculation + logger.info( "Generating tables of empirical qualities for use in sequential calculation..." ); + dataManager.generateEmpiricalQualities( SMOOTHING, MAX_QUALITY_SCORE ); + logger.info( "...done!" ); + + // Take the header of the input SAM file and tweak it by adding in a new programRecord with the version number and list of covariates that were used + final SAMFileHeader header = getToolkit().getSAMFileHeader().clone(); + if( !NO_PG_TAG ) { + final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); + final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); + try { + final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version"); + programRecord.setProgramVersion(version); + } catch (MissingResourceException e) {} + + StringBuffer sb = new StringBuffer(); + sb.append(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this)); + sb.append(" Covariates=["); + for( Covariate cov : requestedCovariates ) { + sb.append(cov.getClass().getSimpleName()); + sb.append(", "); + } + sb.setCharAt(sb.length()-2, ']'); + sb.setCharAt(sb.length()-1, ' '); + programRecord.setCommandLine(sb.toString()); + + List oldRecords = header.getProgramRecords(); + List newRecords = new ArrayList(oldRecords.size()+1); + for ( SAMProgramRecord record : oldRecords ) { + if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) ) + newRecords.add(record); + } + newRecords.add(programRecord); + header.setProgramRecords(newRecords); + + // Write out the new header + OUTPUT_BAM.writeHeader( header ); + } + } + + /** + * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches) + * @param line A line of CSV data read from the recalibration table data file + */ + private void addCSVData(final File file, final String line) { + final String[] vals = line.split(","); + + // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly + if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical + throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line + + " --Perhaps the read group string contains a comma and isn't being parsed correctly."); + } + + final Object[] key = new Object[requestedCovariates.size()]; + Covariate cov; + int iii; + for( iii = 0; iii < requestedCovariates.size(); iii++ ) { + cov = requestedCovariates.get( iii ); + key[iii] = cov.getValue( vals[iii] ); + } + + // Create a new datum using the number of observations, number of mismatches, and reported quality score + final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 ); + // Add that datum to all the collapsed tables which will be used in the sequential calculation + dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN ); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // map + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * For each base in the read calculate a new recalibrated quality score and replace the quality scores in the read + * @param refBases References bases over the length of the read + * @param read The read to be recalibrated + * @return The read with quality scores replaced + */ + public SAMRecord map( ReferenceContext refBases, SAMRecord read, ReadMetaDataTracker metaDataTracker ) { + + if( read.getReadLength() == 0 ) { // Some reads have '*' as the SEQ field and samtools returns length zero. We don't touch these reads. + return read; + } + + RecalDataManager.parseSAMRecord( read, RAC ); + + byte[] originalQuals = read.getBaseQualities(); + final byte[] recalQuals = originalQuals.clone(); + + final String platform = read.getReadGroup().getPlatform(); + if( platform.toUpperCase().contains("SOLID") && !(RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING) ) { + if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) ) { + final boolean badColor = RecalDataManager.checkNoCallColorSpace( read ); + if( badColor ) { + numReadsWithMalformedColorSpace++; + if( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED ) { + return read; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them + } else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) { + read.setReadFailsVendorQualityCheckFlag(true); + return read; + } + } + } + originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, refBases == null ? null : refBases.getBases() ); + } + + //compute all covariate values for this read + final Comparable[][] covariateValues_offset_x_covar = + RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates); + + // For each base in the read + for( int offset = 0; offset < read.getReadLength(); offset++ ) { + + final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset]; + + Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey); + if(qualityScore == null) + { + qualityScore = performSequentialQualityCalculation( fullCovariateKey ); + qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey); + } + + recalQuals[offset] = qualityScore; + } + + preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low + + read.setBaseQualities( recalQuals ); // Overwrite old qualities with new recalibrated qualities + if ( !DO_NOT_WRITE_OQ && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // Save the old qualities if the tag isn't already taken in the read + read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, SAMUtils.phredToFastq(originalQuals)); + } + + if (! skipUQUpdate && refBases != null && read.getAttribute(SAMTag.UQ.name()) != null) { + read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, refBases.getBases(), read.getAlignmentStart() - 1, false)); + } + + if (RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N && refBases != null && read.getAttribute(SAMTag.NM.name()) != null) { + read.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(read, refBases.getBases(), read.getAlignmentStart() - 1, false)); + } + + return read; + } + + /** + * Implements a serial recalibration of the reads using the combinational table. + * First, we perform a positional recalibration, and then a subsequent dinuc correction. + * + * Given the full recalibration table, we perform the following preprocessing steps: + * + * - calculate the global quality score shift across all data [DeltaQ] + * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift + * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual + * - The final shift equation is: + * + * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) + * @param key The list of Comparables that were calculated from the covariates + * @return A recalibrated quality score as a byte + */ + private byte performSequentialQualityCalculation( final Object... key ) { + + final byte qualFromRead = (byte)Integer.parseInt(key[1].toString()); + final Object[] readGroupCollapsedKey = new Object[1]; + final Object[] qualityScoreCollapsedKey = new Object[2]; + final Object[] covariateCollapsedKey = new Object[3]; + + // The global quality shift (over the read group only) + readGroupCollapsedKey[0] = key[0]; + final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey )); + double globalDeltaQ = 0.0; + if( globalRecalDatum != null ) { + final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality(); + final double aggregrateQReported = globalRecalDatum.getEstimatedQReported(); + globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported; + } + + // The shift in quality between reported and empirical + qualityScoreCollapsedKey[0] = key[0]; + qualityScoreCollapsedKey[1] = key[1]; + final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey )); + double deltaQReported = 0.0; + if( qReportedRecalDatum != null ) { + final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality(); + deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; + } + + // The shift in quality due to each covariate by itself in turn + double deltaQCovariates = 0.0; + double deltaQCovariateEmpirical; + covariateCollapsedKey[0] = key[0]; + covariateCollapsedKey[1] = key[1]; + for( int iii = 2; iii < key.length; iii++ ) { + covariateCollapsedKey[2] = key[iii]; // The given covariate + final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey )); + if( covariateRecalDatum != null ) { + deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality(); + deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) ); + } + } + + final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; + return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE ); + + // Verbose printouts used to validate with old recalibrator + //if(key.contains(null)) { + // System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d", + // qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte)); + //} + //else { + // System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d", + // key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) ); + //} + + //return newQualityByte; + } + + /** + * Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold + * @param originalQuals The list of original base quality scores + * @param recalQuals A list of the new recalibrated quality scores + */ + private void preserveQScores( final byte[] originalQuals, final byte[] recalQuals ) { + for( int iii = 0; iii < recalQuals.length; iii++ ) { + if( originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN ) { + recalQuals[iii] = originalQuals[iii]; + } + } + } + + //--------------------------------------------------------------------------------------------------------------- + // + // reduce + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * Start the reduce with a handle to the output bam file + * @return A FileWriter pointing to a new bam file + */ + public SAMFileWriter reduceInit() { + return OUTPUT_BAM; + } + + /** + * Output each read to disk + * @param read The read to output + * @param output The FileWriter to write the read to + * @return The FileWriter + */ + public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) { + if( output != null ) { + output.addAlignment(read); + } + return output; + } + + /** + * Do nothing + * @param output The SAMFileWriter that outputs the bam file + */ + public void onTraversalDone(SAMFileWriter output) { + if( numReadsWithMalformedColorSpace != 0 ) { + if( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED ) { + Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " + + "because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " + + "for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " + + "These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!"); + } else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) { + Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " + + "because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " + + "for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " + + "These reads were completely removed from the output bam file."); + + } + } + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/QualityScoreCovariate.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/QualityScoreCovariate.java new file mode 100755 index 000000000..a2b479d40 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/QualityScoreCovariate.java @@ -0,0 +1,65 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; + +import net.sf.samtools.SAMRecord; +//g import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 3, 2009 + * + * The Reported Quality Score covariate. + */ + +public class QualityScoreCovariate implements RequiredCovariate { + + // Initialize any member variables using the command-line arguments passed to the walkers + public void initialize( final RecalibrationArgumentCollection RAC ) { + } + + /* + // Used to pick out the covariate's value from attributes of the read + public final Comparable getValue( final SAMRecord read, final int offset ) { + return (int)(read.getBaseQualities()[offset]); + } + */ + + public void getValues(SAMRecord read, Comparable[] comparable) { + byte[] baseQualities = read.getBaseQualities(); + for(int i = 0; i < read.getReadLength(); i++) { +// comparable[i] = (int) baseQualities[i]; + comparable[i] = (int) 45; + } + } + + // Used to get the covariate's value from input csv file in TableRecalibrationWalker + public final Comparable getValue( final String str ) { + return Integer.parseInt( str ); + } + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/ReadGroupCovariate.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/ReadGroupCovariate.java new file mode 100755 index 000000000..f250523cb --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/ReadGroupCovariate.java @@ -0,0 +1,67 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; + +import net.sf.samtools.SAMRecord; +//import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Oct 30, 2009 + * + * The Read Group covariate. + */ + +public class ReadGroupCovariate implements RequiredCovariate { + + public static final String defaultReadGroup = "DefaultReadGroup"; + + // Initialize any member variables using the command-line arguments passed to the walkers + public void initialize( final RecalibrationArgumentCollection RAC ) { + } + + /* + // Used to pick out the covariate's value from attributes of the read + public final Comparable getValue( final SAMRecord read, final int offset ) { + return read.getReadGroup().getReadGroupId(); + } + */ + + public void getValues(SAMRecord read, Comparable[] comparable) { + final String readGroupId = read.getReadGroup().getReadGroupId(); + for(int i = 0; i < read.getReadLength(); i++) { + comparable[i] = readGroupId; + } + } + + // Used to get the covariate's value from input csv file in TableRecalibrationWalker + public final Comparable getValue( final String str ) { + return str; + } + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDataManager.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDataManager.java new file mode 100755 index 000000000..6917089cc --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDataManager.java @@ -0,0 +1,649 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.collections.NestedHashMap; + +import java.util.*; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMUtils; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 6, 2009 + * + * This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions. + * It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias. + * This class holds the parsing methods that are shared between CountCovariates and TableRecalibration. + */ + +public class RecalDataManager { + + public final NestedHashMap data; // The full dataset + private final NestedHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed + private final NestedHashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed + private final ArrayList dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed + + public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores + public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams + public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams + public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color + private static boolean warnUserNullReadGroup = false; + private static boolean warnUserNullPlatform = false; + + public enum SOLID_RECAL_MODE { + DO_NOTHING, + SET_Q_ZERO, + SET_Q_ZERO_BASE_N, + REMOVE_REF_BIAS + } + + public enum SOLID_NOCALL_STRATEGY { + THROW_EXCEPTION, + LEAVE_READ_UNRECALIBRATED, + PURGE_READ + } + + public RecalDataManager() { + data = new NestedHashMap(); + dataCollapsedReadGroup = null; + dataCollapsedQualityScore = null; + dataCollapsedByCovariate = null; + } + + public RecalDataManager( final boolean createCollapsedTables, final int numCovariates ) { + if( createCollapsedTables ) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker + data = null; + dataCollapsedReadGroup = new NestedHashMap(); + dataCollapsedQualityScore = new NestedHashMap(); + dataCollapsedByCovariate = new ArrayList(); + for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate + dataCollapsedByCovariate.add( new NestedHashMap() ); + } + } else { + data = new NestedHashMap(); + dataCollapsedReadGroup = null; + dataCollapsedQualityScore = null; + dataCollapsedByCovariate = null; + } + } + + /** + * Add the given mapping to all of the collapsed hash tables + * @param key The list of comparables that is the key for this mapping + * @param fullDatum The RecalDatum which is the data for this mapping + * @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table + */ + public final void addToAllTables( final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN ) { + + // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around + //data.put(key, thisDatum); // add the mapping to the main table + + final int qualityScore = Integer.parseInt( key[1].toString() ); + final Object[] readGroupCollapsedKey = new Object[1]; + final Object[] qualityScoreCollapsedKey = new Object[2]; + final Object[] covariateCollapsedKey = new Object[3]; + RecalDatum collapsedDatum; + + // Create dataCollapsedReadGroup, the table where everything except read group has been collapsed + if( qualityScore >= PRESERVE_QSCORES_LESS_THAN ) { + readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group + collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get( readGroupCollapsedKey ); + if( collapsedDatum == null ) { + dataCollapsedReadGroup.put( new RecalDatum(fullDatum), readGroupCollapsedKey ); + } else { + collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported + } + } + + // Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed + qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ... + qualityScoreCollapsedKey[1] = key[1]; // and quality score + collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get( qualityScoreCollapsedKey ); + if( collapsedDatum == null ) { + dataCollapsedQualityScore.put( new RecalDatum(fullDatum), qualityScoreCollapsedKey ); + } else { + collapsedDatum.increment( fullDatum ); + } + + // Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed + for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { + covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ... + covariateCollapsedKey[1] = key[1]; // and quality score ... + final Object theCovariateElement = key[iii + 2]; // and the given covariate + if( theCovariateElement != null ) { + covariateCollapsedKey[2] = theCovariateElement; + collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(iii).get( covariateCollapsedKey ); + if( collapsedDatum == null ) { + dataCollapsedByCovariate.get(iii).put( new RecalDatum(fullDatum), covariateCollapsedKey ); + } else { + collapsedDatum.increment( fullDatum ); + } + } + } + } + + /** + * Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score + * that will be used in the sequential calculation in TableRecalibrationWalker + * @param smoothing The smoothing parameter that goes into empirical quality score calculation + * @param maxQual At which value to cap the quality scores + */ + public final void generateEmpiricalQualities( final int smoothing, final int maxQual ) { + + recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.data, smoothing, maxQual); + recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.data, smoothing, maxQual); + for( NestedHashMap map : dataCollapsedByCovariate ) { + recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual); + checkForSingletons(map.data); + } + } + + private void recursivelyGenerateEmpiricalQualities( final Map data, final int smoothing, final int maxQual ) { + + for( Object comp : data.keySet() ) { + final Object val = data.get(comp); + if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps + ((RecalDatum)val).calcCombinedEmpiricalQuality(smoothing, maxQual); + } else { // Another layer in the nested hash map + recursivelyGenerateEmpiricalQualities( (Map) val, smoothing, maxQual); + } + } + } + + private void checkForSingletons( final Map data ) { + // todo -- this looks like it's better just as a data.valueSet() call? + for( Object comp : data.keySet() ) { + final Object val = data.get(comp); + if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps + if( data.keySet().size() == 1) { + data.clear(); // don't TableRecalibrate a non-required covariate if it only has one element because that correction has already been done ... + // in a previous step of the sequential calculation model + } + } else { // Another layer in the nested hash map + checkForSingletons( (Map) val ); + } + } + } + + /** + * Get the appropriate collapsed table out of the set of all the tables held by this Object + * @param covariate Which covariate indexes the desired collapsed HashMap + * @return The desired collapsed HashMap + */ + public final NestedHashMap getCollapsedTable( final int covariate ) { + if( covariate == 0) { + return dataCollapsedReadGroup; // Table where everything except read group has been collapsed + } else if( covariate == 1 ) { + return dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed + } else { + return dataCollapsedByCovariate.get( covariate - 2 ); // Table where everything except read group, quality score, and given covariate has been collapsed + } + } + + /** + * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string + * @param read The read to adjust + * @param RAC The list of shared command line arguments + */ + public static void parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC ) { + + SAMReadGroupRecord readGroup = read.getReadGroup(); + + // If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments + if( readGroup == null ) { + if( RAC.DEFAULT_READ_GROUP != null && RAC.DEFAULT_PLATFORM != null) { + if( !warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null ) { + Utils.warnUser("The input .bam file contains reads with no read group. " + + "Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " + + "First observed at read with name = " + read.getReadName() ); + warnUserNullReadGroup = true; + } + // There is no readGroup so defaulting to these values + readGroup = new SAMReadGroupRecord( RAC.DEFAULT_READ_GROUP ); + readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); + ((GATKSAMRecord)read).setReadGroup( readGroup ); + } else { + throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName() + + " Users must set both the default read group using the --default_read_group argument and the default platform using the --default_platform argument." ); + } + } + + if( RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP) ) { // Collapse all the read groups into a single common String provided by the user + final String oldPlatform = readGroup.getPlatform(); + readGroup = new SAMReadGroupRecord( RAC.FORCE_READ_GROUP ); + readGroup.setPlatform( oldPlatform ); + ((GATKSAMRecord)read).setReadGroup( readGroup ); + } + + if( RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { + readGroup.setPlatform( RAC.FORCE_PLATFORM ); + } + + if ( readGroup.getPlatform() == null ) { + if( RAC.DEFAULT_PLATFORM != null ) { + if( !warnUserNullPlatform ) { + Utils.warnUser("The input .bam file contains reads with no platform information. " + + "Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " + + "First observed at read with name = " + read.getReadName() ); + warnUserNullPlatform = true; + } + readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); + } else { + throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName() + + " Users must set the default platform using the --default_platform argument." ); + } + } + } + + /** + * Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are inconsistent with the color space + * @param read The SAMRecord to parse + */ + public static void parseColorSpace( final SAMRecord read ) { + + // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base + if( read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") ) { + if( read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null ) { // Haven't calculated the inconsistency array yet for this read + final Object attr = read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); + if( attr != null ) { + byte[] colorSpace; + if( attr instanceof String ) { + colorSpace = ((String)attr).getBytes(); + } else { + throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); + } + + // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read + byte[] readBases = read.getReadBases(); + if( read.getReadNegativeStrandFlag() ) { + readBases = BaseUtils.simpleReverseComplement( read.getReadBases() ); + } + final byte[] inconsistency = new byte[readBases.length]; + int iii; + byte prevBase = colorSpace[0]; // The sentinel + for( iii = 0; iii < readBases.length; iii++ ) { + final byte thisBase = getNextBaseFromColor( read, prevBase, colorSpace[iii + 1] ); + inconsistency[iii] = (byte)( thisBase == readBases[iii] ? 0 : 1 ); + prevBase = readBases[iii]; + } + read.setAttribute( org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency ); + + } else { + throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); + } + } + } + } + + /** + * Parse through the color space of the read and apply the desired --solid_recal_mode correction to the bases + * This method doesn't add the inconsistent tag to the read like parseColorSpace does + * @param read The SAMRecord to parse + * @param originalQualScores The array of original quality scores to modify during the correction + * @param solidRecalMode Which mode of solid recalibration to apply + * @param refBases The reference for this read + * @return A new array of quality scores that have been ref bias corrected + */ + public static byte[] calcColorSpace( final SAMRecord read, byte[] originalQualScores, final SOLID_RECAL_MODE solidRecalMode, final byte[] refBases ) { + + final Object attr = read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); + if( attr != null ) { + byte[] colorSpace; + if( attr instanceof String ) { + colorSpace = ((String)attr).getBytes(); + } else { + throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); + } + + // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read + byte[] readBases = read.getReadBases(); + final byte[] colorImpliedBases = readBases.clone(); + byte[] refBasesDirRead = AlignmentUtils.alignmentToByteArray( read.getCigar(), read.getReadBases(), refBases ); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases + if( read.getReadNegativeStrandFlag() ) { + readBases = BaseUtils.simpleReverseComplement( read.getReadBases() ); + refBasesDirRead = BaseUtils.simpleReverseComplement( refBasesDirRead.clone() ); + } + final int[] inconsistency = new int[readBases.length]; + byte prevBase = colorSpace[0]; // The sentinel + for( int iii = 0; iii < readBases.length; iii++ ) { + final byte thisBase = getNextBaseFromColor( read, prevBase, colorSpace[iii + 1] ); + colorImpliedBases[iii] = thisBase; + inconsistency[iii] = ( thisBase == readBases[iii] ? 0 : 1 ); + prevBase = readBases[iii]; + } + + // Now that we have the inconsistency array apply the desired correction to the inconsistent bases + if( solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO ) { // Set inconsistent bases and the one before it to Q0 + final boolean setBaseN = false; + originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); + } else if( solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N ) { + final boolean setBaseN = true; + originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); + } else if( solidRecalMode == SOLID_RECAL_MODE.REMOVE_REF_BIAS ) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases + solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead); + } + + } else { + throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); + } + + return originalQualScores; + } + + public static boolean checkNoCallColorSpace( final SAMRecord read ) { + if( read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") ) { + final Object attr = read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); + if( attr != null ) { + byte[] colorSpace; + if( attr instanceof String ) { + colorSpace = ((String)attr).substring(1).getBytes(); // trim off the Sentinel + } else { + throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); + } + + for( byte color : colorSpace ) { + if( color != (byte)'0' && color != (byte)'1' && color != (byte)'2' && color != (byte)'3' ) { + return true; // There is a bad color in this SOLiD read and the user wants to skip over it + } + } + + } else { + throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); + } + } + + return false; // There aren't any color no calls in this SOLiD read + } + + /** + * Perform the SET_Q_ZERO solid recalibration. Inconsistent color space bases and their previous base are set to quality zero + * @param read The SAMRecord to recalibrate + * @param readBases The bases in the read which have been RC'd if necessary + * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color + * @param originalQualScores The array of original quality scores to set to zero if needed + * @param refBases The reference which has been RC'd if necessary + * @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar + * @return The byte array of original quality scores some of which might have been set to zero + */ + private static byte[] solidRecalSetToQZero( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores, + final byte[] refBases, final boolean setBaseN ) { + + final boolean negStrand = read.getReadNegativeStrandFlag(); + for( int iii = 1; iii < originalQualScores.length; iii++ ) { + if( inconsistency[iii] == 1 ) { + if( readBases[iii] == refBases[iii] ) { + if( negStrand ) { originalQualScores[originalQualScores.length-(iii+1)] = (byte)0; } + else { originalQualScores[iii] = (byte)0; } + if( setBaseN ) { readBases[iii] = (byte)'N'; } + } + // Set the prev base to Q0 as well + if( readBases[iii-1] == refBases[iii-1] ) { + if( negStrand ) { originalQualScores[originalQualScores.length-iii] = (byte)0; } + else { originalQualScores[iii-1] = (byte)0; } + if( setBaseN ) { readBases[iii-1] = (byte)'N'; } + } + } + } + if( negStrand ) { + readBases = BaseUtils.simpleReverseComplement( readBases.clone() ); // Put the bases back in reverse order to stuff them back in the read + } + read.setReadBases( readBases ); + + return originalQualScores; + } + + /** + * Peform the REMOVE_REF_BIAS solid recalibration. Look at the color space qualities and probabilistically decide if the base should be change to match the color or left as reference + * @param read The SAMRecord to recalibrate + * @param readBases The bases in the read which have been RC'd if necessary + * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color + * @param colorImpliedBases The bases implied by the color space, RC'd if necessary + * @param refBases The reference which has been RC'd if necessary + */ + private static void solidRecalRemoveRefBias( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases, + final byte[] refBases) { + + final Object attr = read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG); + if( attr != null ) { + byte[] colorSpaceQuals; + if( attr instanceof String ) { + String x = (String)attr; + colorSpaceQuals = x.getBytes(); + SAMUtils.fastqToPhred(colorSpaceQuals); + } else { + throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName())); + } + + for( int iii = 1; iii < inconsistency.length - 1; iii++ ) { + if( inconsistency[iii] == 1 ) { + for( int jjj = iii - 1; jjj <= iii; jjj++ ) { // Correct this base and the one before it along the direction of the read + if( jjj == iii || inconsistency[jjj] == 0 ) { // Don't want to correct the previous base a second time if it was already corrected in the previous step + if( readBases[jjj] == refBases[jjj] ) { + if( colorSpaceQuals[jjj] == colorSpaceQuals[jjj+1] ) { // Equal evidence for the color implied base and the reference base, so flip a coin + final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt( 2 ); + if( rand == 0 ) { // The color implied base won the coin flip + readBases[jjj] = colorImpliedBases[jjj]; + } + } else { + final int maxQuality = Math.max((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]); + final int minQuality = Math.min((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]); + int diffInQuality = maxQuality - minQuality; + int numLow = minQuality; + if( numLow == 0 ) { + numLow++; + diffInQuality++; + } + final int numHigh = Math.round( numLow * (float)Math.pow(10.0f, (float) diffInQuality / 10.0f) ); // The color with higher quality is exponentially more likely + final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt( numLow + numHigh ); + if( rand >= numLow ) { // higher q score won + if( maxQuality == (int)colorSpaceQuals[jjj] ) { + readBases[jjj] = colorImpliedBases[jjj]; + } // else ref color had higher q score, and won out, so nothing to do here + } else { // lower q score won + if( minQuality == (int)colorSpaceQuals[jjj] ) { + readBases[jjj] = colorImpliedBases[jjj]; + } // else ref color had lower q score, and won out, so nothing to do here + } + } + } + } + } + } + } + + if( read.getReadNegativeStrandFlag() ) { + readBases = BaseUtils.simpleReverseComplement( readBases.clone() ); // Put the bases back in reverse order to stuff them back in the read + } + read.setReadBases( readBases ); + } else { // No color space quality tag in file + throw new UserException.MalformedBAM(read, "REMOVE_REF_BIAS recal mode requires color space qualities but they can't be found for read: " + read.getReadName()); + } + } + + /** + * Given the base and the color calculate the next base in the sequence + * @param prevBase The base + * @param color The color + * @return The next base in the sequence + */ + private static byte getNextBaseFromColor( SAMRecord read, final byte prevBase, final byte color ) { + switch(color) { + case '0': + return prevBase; + case '1': + return performColorOne( prevBase ); + case '2': + return performColorTwo( prevBase ); + case '3': + return performColorThree( prevBase ); + default: + throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLID read, color = " + (char)color + + " Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias."); + } + } + + /** + * Check if this base is inconsistent with its color space. If it is then SOLID inserted the reference here and we should reduce the quality + * @param read The read which contains the color space to check against + * @param offset The offset in the read at which to check + * @return Returns true if the base was inconsistent with the color space + */ + public static boolean isInconsistentColorSpace( final SAMRecord read, final int offset ) { + final Object attr = read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG); + if( attr != null ) { + final byte[] inconsistency = (byte[])attr; + // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference! + if( read.getReadNegativeStrandFlag() ) { // Negative direction + return inconsistency[inconsistency.length - offset - 1] != (byte)0; + } else { // Forward direction + return inconsistency[offset] != (byte)0; + } + + // This block of code is for if you want to check both the offset and the next base for color space inconsistency + //if( read.getReadNegativeStrandFlag() ) { // Negative direction + // if( offset == 0 ) { + // return inconsistency[0] != 0; + // } else { + // return (inconsistency[inconsistency.length - offset - 1] != 0) || (inconsistency[inconsistency.length - offset] != 0); + // } + //} else { // Forward direction + // if( offset == inconsistency.length - 1 ) { + // return inconsistency[inconsistency.length - 1] != 0; + // } else { + // return (inconsistency[offset] != 0) || (inconsistency[offset + 1] != 0); + // } + //} + + } else { // No inconsistency array, so nothing is inconsistent + return false; + } + } + + /** + * Computes all requested covariates for every offset in the given read + * by calling covariate.getValues(..). + * + * @param gatkRead The read for which to compute covariate values. + * @param requestedCovariates The list of requested covariates. + * @return An array of covariate values where result[i][j] is the covariate + * value for the ith position in the read and the jth covariate in + * reqeustedCovariates list. + */ + public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List requestedCovariates) { + //compute all covariates for this read + final List requestedCovariatesRef = requestedCovariates; + final int numRequestedCovariates = requestedCovariatesRef.size(); + final int readLength = gatkRead.getReadLength(); + + final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates]; + final Comparable[] tempCovariateValuesHolder = new Comparable[readLength]; + + // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read + for( int i = 0; i < numRequestedCovariates; i++ ) { + requestedCovariatesRef.get(i).getValues( gatkRead, tempCovariateValuesHolder ); + for(int j = 0; j < readLength; j++) { + //copy values into a 2D array that allows all covar types to be extracted at once for + //an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types. + covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; + } + } + + return covariateValues_offset_x_covar; + } + + /** + * Perform a ceratin transversion (A <-> C or G <-> T) on the base. + * + * @param base the base [AaCcGgTt] + * @return the transversion of the base, or the input base if it's not one of the understood ones + */ + private static byte performColorOne(byte base) { + switch (base) { + case 'A': + case 'a': return 'C'; + case 'C': + case 'c': return 'A'; + case 'G': + case 'g': return 'T'; + case 'T': + case 't': return 'G'; + default: return base; + } + } + + /** + * Perform a transition (A <-> G or C <-> T) on the base. + * + * @param base the base [AaCcGgTt] + * @return the transition of the base, or the input base if it's not one of the understood ones + */ + private static byte performColorTwo(byte base) { + switch (base) { + case 'A': + case 'a': return 'G'; + case 'C': + case 'c': return 'T'; + case 'G': + case 'g': return 'A'; + case 'T': + case 't': return 'C'; + default: return base; + } + } + + /** + * Return the complement (A <-> T or C <-> G) of a base. + * + * @param base the base [AaCcGgTt] + * @return the complementary base, or the input base if it's not one of the understood ones + */ + private static byte performColorThree(byte base) { + switch (base) { + case 'A': + case 'a': return 'T'; + case 'C': + case 'c': return 'G'; + case 'G': + case 'g': return 'C'; + case 'T': + case 't': return 'A'; + default: return base; + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDatum.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDatum.java new file mode 100755 index 000000000..acefc71dc --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDatum.java @@ -0,0 +1,112 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 3, 2009 + * + * An individual piece of recalibration data. Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates. + */ + +public class RecalDatum extends RecalDatumOptimized { + + private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations + private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example) + + //--------------------------------------------------------------------------------------------------------------- + // + // constructors + // + //--------------------------------------------------------------------------------------------------------------- + + public RecalDatum() { + numObservations = 0L; + numMismatches = 0L; + estimatedQReported = 0.0; + empiricalQuality = 0.0; + } + + public RecalDatum( final long _numObservations, final long _numMismatches, final double _estimatedQReported, final double _empiricalQuality ) { + numObservations = _numObservations; + numMismatches = _numMismatches; + estimatedQReported = _estimatedQReported; + empiricalQuality = _empiricalQuality; + } + + public RecalDatum( final RecalDatum copy ) { + this.numObservations = copy.numObservations; + this.numMismatches = copy.numMismatches; + this.estimatedQReported = copy.estimatedQReported; + this.empiricalQuality = copy.empiricalQuality; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // increment methods + // + //--------------------------------------------------------------------------------------------------------------- + + public final void combine( final RecalDatum other ) { + final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); + this.increment( other.numObservations, other.numMismatches ); + this.estimatedQReported = -10 * Math.log10(sumErrors / (double)this.numObservations); + //if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = QualityUtils.MAX_REASONABLE_Q_SCORE; } + } + + //--------------------------------------------------------------------------------------------------------------- + // + // methods to derive empirical quality score + // + //--------------------------------------------------------------------------------------------------------------- + + public final void calcCombinedEmpiricalQuality( final int smoothing, final int maxQual ) { + this.empiricalQuality = empiricalQualDouble(smoothing, maxQual); // cache the value so we don't call log over and over again + } + + //--------------------------------------------------------------------------------------------------------------- + // + // misc. methods + // + //--------------------------------------------------------------------------------------------------------------- + + public final double getEstimatedQReported() { + return estimatedQReported; + } + + public final double getEmpiricalQuality() { + return empiricalQuality; + } + + private double calcExpectedErrors() { + return (double)this.numObservations * qualToErrorProb( estimatedQReported ); + } + + private double qualToErrorProb( final double qual ) { + return Math.pow(10.0, qual / -10.0); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDatumOptimized.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDatumOptimized.java new file mode 100755 index 000000000..2a91d02c4 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalDatumOptimized.java @@ -0,0 +1,139 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; + +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.List; + +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Jan 6, 2010 + * + * An individual piece of recalibration data. Optimized for CountCovariates. Extras added to make TableRecalibration fast have been removed. + * Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates. + */ + +public class RecalDatumOptimized { + + protected long numObservations; // number of bases seen in total + protected long numMismatches; // number of bases seen that didn't match the reference + + //--------------------------------------------------------------------------------------------------------------- + // + // constructors + // + //--------------------------------------------------------------------------------------------------------------- + + public RecalDatumOptimized() { + numObservations = 0L; + numMismatches = 0L; + } + + public RecalDatumOptimized( final long _numObservations, final long _numMismatches) { + numObservations = _numObservations; + numMismatches = _numMismatches; + } + + public RecalDatumOptimized( final RecalDatumOptimized copy ) { + this.numObservations = copy.numObservations; + this.numMismatches = copy.numMismatches; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // increment methods + // + //--------------------------------------------------------------------------------------------------------------- + + public synchronized final void increment( final long incObservations, final long incMismatches ) { + numObservations += incObservations; + numMismatches += incMismatches; + } + + public synchronized final void increment( final RecalDatumOptimized other ) { + increment( other.numObservations, other.numMismatches ); + } + + public synchronized final void increment( final List data ) { + for ( RecalDatumOptimized other : data ) { + this.increment( other ); + } + } + + public synchronized final void incrementBaseCounts( final byte curBase, final byte refBase ) { + increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(refBase) ? 0 : 1 ); // increment takes num observations, then num mismatches + } + + public synchronized final void incrementBaseCounts(boolean hasIndelAtThisPosition) { + increment(1, hasIndelAtThisPosition ? 1 : 0); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // methods to derive empirical quality score + // + //--------------------------------------------------------------------------------------------------------------- + + public final double empiricalQualDouble( final int smoothing, final double maxQual ) { + final double doubleMismatches = (double) ( numMismatches + smoothing ); + final double doubleObservations = (double) ( numObservations + smoothing ); + double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); + if (empiricalQual > maxQual) { empiricalQual = maxQual; } + return empiricalQual; + } + public final double empiricalQualDouble() { return empiricalQualDouble( 0, QualityUtils.MAX_REASONABLE_Q_SCORE ); } // 'default' behavior is to use smoothing value of zero + + public final byte empiricalQualByte( final int smoothing ) { + final double doubleMismatches = (double) ( numMismatches + smoothing ); + final double doubleObservations = (double) ( numObservations + smoothing ); + return QualityUtils.probToQual( 1.0 - doubleMismatches / doubleObservations ); // This is capped at Q40 + } + public final byte empiricalQualByte() { return empiricalQualByte( 0 ); } // 'default' behavior is to use smoothing value of zero + + //--------------------------------------------------------------------------------------------------------------- + // + // misc. methods + // + //--------------------------------------------------------------------------------------------------------------- + + public final long getNumObservations() { + return numObservations; + } + + public final long getNumMismatches() { + return numMismatches; + } + + public final String outputToCSV( ) { + return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() ); + } + public final String outputToCSV( final int smoothing ) { + return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte(smoothing) ); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalibrationArgumentCollection.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalibrationArgumentCollection.java new file mode 100755 index 000000000..996446b35 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelCountCovariates/RecalibrationArgumentCollection.java @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates; + +import org.broadinstitute.sting.commandline.Argument; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 27, 2009 + * + * A collection of the arguments that are common to both CovariateCounterWalker and TableRecalibrationWalker. + * This set of arguments will also be passed to the constructor of every Covariate when it is instantiated. + */ + +public class RecalibrationArgumentCollection { + + ////////////////////////////////// + // Shared Command Line Arguments + ////////////////////////////////// + @Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.") + public String DEFAULT_READ_GROUP = null; + @Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") + public String DEFAULT_PLATFORM = null; + @Argument(fullName="force_read_group", shortName="fRG", required=false, doc="If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.") + public String FORCE_READ_GROUP = null; + @Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") + public String FORCE_PLATFORM = null; + @Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false) + public int WINDOW_SIZE = 5; + @Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false) + public int HOMOPOLYMER_NBACK = 7; + @Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required=false) + public boolean EXCEPTION_IF_NO_TILE = false; + @Argument(fullName="solid_recal_mode", shortName="sMode", required = false, doc="How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS") + public RecalDataManager.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO; + @Argument(fullName = "solid_nocall_strategy", shortName="solid_nocall_strategy", doc="Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required=false) + public RecalDataManager.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; +} \ No newline at end of file