();
- PrintStream o;
- try {
- o = new PrintStream(output);
- } catch ( FileNotFoundException e) {
- throw new UserException("File to be output by CountCovariates Gather function was not found");
- }
-
- boolean sawEOF = false;
- boolean printedHeader = false;
-
- // Read input files
- for ( File RECAL_FILE : inputs) {
- try {
- for ( String line : new XReadLines(RECAL_FILE) ) {
- if ( EOF_MARKER.equals(line) ) {
- sawEOF = true; // sanity check
- }
- else if(COMMENT_PATTERN.matcher(line).matches()) {
- ; // It doesn't make any sense to print intermediate comments, unless we merge them somehow (would require strict definition for the header)
- }
- else if (COVARIATE_PATTERN.matcher(line).matches()) {
- if (!printedHeader)
- o.println(line);
- }
- else { // Found a line of data
- addCSVData(line); // Parse the line and add the data to the HashMap
- }
- }
-
- } catch ( FileNotFoundException e ) {
- throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
- }
-
- if ( !sawEOF ) {
- final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted!";
- throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
- }
- printedHeader = true;
- }
-
- // Write output file from dataMap
- for(String key : dataMap.keySet()) {
- RecalDatumOptimized values = dataMap.get(key);
- String v = values.getNumObservations() + "," + values.getNumMismatches() + "," + values.empiricalQualByte();
- o.println(key + v);
- }
- o.println("EOF");
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java
deleted file mode 100755
index a99f35f45..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java
+++ /dev/null
@@ -1,624 +0,0 @@
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import org.broad.tribble.Feature;
-import org.broadinstitute.sting.commandline.*;
-import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
-import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
-import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
-import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.*;
-import org.broadinstitute.sting.utils.BaseUtils;
-import org.broadinstitute.sting.utils.Utils;
-import org.broadinstitute.sting.utils.baq.BAQ;
-import org.broadinstitute.sting.utils.classloader.PluginManager;
-import org.broadinstitute.sting.utils.collections.NestedHashMap;
-import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
-import org.broadinstitute.sting.utils.exceptions.UserException;
-import org.broadinstitute.sting.utils.pileup.PileupElement;
-import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-
-import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-/**
- * First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide).
- *
- *
- * This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating
- * only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative
- * of poor base quality. This walker generates tables based on various user-specified covariates (such as read group,
- * reported quality score, cycle, and dinucleotide). Since there is a large amount of data one can then calculate an empirical
- * probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations.
- * The output file is a CSV list of (the several covariate values, num observations, num mismatches, empirical quality score).
- *
- * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified.
- *
- *
- * See the GATK wiki for a tutorial and example recalibration accuracy plots.
- * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
- *
- *
Input
- *
- * The input read data whose base quality scores need to be assessed.
- *
- * A database of known polymorphic sites to skip over.
- *
- *
- * Output
- *
- * A recalibration table file in CSV format that is used by the TableRecalibration walker.
- * It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
- *
- * The first 20 lines of such a file is shown below.
- * * The file begins with a series of comment lines describing:
- * ** The number of counted loci
- * ** The number of counted bases
- * ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases
- *
- * * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
- *
- * * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change
- * depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of
- * reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate.
- *
- *
- * # Counted Sites 19451059
- * # Counted Bases 56582018
- * # Skipped Sites 82666
- * # Fraction Skipped 1 / 235 bp
- * ReadGroup,QualityScore,Cycle,Dinuc,nObservations,nMismatches,Qempirical
- * SRR006446,11,65,CA,9,1,10
- * SRR006446,11,48,TA,10,0,40
- * SRR006446,11,67,AA,27,0,40
- * SRR006446,11,61,GA,11,1,10
- * SRR006446,12,34,CA,47,1,17
- * SRR006446,12,30,GA,52,1,17
- * SRR006446,12,36,AA,352,1,25
- * SRR006446,12,17,TA,182,11,12
- * SRR006446,11,48,TG,2,0,40
- * SRR006446,11,67,AG,1,0,40
- * SRR006446,12,34,CG,9,0,40
- * SRR006446,12,30,GG,43,0,40
- * ERR001876,4,31,AG,1,0,40
- * ERR001876,4,31,AT,2,2,1
- * ERR001876,4,31,CA,1,0,40
- *
- *
- *
- * Examples
- *
- * java -Xmx4g -jar GenomeAnalysisTK.jar \
- * -R resources/Homo_sapiens_assembly18.fasta \
- * -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
- * -knownSites another/optional/setOfSitesToMask.vcf \
- * -I my_reads.bam \
- * -T CountCovariates \
- * -cov ReadGroupCovariate \
- * -cov QualityScoreCovariate \
- * -cov CycleCovariate \
- * -cov DinucCovariate \
- * -recalFile my_reads.recal_data.csv
- *
- */
-
-@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
-@By(DataSource.READS) // Only look at covered loci, not every loci of the reference file
-@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class})
-// Filter out all reads with zero or unavailable mapping quality
-@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
-// This walker requires both -I input.bam and -R reference.fasta
-@PartitionBy(PartitionType.LOCUS)
-public class CountCovariatesWalker extends LocusWalker implements TreeReducible {
-
- /////////////////////////////
- // Constants
- /////////////////////////////
- private static final String SKIP_RECORD_ATTRIBUTE = "SKIP"; //used to label GATKSAMRecords that should be skipped.
- private static final String SEEN_ATTRIBUTE = "SEEN"; //used to label GATKSAMRecords as processed.
- private static final String COVARS_ATTRIBUTE = "COVARS"; //used to store covariates array as a temporary attribute inside GATKSAMRecord.
-
- /////////////////////////////
- // Shared Arguments
- /////////////////////////////
- @ArgumentCollection
- private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
-
- /////////////////////////////
- // Command Line Arguments
- /////////////////////////////
- /**
- * This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference,
- * so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.)
- * for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites.
- * Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument.
- */
- @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
- public List> knownSites = Collections.emptyList();
-
- /**
- * After the header, data records occur one per line until the end of the file. The first several items on a line are the
- * values of the individual covariates and will change depending on which covariates were specified at runtime. The last
- * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
- * and the raw empirical quality score calculated by phred-scaling the mismatch rate.
- */
- @Output(fullName = "recal_file", shortName = "recalFile", required = true, doc = "Filename for the output covariates table recalibration file")
- @Gather(CountCovariatesGatherer.class)
- public PrintStream RECAL_FILE;
-
- @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
- private boolean LIST_ONLY = false;
-
- /**
- * See the -list argument to view available covariates.
- */
- @Argument(fullName = "covariate", shortName = "cov", doc = "Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required = false)
- private String[] COVARIATES = null;
- @Argument(fullName = "standard_covs", shortName = "standard", doc = "Use the standard set of covariates in addition to the ones listed using the -cov argument", required = false)
- private boolean USE_STANDARD_COVARIATES = false;
-
- /////////////////////////////
- // Debugging-only Arguments
- /////////////////////////////
- @Argument(fullName = "dont_sort_output", shortName = "unsorted", required = false, doc = "If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
- private boolean DONT_SORT_OUTPUT = false;
-
- /**
- * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
- */
- @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
- private boolean RUN_WITHOUT_DBSNP = false;
-
- /////////////////////////////
- // Private Member Variables
- /////////////////////////////
- private final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
- private final ArrayList requestedCovariates = new ArrayList(); // A list to hold the covariate objects that were requested
- private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
- private static int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen)
-
- public static class CountedData {
- private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
- private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
- private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
- private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base
- private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted.
-
- private long dbSNPCountsMM = 0, dbSNPCountsBases = 0; // mismatch/base counts for dbSNP loci
- private long novelCountsMM = 0, novelCountsBases = 0; // mismatch/base counts for non-dbSNP loci
- private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation
-
- /**
- * Adds the values of other to this, returning this
- *
- * @param other
- * @return this object
- */
- public CountedData add(CountedData other) {
- countedSites += other.countedSites;
- countedBases += other.countedBases;
- skippedSites += other.skippedSites;
- solidInsertedReferenceBases += other.solidInsertedReferenceBases;
- otherColorSpaceInconsistency += other.otherColorSpaceInconsistency;
- dbSNPCountsMM += other.dbSNPCountsMM;
- dbSNPCountsBases += other.dbSNPCountsBases;
- novelCountsMM += other.novelCountsMM;
- novelCountsBases += other.novelCountsBases;
- lociSinceLastDbsnpCheck += other.lociSinceLastDbsnpCheck;
- return this;
- }
- }
-
- //---------------------------------------------------------------------------------------------------------------
- //
- // initialize
- //
- //---------------------------------------------------------------------------------------------------------------
-
- /**
- * Parse the -cov arguments and create a list of covariates to be used here
- * Based on the covariates' estimates for initial capacity allocate the data hashmap
- */
- public void initialize() {
-
- if (RAC.FORCE_PLATFORM != null) {
- RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
- }
-
- // Get a list of all available covariates
- final List> covariateClasses = new PluginManager(Covariate.class).getPlugins();
- final List> requiredClasses = new PluginManager(RequiredCovariate.class).getPlugins();
- final List> standardClasses = new PluginManager(StandardCovariate.class).getPlugins();
-
- // Print and exit if that's what was requested
- if (LIST_ONLY) {
- logger.info("Available covariates:");
- for (Class> covClass : covariateClasses) {
- logger.info(covClass.getSimpleName());
- }
- logger.info("");
-
- System.exit(0); // Early exit here because user requested it
- }
-
- // Warn the user if no dbSNP file or other variant mask was specified
- if (knownSites.isEmpty() && !RUN_WITHOUT_DBSNP) {
- throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.");
- }
-
- // Initialize the requested covariates by parsing the -cov argument
- // First add the required covariates
- if (requiredClasses.size() == 2) { // readGroup and reported quality score
- requestedCovariates.add(new ReadGroupCovariate()); // Order is important here
- requestedCovariates.add(new QualityScoreCovariate());
- }
- else {
- throw new UserException.CommandLineException("There are more required covariates than expected. The instantiation list needs to be updated with the new required covariate and in the correct order.");
- }
- // Next add the standard covariates if -standard was specified by the user
- if (USE_STANDARD_COVARIATES) {
- // We want the standard covariates to appear in a consistent order but the packageUtils method gives a random order
- // A list of Classes can't be sorted, but a list of Class names can be
- final List standardClassNames = new ArrayList();
- for (Class> covClass : standardClasses) {
- standardClassNames.add(covClass.getName());
- }
- Collections.sort(standardClassNames); // Sort the list of class names
- for (String className : standardClassNames) {
- for (Class> covClass : standardClasses) { // Find the class that matches this class name
- if (covClass.getName().equals(className)) {
- try {
- final Covariate covariate = (Covariate) covClass.newInstance();
- requestedCovariates.add(covariate);
- } catch (Exception e) {
- throw new DynamicClassResolutionException(covClass, e);
- }
- }
- }
- }
- }
- // Finally parse the -cov arguments that were provided, skipping over the ones already specified
- if (COVARIATES != null) {
- for (String requestedCovariateString : COVARIATES) {
- boolean foundClass = false;
- for (Class> covClass : covariateClasses) {
- if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class
- foundClass = true;
- if (!requiredClasses.contains(covClass) && (!USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) {
- try {
- // Now that we've found a matching class, try to instantiate it
- final Covariate covariate = (Covariate) covClass.newInstance();
- requestedCovariates.add(covariate);
- } catch (Exception e) {
- throw new DynamicClassResolutionException(covClass, e);
- }
- }
- }
- }
-
- if (!foundClass) {
- throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates.");
- }
- }
- }
-
- logger.info("The covariates being used here: ");
- for (Covariate cov : requestedCovariates) {
- logger.info("\t" + cov.getClass().getSimpleName());
- cov.initialize(RAC); // Initialize any covariate member variables using the shared argument collection
- }
- }
-
- //---------------------------------------------------------------------------------------------------------------
- //
- // map
- //
- //---------------------------------------------------------------------------------------------------------------
-
- /**
- * For each read at this locus get the various covariate values and increment that location in the map based on
- * whether or not the base matches the reference at this particular location
- *
- * @param tracker The reference metadata tracker
- * @param ref The reference context
- * @param context The alignment context
- * @return Returns 1, but this value isn't used in the reduce step
- */
- public CountedData map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
- // Only use data from non-dbsnp sites
- // Assume every mismatch at a non-dbsnp site is indicative of poor quality
- CountedData counter = new CountedData();
- if (tracker.getValues(knownSites).size() == 0) { // If something here is in one of the knownSites tracks then skip over it, otherwise proceed
- // For each read at this locus
- for (final PileupElement p : context.getBasePileup()) {
- final GATKSAMRecord gatkRead = p.getRead();
- int offset = p.getOffset();
-
- if (gatkRead.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE)) {
- continue;
- }
-
- if (!gatkRead.containsTemporaryAttribute(SEEN_ATTRIBUTE)) {
- gatkRead.setTemporaryAttribute(SEEN_ATTRIBUTE, true);
- RecalDataManager.parseSAMRecord(gatkRead, RAC);
-
- // Skip over reads with no calls in the color space if the user requested it
- if (!(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace(gatkRead)) {
- gatkRead.setTemporaryAttribute(SKIP_RECORD_ATTRIBUTE, true);
- continue;
- }
-
- RecalDataManager.parseColorSpace(gatkRead);
- gatkRead.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalDataManager.computeCovariates(gatkRead, requestedCovariates));
- }
-
- // Skip this position if base quality is zero
- if (gatkRead.getBaseQualities()[offset] > 0) {
-
- byte[] bases = gatkRead.getReadBases();
- byte refBase = ref.getBase();
-
- // Skip if this base is an 'N' or etc.
- if (BaseUtils.isRegularBase(bases[offset])) {
-
- // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
- if (!gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING ||
- !RecalDataManager.isInconsistentColorSpace(gatkRead, offset)) {
-
- // This base finally passed all the checks for a good base, so add it to the big data hashmap
- updateDataFromRead(counter, gatkRead, offset, refBase);
-
- }
- else { // calculate SOLID reference insertion rate
- if (refBase == bases[offset]) {
- counter.solidInsertedReferenceBases++;
- }
- else {
- counter.otherColorSpaceInconsistency++;
- }
- }
- }
- }
- }
- counter.countedSites++;
- }
- else { // We skipped over the dbSNP site, and we are only processing every Nth locus
- counter.skippedSites++;
- updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
- }
-
- return counter;
- }
-
- /**
- * Update the mismatch / total_base counts for a given class of loci.
- *
- * @param counter The CountedData to be updated
- * @param context The AlignmentContext which holds the reads covered by this locus
- * @param refBase The reference base
- */
- private static void updateMismatchCounts(CountedData counter, final AlignmentContext context, final byte refBase) {
- for (PileupElement p : context.getBasePileup()) {
- final byte readBase = p.getBase();
- final int readBaseIndex = BaseUtils.simpleBaseToBaseIndex(readBase);
- final int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(refBase);
-
- if (readBaseIndex != -1 && refBaseIndex != -1) {
- if (readBaseIndex != refBaseIndex) {
- counter.novelCountsMM++;
- }
- counter.novelCountsBases++;
- }
- }
- }
-
- /**
- * Major workhorse routine for this walker.
- * Loop through the list of requested covariates and pick out the value from the read, offset, and reference
- * Using the list of covariate values as a key, pick out the RecalDatum and increment,
- * adding one to the number of observations and potentially one to the number of mismatches
- * Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
- * because pulling things out of the SAMRecord is an expensive operation.
- *
- * @param counter Data structure which holds the counted bases
- * @param gatkRead The SAMRecord holding all the data for this read
- * @param offset The offset in the read for this locus
- * @param refBase The reference base at this locus
- */
- private void updateDataFromRead(CountedData counter, final GATKSAMRecord gatkRead, final int offset, final byte refBase) {
- final Object[][] covars = (Comparable[][]) gatkRead.getTemporaryAttribute(COVARS_ATTRIBUTE);
- final Object[] key = covars[offset];
-
- // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
- final NestedHashMap data = dataManager.data; //optimization - create local reference
- RecalDatumOptimized datum = (RecalDatumOptimized) data.get(key);
- if (datum == null) { // key doesn't exist yet in the map so make a new bucket and add it
- // initialized with zeros, will be incremented at end of method
- datum = (RecalDatumOptimized) data.put(new RecalDatumOptimized(), true, (Object[]) key);
- }
-
- // Need the bases to determine whether or not we have a mismatch
- final byte base = gatkRead.getReadBases()[offset];
- final long curMismatches = datum.getNumMismatches();
-
- // Add one to the number of observations and potentially one to the number of mismatches
- datum.incrementBaseCounts(base, refBase);
- counter.countedBases++;
- counter.novelCountsBases++;
- counter.novelCountsMM += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
- }
-
- //---------------------------------------------------------------------------------------------------------------
- //
- // reduce
- //
- //---------------------------------------------------------------------------------------------------------------
-
- /**
- * Initialize the reduce step by creating a PrintStream from the filename specified as an argument to the walker.
- *
- * @return returns A PrintStream created from the -recalFile filename argument specified to the walker
- */
- public CountedData reduceInit() {
- return new CountedData();
- }
-
- /**
- * The Reduce method doesn't do anything for this walker.
- *
- * @param mapped Result of the map. This value is immediately ignored.
- * @param sum The summing CountedData used to output the CSV data
- * @return returns The sum used to output the CSV data
- */
- public CountedData reduce(CountedData mapped, CountedData sum) {
- // Do a dbSNP sanity check every so often
- return validatingDbsnpMismatchRate(sum.add(mapped));
- }
-
- /**
- * Validate the dbSNP reference mismatch rates.
- */
- private CountedData validatingDbsnpMismatchRate(CountedData counter) {
- if (++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY) {
- counter.lociSinceLastDbsnpCheck = 0;
-
- if (counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L) {
- final double fractionMM_novel = (double) counter.novelCountsMM / (double) counter.novelCountsBases;
- final double fractionMM_dbsnp = (double) counter.dbSNPCountsMM / (double) counter.dbSNPCountsBases;
-
- if (fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel) {
- Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel));
- DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file
- }
- }
- }
-
- return counter;
- }
-
- public CountedData treeReduce(CountedData sum1, CountedData sum2) {
- return validatingDbsnpMismatchRate(sum1.add(sum2));
- }
-
- /**
- * Write out the full data hashmap to disk in CSV format
- *
- * @param sum The CountedData to write out to RECAL_FILE
- */
- public void onTraversalDone(CountedData sum) {
- logger.info("Writing raw recalibration data...");
- if (sum.countedBases == 0L) {
- throw new UserException.BadInput("Could not find any usable data in the input BAM file(s).");
- }
- outputToCSV(sum, RECAL_FILE);
- logger.info("...done!");
- }
-
- /**
- * For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format
- *
- * @param recalTableStream The PrintStream to write out to
- */
- private void outputToCSV(CountedData sum, final PrintStream recalTableStream) {
- recalTableStream.printf("# Counted Sites %d%n", sum.countedSites);
- recalTableStream.printf("# Counted Bases %d%n", sum.countedBases);
- recalTableStream.printf("# Skipped Sites %d%n", sum.skippedSites);
- recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double) sum.countedSites / sum.skippedSites);
-
- if (sum.solidInsertedReferenceBases != 0) {
- recalTableStream.printf("# Fraction SOLiD inserted reference 1 / %.0f bases%n", (double) sum.countedBases / sum.solidInsertedReferenceBases);
- recalTableStream.printf("# Fraction other color space inconsistencies 1 / %.0f bases%n", (double) sum.countedBases / sum.otherColorSpaceInconsistency);
- }
-
- // Output header saying which covariates were used and in what order
- for (Covariate cov : requestedCovariates) {
- recalTableStream.print(cov.getClass().getSimpleName().split("Covariate")[0] + ",");
- }
- recalTableStream.println("nObservations,nMismatches,Qempirical");
-
- if (DONT_SORT_OUTPUT) {
- printMappings(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
- }
- else {
- printMappingsSorted(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
- }
-
- // print out an EOF marker
- recalTableStream.println(TableRecalibrationWalker.EOF_MARKER);
- }
-
- private void printMappingsSorted(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
- final ArrayList keyList = new ArrayList();
- for (Object comp : data.keySet()) {
- keyList.add((Comparable) comp);
- }
-
- Collections.sort(keyList);
-
- for (Comparable comp : keyList) {
- key[curPos] = comp;
- final Object val = data.get(comp);
- if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps
- // For each Covariate in the key
- for (Object compToPrint : key) {
- // Output the Covariate's value
- recalTableStream.print(compToPrint + ",");
- }
- // Output the RecalDatum entry
- recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
- }
- else { // Another layer in the nested hash map
- printMappingsSorted(recalTableStream, curPos + 1, key, (Map) val);
- }
- }
- }
-
- private void printMappings(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
- for (Object comp : data.keySet()) {
- key[curPos] = comp;
- final Object val = data.get(comp);
- if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps
- // For each Covariate in the key
- for (Object compToPrint : key) {
- // Output the Covariate's value
- recalTableStream.print(compToPrint + ",");
- }
- // Output the RecalDatum entry
- recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
- }
- else { // Another layer in the nested hash map
- printMappings(recalTableStream, curPos + 1, key, (Map) val);
- }
- }
- }
-}
-
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java
deleted file mode 100755
index 9d5747023..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java
+++ /dev/null
@@ -1,56 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-
-/*
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Oct 30, 2009
- *
- * The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read.
- * In general most error checking and adjustments to the data are done before the call to the covariates getValue methods in order to speed up the code.
- * This unfortunately muddies the code, but most of these corrections can be done per read while the covariates get called per base, resulting in a big speed up.
- */
-
-public interface Covariate {
- public void initialize(RecalibrationArgumentCollection RAC); // Initialize any member variables using the command-line arguments passed to the walkers
-
- public Comparable getValue(String str); // Used to get the covariate's value from input csv file in TableRecalibrationWalker
-
- public void getValues(GATKSAMRecord read, Comparable[] comparable);
- //Takes an array of size (at least) read.getReadLength() and fills it with covariate
- //values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows
- //read-specific calculations to be done just once rather than for each offset.
-}
-
-interface RequiredCovariate extends Covariate {}
-
-interface StandardCovariate extends Covariate {}
-
-interface ExperimentalCovariate extends Covariate {}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java
deleted file mode 100755
index b8d13ca10..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java
+++ /dev/null
@@ -1,212 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import org.broadinstitute.sting.utils.BaseUtils;
-import org.broadinstitute.sting.utils.NGSPlatform;
-import org.broadinstitute.sting.utils.exceptions.UserException;
-import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-
-import java.util.EnumSet;
-
-/*
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Oct 30, 2009
- *
- * The Cycle covariate.
- * For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read)
- * For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle
- * For example, for the read: AAACCCCGAAATTTTTACTG
- * the cycle would be 11111111222333333344
- * For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round
- */
-
-public class CycleCovariate implements StandardCovariate {
- private final static EnumSet DISCRETE_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.ILLUMINA, NGSPlatform.SOLID, NGSPlatform.PACBIO, NGSPlatform.COMPLETE_GENOMICS);
- private final static EnumSet FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT);
-
- // Initialize any member variables using the command-line arguments passed to the walkers
- @Override
- public void initialize(final RecalibrationArgumentCollection RAC) {
- if (RAC.DEFAULT_PLATFORM != null) {
- if (RAC.DEFAULT_PLATFORM.equalsIgnoreCase("SLX") || RAC.DEFAULT_PLATFORM.equalsIgnoreCase("ILLUMINA") ||
- RAC.DEFAULT_PLATFORM.contains("454") || RAC.DEFAULT_PLATFORM.equalsIgnoreCase("SOLID") || RAC.DEFAULT_PLATFORM.equalsIgnoreCase("ABI_SOLID")) {
- // nothing to do
- }
- else {
- throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM + ") is not a recognized platform. Implemented options are illumina, 454, and solid");
- }
- }
- }
-
- // Used to pick out the covariate's value from attributes of the read
- @Override
- public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
-
- //-----------------------------
- // Illumina, Solid, PacBio, and Complete Genomics
- //-----------------------------
-
- final NGSPlatform ngsPlatform = read.getNGSPlatform();
- if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) {
- final int init;
- final int increment;
- if (!read.getReadNegativeStrandFlag()) {
- // Differentiate between first and second of pair.
- // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
- // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair.
- // Therefore the cycle covariate must differentiate between first and second of pair reads.
- // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because
- // the current sequential model would consider the effects independently instead of jointly.
- if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
- //second of pair, positive strand
- init = -1;
- increment = -1;
- }
- else {
- //first of pair, positive strand
- init = 1;
- increment = 1;
- }
-
- }
- else {
- if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
- //second of pair, negative strand
- init = -read.getReadLength();
- increment = 1;
- }
- else {
- //first of pair, negative strand
- init = read.getReadLength();
- increment = -1;
- }
- }
-
- int cycle = init;
- for (int i = 0; i < read.getReadLength(); i++) {
- comparable[i] = cycle;
- cycle += increment;
- }
- }
-
- //-----------------------------
- // 454 and Ion Torrent
- //-----------------------------
- else if (FLOW_CYCLE_PLATFORMS.contains(ngsPlatform)) {
-
- final int readLength = read.getReadLength();
- final byte[] bases = read.getReadBases();
-
- // Differentiate between first and second of pair.
- // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
- // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair.
- // Therefore the cycle covariate must differentiate between first and second of pair reads.
- // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because
- // the current sequential model would consider the effects independently instead of jointly.
- final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag();
-
- int cycle = multiplyByNegative1 ? -1 : 1;
-
- // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
- // For example, AAAAAAA was probably read in two flow cycles but here we count it as one
- if (!read.getReadNegativeStrandFlag()) { // Forward direction
- int iii = 0;
- while (iii < readLength) {
- while (iii < readLength && bases[iii] == (byte) 'T') {
- comparable[iii] = cycle;
- iii++;
- }
- while (iii < readLength && bases[iii] == (byte) 'A') {
- comparable[iii] = cycle;
- iii++;
- }
- while (iii < readLength && bases[iii] == (byte) 'C') {
- comparable[iii] = cycle;
- iii++;
- }
- while (iii < readLength && bases[iii] == (byte) 'G') {
- comparable[iii] = cycle;
- iii++;
- }
- if (iii < readLength) {
- if (multiplyByNegative1)
- cycle--;
- else
- cycle++;
- }
- if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) {
- comparable[iii] = cycle;
- iii++;
- }
-
- }
- }
- else { // Negative direction
- int iii = readLength - 1;
- while (iii >= 0) {
- while (iii >= 0 && bases[iii] == (byte) 'T') {
- comparable[iii] = cycle;
- iii--;
- }
- while (iii >= 0 && bases[iii] == (byte) 'A') {
- comparable[iii] = cycle;
- iii--;
- }
- while (iii >= 0 && bases[iii] == (byte) 'C') {
- comparable[iii] = cycle;
- iii--;
- }
- while (iii >= 0 && bases[iii] == (byte) 'G') {
- comparable[iii] = cycle;
- iii--;
- }
- if (iii >= 0) {
- if (multiplyByNegative1)
- cycle--;
- else
- cycle++;
- }
- if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) {
- comparable[iii] = cycle;
- iii--;
- }
- }
- }
- }
- else {
- throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid");
- }
- }
-
- // Used to get the covariate's value from input csv file in TableRecalibrationWalker
- @Override
- public final Comparable getValue(final String str) {
- return Integer.parseInt(str);
- }
-}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java
deleted file mode 100755
index 9e1c2fe1f..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java
+++ /dev/null
@@ -1,71 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-/*
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Nov 16, 2009
- */
-public class Dinuc implements Comparable{
- private byte first;
- private byte second;
-
- public Dinuc() {
- first = 0;
- second = 0;
- }
-
- public Dinuc(final byte _first, final byte _second) {
- first = _first;
- second = _second;
- }
-
- public final void setValues(final byte _first, final byte _second) {
- first = _first;
- second = _second;
- }
-
- public int compareTo(final Dinuc that) {
- if( this.first > that.first ) { return 1; }
- else if( this.first < that.first ) { return -1; }
- else { //this.first equals that.first
- if( this.second > that.second ) { return 1; }
- else if( this.second < that.second ) { return -1; }
- else { return 0; }
- }
-
- }
-
- public static int hashBytes(final byte byte1, final byte byte2) {
- return byte1 << 8 + byte2;
- }
-
- public String toString() { // This method call is how the Dinuc will get written out to the table recalibration file
- byte[] byteArray = {first,second};
- return new String(byteArray);
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java
deleted file mode 100755
index 9a401d09f..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java
+++ /dev/null
@@ -1,129 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import org.broadinstitute.sting.utils.BaseUtils;
-import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-
-import java.util.HashMap;
-
-/*
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Nov 3, 2009
- *
- * The Dinucleotide covariate. This base and the one that came before it in the read, remembering to swap directions if negative strand read.
- * This covariate assumes that the bases have been swapped to their complement base counterpart if this is a negative strand read.
- * This assumption is made to speed up the code.
- */
-
-public class DinucCovariate implements StandardCovariate {
-
- private static final byte NO_CALL = (byte) 'N';
- private static final Dinuc NO_DINUC = new Dinuc(NO_CALL, NO_CALL);
-
- private HashMap dinucHashMap;
-
- // Initialize any member variables using the command-line arguments passed to the walkers
- @Override
- public void initialize(final RecalibrationArgumentCollection RAC) {
- final byte[] BASES = {(byte) 'A', (byte) 'C', (byte) 'G', (byte) 'T'};
- dinucHashMap = new HashMap();
- for (byte byte1 : BASES) {
- for (byte byte2 : BASES) {
- dinucHashMap.put(Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2)); // This might seem silly, but Strings are too slow
- }
- }
- // Add the "no dinuc" entry too
- dinucHashMap.put(Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC);
- }
-
- /**
- * Takes an array of size (at least) read.getReadLength() and fills it with the covariate values for each position in the read.
- */
- @Override
- public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
- final HashMap dinucHashMapRef = this.dinucHashMap; //optimize access to dinucHashMap
- final int readLength = read.getReadLength();
- final boolean negativeStrand = read.getReadNegativeStrandFlag();
- byte[] bases = read.getReadBases();
- byte base;
- byte prevBase;
- int offset = 0;
- // If this is a negative strand read then we need to reverse the direction for our previous base
-
- if (negativeStrand) {
- bases = BaseUtils.simpleReverseComplement(bases); //this is NOT in-place
- }
- comparable[0] = NO_DINUC; // No dinuc at the beginning of the read
-
- prevBase = bases[0];
- offset++;
- while (offset < readLength) {
- // Note: We are using the previous base in the read, not the
- // previous base in the reference. This is done in part to be consistent with unmapped reads.
- base = bases[offset];
- if (BaseUtils.isRegularBase(prevBase)) {
- comparable[offset] = dinucHashMapRef.get(Dinuc.hashBytes(prevBase, base));
- }
- else {
- comparable[offset] = NO_DINUC;
- }
-
- offset++;
- prevBase = base;
- }
- if (negativeStrand) {
- reverse(comparable);
- }
- }
-
- // Used to get the covariate's value from input csv file in TableRecalibrationWalker
- @Override
- public final Comparable getValue(final String str) {
- byte[] bytes = str.getBytes();
- final Dinuc returnDinuc = dinucHashMap.get(Dinuc.hashBytes(bytes[0], bytes[1]));
- if (returnDinuc.compareTo(NO_DINUC) == 0) {
- return null;
- }
- return returnDinuc;
- }
-
- /**
- * Reverses the given array in place.
- *
- * @param array any array
- */
- private static void reverse(final Comparable[] array) {
- final int arrayLength = array.length;
- for (int l = 0, r = arrayLength - 1; l < r; l++, r--) {
- final Comparable temp = array[l];
- array[l] = array[r];
- array[r] = temp;
- }
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/EmpiricalQual.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/EmpiricalQual.java
deleted file mode 100755
index e9bfa3513..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/EmpiricalQual.java
+++ /dev/null
@@ -1,55 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-/*
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Created by IntelliJ IDEA.
- * User: carneiro
- * Date: Mar 22, 2012
- *
- * Object that holds the empirical quality and estimated reported quality values for on-the-fly recalibration. This is a simplification of the RecalDatum object
- */
-
-public class EmpiricalQual {
-
- private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
- private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
-
- private EmpiricalQual() {}
-
- public EmpiricalQual(final double estimatedQReported, final double empiricalQuality) {
- this.estimatedQReported = estimatedQReported;
- this.empiricalQuality = empiricalQuality;
- }
-
- public final double getEstimatedQReported() {
- return estimatedQReported;
- }
-
- public final double getEmpiricalQuality() {
- return empiricalQuality;
- }
-}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java
deleted file mode 100755
index 14ffd35a4..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java
+++ /dev/null
@@ -1,96 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import net.sf.samtools.SAMRecord;
-import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Jan 29, 2010
- *
- * The number of previous N bases (along the direction of the read) that contain G's and C's. The goal is to correct for dye slippage.
- * Only valid for Illumina reads. Otherwise return -1.
- */
-
-public class GCContentCovariate implements ExperimentalCovariate {
-
- private int numBack = 7;
-
- // Initialize any member variables using the command-line arguments passed to the walkers
- @Override
- public void initialize(final RecalibrationArgumentCollection RAC) {
- numBack = RAC.HOMOPOLYMER_NBACK;
- }
-
- // Used to pick out the covariate's value from attributes of the read
- private Comparable getValue(final SAMRecord read, final int offset) {
-
- // ATTGCCCCGTAAAAAAAGAGAA
- // 0000123456654321001122
-
- if (read.getReadGroup().getPlatform().equalsIgnoreCase("ILLUMINA") || read.getReadGroup().getPlatform().equalsIgnoreCase("SLX")) {
- int numGC = 0;
- int startPos;
- int stopPos;
- final byte[] bases = read.getReadBases();
- if (!read.getReadNegativeStrandFlag()) { // Forward direction
- startPos = Math.max(offset - numBack, 0);
- stopPos = Math.max(offset - 1, 0);
- }
- else { // Negative direction
- startPos = Math.min(offset + 2, bases.length);
- stopPos = Math.min(offset + numBack + 1, bases.length);
- }
-
- for (int iii = startPos; iii < stopPos; iii++) {
- if (bases[iii] == (byte) 'G' || bases[iii] == (byte) 'C') {
- numGC++;
- }
- }
-
- return numGC;
- }
- else { // This effect is specific to the Illumina platform
- return -1;
- }
- }
-
- @Override
- public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
- for (int iii = 0; iii < read.getReadLength(); iii++) {
- comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
- }
- }
-
- // Used to get the covariate's value from input csv file in TableRecalibrationWalker
- @Override
- public final Comparable getValue(final String str) {
- return Integer.parseInt(str);
- }
-}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java
deleted file mode 100755
index 004fb0bdb..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java
+++ /dev/null
@@ -1,109 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import net.sf.samtools.SAMRecord;
-import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-
-/*
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Dec 4, 2009
- *
- * The Homopolymer Run Covariate. This is the number of consecutive bases in the previous N that match the current base.
- * For example, if N = 10:
- * ATTGCCCCGTAAAAAAAAATA
- * 001001230001234567800
- */
-
-public class HomopolymerCovariate implements ExperimentalCovariate {
-
- private int numBack;
-
- // Initialize any member variables using the command-line arguments passed to the walkers
- @Override
- public void initialize(final RecalibrationArgumentCollection RAC) {
- numBack = RAC.HOMOPOLYMER_NBACK;
- }
-
- // Used to pick out the covariate's value from attributes of the read
- private Comparable getValue(final SAMRecord read, final int offset) {
-
- // This block of code is for if you don't want to only count consecutive bases
- // ATTGCCCCGTAAAAAAAAATA
- // 001001231211234567819
- /*
- int numAgree = 0; // The number of bases that agree with you in the previous numBack bases of the read
- int startPos = 0;
- int stopPos = 0;
- byte[] bases = read.getReadBases();
- byte thisBase = bases[offset];
- if( !read.getReadNegativeStrandFlag() ) { // Forward direction
- startPos = Math.max(offset - numBack, 0);
- stopPos = Math.max(offset - 1, 0);
- } else { // Negative direction
- startPos = Math.min(offset + 2, bases.length);
- stopPos = Math.min(offset + numBack + 1, bases.length);
- }
-
- for( int iii = startPos; iii < stopPos; iii++ ) {
- if( bases[iii] == thisBase ) { numAgree++; }
- }
- */
-
- int numAgree = 0; // The number of consecutive bases that agree with you in the previous numBack bases of the read
- final byte[] bases = read.getReadBases();
- int iii = offset;
- if (!read.getReadNegativeStrandFlag()) { // Forward direction
- while (iii <= bases.length - 2 && bases[iii] == bases[iii + 1] && numAgree < numBack) {
- numAgree++;
- iii++;
- }
- }
- else { // Negative direction
- while (iii >= 1 && bases[iii] == bases[iii - 1] && numAgree < numBack) {
- numAgree++;
- iii--;
- }
- }
-
- return numAgree;
- }
-
- @Override
- public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
- for (int iii = 0; iii < read.getReadLength(); iii++) {
- comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
- }
- }
-
- // Used to get the covariate's value from input csv file in TableRecalibrationWalker
- @Override
- public final Comparable getValue(final String str) {
- return Integer.parseInt(str);
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java
deleted file mode 100755
index 54fa18106..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java
+++ /dev/null
@@ -1,63 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-
-/*
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Nov 4, 2009
- *
- * The Mapping Quality covariate.
- */
-
-public class MappingQualityCovariate implements ExperimentalCovariate {
-
- // Initialize any member variables using the command-line arguments passed to the walkers
- @Override
- public void initialize(final RecalibrationArgumentCollection RAC) {
- }
-
- // Used to pick out the covariate's value from attributes of the read
- private Comparable getValue(final GATKSAMRecord read) {
- return read.getMappingQuality();
- }
-
- // Used to get the covariate's value from input csv file in TableRecalibrationWalker
- @Override
- public final Comparable getValue(final String str) {
- return Integer.parseInt(str);
- }
-
- @Override
- public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
- for (int iii = 0; iii < read.getReadLength(); iii++) {
- comparable[iii] = getValue(read); // BUGBUG: this can be optimized
- }
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java
deleted file mode 100755
index ecaa55006..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java
+++ /dev/null
@@ -1,79 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import net.sf.samtools.SAMRecord;
-import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-
-/*
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Nov 4, 2009
- *
- * The Minimum Neighborhood Quality Score covariate, originally described by Chris Hartl.
- * This covariate is the minimum base quality score in the read in a small window around the current base.
- */
-
-public class MinimumNQSCovariate implements ExperimentalCovariate {
-
- private int windowReach; // How far in each direction from the current base to look
-
- // Initialize any member variables using the command-line arguments passed to the walkers
- @Override
- public void initialize(final RecalibrationArgumentCollection RAC) {
- windowReach = RAC.WINDOW_SIZE / 2; // integer division
- }
-
- // Used to pick out the covariate's value from attributes of the read
- private Comparable getValue(final SAMRecord read, final int offset) {
-
- // Loop over the list of base quality scores in the window and find the minimum
- final byte[] quals = read.getBaseQualities();
- int minQual = quals[offset];
- final int minIndex = Math.max(offset - windowReach, 0);
- final int maxIndex = Math.min(offset + windowReach, quals.length - 1);
- for (int iii = minIndex; iii < maxIndex; iii++) {
- if (quals[iii] < minQual) {
- minQual = quals[iii];
- }
- }
- return minQual;
- }
-
- @Override
- public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
- for (int iii = 0; iii < read.getReadLength(); iii++) {
- comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
- }
- }
-
- // Used to get the covariate's value from input csv file in TableRecalibrationWalker
- @Override
- public final Comparable getValue(final String str) {
- return Integer.parseInt(str);
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java
deleted file mode 100755
index fd720697f..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java
+++ /dev/null
@@ -1,69 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import net.sf.samtools.SAMRecord;
-import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-
-/*
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Nov 18, 2009
- *
- * The Position covariate. It is a lot like the Cycle covariate except it always returns the offset regardless of which platform the read came from.
- * This is the Solexa definition of machine cycle and the covariate that was always being used in the original version of the recalibrator.
- */
-
-public class PositionCovariate implements ExperimentalCovariate {
-
- // Initialize any member variables using the command-line arguments passed to the walkers
- @Override
- public void initialize(final RecalibrationArgumentCollection RAC) {
- }
-
- // Used to pick out the covariate's value from attributes of the read
- private Comparable getValue(final SAMRecord read, final int offset) {
- int cycle = offset;
- if (read.getReadNegativeStrandFlag()) {
- cycle = read.getReadLength() - (offset + 1);
- }
- return cycle;
- }
-
- @Override
- public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
- for (int iii = 0; iii < read.getReadLength(); iii++) {
- comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
- }
- }
-
- // Used to get the covariate's value from input csv file in TableRecalibrationWalker
- @Override
- public final Comparable getValue(final String str) {
- return Integer.parseInt(str);
- }
-}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java
deleted file mode 100755
index d6bdea5bf..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java
+++ /dev/null
@@ -1,76 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import net.sf.samtools.SAMRecord;
-import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-
-/*
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Nov 13, 2009
- *
- * The Primer Round covariate.
- * For Solexa and 454 this is the same value of the length of the read.
- * For SOLiD this is different for each position according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
- */
-
-public class PrimerRoundCovariate implements ExperimentalCovariate {
-
- // Initialize any member variables using the command-line arguments passed to the walkers
- @Override
- public void initialize(final RecalibrationArgumentCollection RAC) {
- }
-
- // Used to pick out the covariate's value from attributes of the read
- private Comparable getValue(final SAMRecord read, final int offset) {
- if (read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") || read.getReadGroup().getPlatform().equalsIgnoreCase("ABI_SOLID")) {
- int pos = offset;
- if (read.getReadNegativeStrandFlag()) {
- pos = read.getReadLength() - (offset + 1);
- }
- return pos % 5; // the primer round according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
- }
- else {
- return 1; // nothing to do here because it is always the same
- }
-
- }
-
- @Override
- public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
- for (int iii = 0; iii < read.getReadLength(); iii++) {
- comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
- }
- }
-
- // Used to get the covariate's value from input csv file in TableRecalibrationWalker
- @Override
- public final Comparable getValue(final String str) {
- return Integer.parseInt(str);
- }
-}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java
deleted file mode 100755
index a29a0530c..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java
+++ /dev/null
@@ -1,61 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-
-import java.util.Arrays;
-
-/*
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Nov 3, 2009
- *
- * The Reported Quality Score covariate.
- */
-
-public class QualityScoreCovariate implements RequiredCovariate {
-
- // Initialize any member variables using the command-line arguments passed to the walkers
- @Override
- public void initialize(final RecalibrationArgumentCollection RAC) {
- }
-
- @Override
- public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
- byte[] baseQualities = read.getBaseQualities();
- for (int i = 0; i < read.getReadLength(); i++) {
- comparable[i] = (int) baseQualities[i];
- }
- }
-
- // Used to get the covariate's value from input csv file in TableRecalibrationWalker
- @Override
- public final Comparable getValue(final String str) {
- return Integer.parseInt(str);
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java
deleted file mode 100755
index 33adf4417..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java
+++ /dev/null
@@ -1,61 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-
-/*
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Oct 30, 2009
- *
- * The Read Group covariate.
- */
-
-public class ReadGroupCovariate implements RequiredCovariate {
-
- // Initialize any member variables using the command-line arguments passed to the walkers
- @Override
- public void initialize(final RecalibrationArgumentCollection RAC) {
- }
-
- @Override
- public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
- final String readGroupId = read.getReadGroup().getReadGroupId();
- for (int i = 0; i < read.getReadLength(); i++) {
- comparable[i] = readGroupId;
- }
- }
-
- // Used to get the covariate's value from input csv file in TableRecalibrationWalker
- @Override
- public final Comparable getValue(final String str) {
- return str;
- }
-}
-
-
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
deleted file mode 100644
index 1a6b8cfcb..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
+++ /dev/null
@@ -1,697 +0,0 @@
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import net.sf.samtools.SAMUtils;
-import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
-import org.broadinstitute.sting.utils.BaseUtils;
-import org.broadinstitute.sting.utils.Utils;
-import org.broadinstitute.sting.utils.collections.NestedHashMap;
-import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-import org.broadinstitute.sting.utils.exceptions.UserException;
-import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
-import org.broadinstitute.sting.utils.sam.AlignmentUtils;
-import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-import org.broadinstitute.sting.utils.sam.ReadUtils;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Nov 6, 2009
- *
- * This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions.
- * It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias.
- * This class holds the parsing methods that are shared between CountCovariates and TableRecalibration.
- */
-
-public class RecalDataManager {
-
- public final NestedHashMap data; // The full dataset
- private final NestedHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed
- private final NestedHashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
- private final ArrayList dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
-
- public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
- public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
- public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
- public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
- private static boolean warnUserNullPlatform = false;
-
- public enum SOLID_RECAL_MODE {
- /**
- * Treat reference inserted bases as reference matching bases. Very unsafe!
- */
- DO_NOTHING,
- /**
- * Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option.
- */
- SET_Q_ZERO,
- /**
- * In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV.
- */
- SET_Q_ZERO_BASE_N,
- /**
- * Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference.
- */
- REMOVE_REF_BIAS
- }
-
- public enum SOLID_NOCALL_STRATEGY {
- /**
- * When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option.
- */
- THROW_EXCEPTION,
- /**
- * Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare.
- */
- LEAVE_READ_UNRECALIBRATED,
- /**
- * Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses.
- */
- PURGE_READ
- }
-
- public RecalDataManager() {
- data = new NestedHashMap();
- dataCollapsedReadGroup = null;
- dataCollapsedQualityScore = null;
- dataCollapsedByCovariate = null;
- }
-
- public RecalDataManager(final boolean createCollapsedTables, final int numCovariates) {
- if (createCollapsedTables) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker
- data = null;
- dataCollapsedReadGroup = new NestedHashMap();
- dataCollapsedQualityScore = new NestedHashMap();
- dataCollapsedByCovariate = new ArrayList();
- for (int iii = 0; iii < numCovariates - 2; iii++) { // readGroup and QualityScore aren't counted here, their tables are separate
- dataCollapsedByCovariate.add(new NestedHashMap());
- }
- }
- else {
- data = new NestedHashMap();
- dataCollapsedReadGroup = null;
- dataCollapsedQualityScore = null;
- dataCollapsedByCovariate = null;
- }
- }
-
- /**
- * Add the given mapping to all of the collapsed hash tables
- *
- * @param key The list of comparables that is the key for this mapping
- * @param fullDatum The RecalDatum which is the data for this mapping
- * @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table
- */
- public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN) {
-
- // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
- //data.put(key, thisDatum); // add the mapping to the main table
-
- final int qualityScore = Integer.parseInt(key[1].toString());
- final Object[] readGroupCollapsedKey = new Object[1];
- final Object[] qualityScoreCollapsedKey = new Object[2];
- final Object[] covariateCollapsedKey = new Object[3];
- RecalDatum collapsedDatum;
-
- // Create dataCollapsedReadGroup, the table where everything except read group has been collapsed
- if (qualityScore >= PRESERVE_QSCORES_LESS_THAN) {
- readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group
- collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(readGroupCollapsedKey);
- if (collapsedDatum == null) {
- dataCollapsedReadGroup.put(new RecalDatum(fullDatum), readGroupCollapsedKey);
- }
- else {
- collapsedDatum.combine(fullDatum); // using combine instead of increment in order to calculate overall aggregateQReported
- }
- }
-
- // Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
- qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ...
- qualityScoreCollapsedKey[1] = key[1]; // and quality score
- collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(qualityScoreCollapsedKey);
- if (collapsedDatum == null) {
- dataCollapsedQualityScore.put(new RecalDatum(fullDatum), qualityScoreCollapsedKey);
- }
- else {
- collapsedDatum.increment(fullDatum);
- }
-
- // Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
- for (int iii = 0; iii < dataCollapsedByCovariate.size(); iii++) {
- covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ...
- covariateCollapsedKey[1] = key[1]; // and quality score ...
- final Object theCovariateElement = key[iii + 2]; // and the given covariate
- if (theCovariateElement != null) {
- covariateCollapsedKey[2] = theCovariateElement;
- collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(iii).get(covariateCollapsedKey);
- if (collapsedDatum == null) {
- dataCollapsedByCovariate.get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey);
- }
- else {
- collapsedDatum.increment(fullDatum);
- }
- }
- }
- }
-
- /**
- * Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score
- * that will be used in the sequential calculation in TableRecalibrationWalker
- *
- * @param smoothing The smoothing parameter that goes into empirical quality score calculation
- * @param maxQual At which value to cap the quality scores
- */
- public final void generateEmpiricalQualities(final int smoothing, final int maxQual) {
-
- recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.data, smoothing, maxQual);
- recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.data, smoothing, maxQual);
- for (NestedHashMap map : dataCollapsedByCovariate) {
- recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual);
- checkForSingletons(map.data);
- }
- }
-
- private void recursivelyGenerateEmpiricalQualities(final Map data, final int smoothing, final int maxQual) {
-
- for (Object comp : data.keySet()) {
- final Object val = data.get(comp);
- if (val instanceof RecalDatum) { // We are at the end of the nested hash maps
- ((RecalDatum) val).calcCombinedEmpiricalQuality(smoothing, maxQual);
- }
- else { // Another layer in the nested hash map
- recursivelyGenerateEmpiricalQualities((Map) val, smoothing, maxQual);
- }
- }
- }
-
- private void checkForSingletons(final Map data) {
- // todo -- this looks like it's better just as a data.valueSet() call?
- for (Object comp : data.keySet()) {
- final Object val = data.get(comp);
- if (val instanceof RecalDatum) { // We are at the end of the nested hash maps
- if (data.keySet().size() == 1) {
- data.clear(); // don't TableRecalibrate a non-required covariate if it only has one element because that correction has already been done ...
- // in a previous step of the sequential calculation model
- }
- }
- else { // Another layer in the nested hash map
- checkForSingletons((Map) val);
- }
- }
- }
-
- /**
- * Get the appropriate collapsed table out of the set of all the tables held by this Object
- *
- * @param covariate Which covariate indexes the desired collapsed HashMap
- * @return The desired collapsed HashMap
- */
- public final NestedHashMap getCollapsedTable(final int covariate) {
- if (covariate == 0) {
- return dataCollapsedReadGroup; // Table where everything except read group has been collapsed
- }
- else if (covariate == 1) {
- return dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
- }
- else {
- return dataCollapsedByCovariate.get(covariate - 2); // Table where everything except read group, quality score, and given covariate has been collapsed
- }
- }
-
- /**
- * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string
- *
- * @param read The read to adjust
- * @param RAC The list of shared command line arguments
- */
- public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) {
- GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup();
-
- if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) {
- readGroup.setPlatform(RAC.FORCE_PLATFORM);
- }
-
- if (readGroup.getPlatform() == null) {
- if (RAC.DEFAULT_PLATFORM != null) {
- if (!warnUserNullPlatform) {
- Utils.warnUser("The input .bam file contains reads with no platform information. " +
- "Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " +
- "First observed at read with name = " + read.getReadName());
- warnUserNullPlatform = true;
- }
- readGroup.setPlatform(RAC.DEFAULT_PLATFORM);
- }
- else {
- throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName());
- }
- }
- }
-
- /**
- * Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are inconsistent with the color space
- *
- * @param read The SAMRecord to parse
- */
- public static void parseColorSpace(final GATKSAMRecord read) {
-
- // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base
- if (ReadUtils.isSOLiDRead(read)) {
- if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read
- final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
- if (attr != null) {
- byte[] colorSpace;
- if (attr instanceof String) {
- colorSpace = ((String) attr).getBytes();
- }
- else {
- throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
- }
-
- // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read
- byte[] readBases = read.getReadBases();
- if (read.getReadNegativeStrandFlag()) {
- readBases = BaseUtils.simpleReverseComplement(read.getReadBases());
- }
- final byte[] inconsistency = new byte[readBases.length];
- int iii;
- byte prevBase = colorSpace[0]; // The sentinel
- for (iii = 0; iii < readBases.length; iii++) {
- final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]);
- inconsistency[iii] = (byte) (thisBase == readBases[iii] ? 0 : 1);
- prevBase = readBases[iii];
- }
- read.setAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency);
-
- }
- else {
- throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
- " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
- }
- }
- }
- }
-
- /**
- * Parse through the color space of the read and apply the desired --solid_recal_mode correction to the bases
- * This method doesn't add the inconsistent tag to the read like parseColorSpace does
- *
- * @param read The SAMRecord to parse
- * @param originalQualScores The array of original quality scores to modify during the correction
- * @param solidRecalMode Which mode of solid recalibration to apply
- * @param refBases The reference for this read
- * @return A new array of quality scores that have been ref bias corrected
- */
- public static byte[] calcColorSpace(final GATKSAMRecord read, byte[] originalQualScores, final SOLID_RECAL_MODE solidRecalMode, final byte[] refBases) {
-
- final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
- if (attr != null) {
- byte[] colorSpace;
- if (attr instanceof String) {
- colorSpace = ((String) attr).getBytes();
- }
- else {
- throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
- }
-
- // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read
- byte[] readBases = read.getReadBases();
- final byte[] colorImpliedBases = readBases.clone();
- byte[] refBasesDirRead = AlignmentUtils.alignmentToByteArray(read.getCigar(), read.getReadBases(), refBases); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases
- if (read.getReadNegativeStrandFlag()) {
- readBases = BaseUtils.simpleReverseComplement(read.getReadBases());
- refBasesDirRead = BaseUtils.simpleReverseComplement(refBasesDirRead.clone());
- }
- final int[] inconsistency = new int[readBases.length];
- byte prevBase = colorSpace[0]; // The sentinel
- for (int iii = 0; iii < readBases.length; iii++) {
- final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]);
- colorImpliedBases[iii] = thisBase;
- inconsistency[iii] = (thisBase == readBases[iii] ? 0 : 1);
- prevBase = readBases[iii];
- }
-
- // Now that we have the inconsistency array apply the desired correction to the inconsistent bases
- if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO) { // Set inconsistent bases and the one before it to Q0
- final boolean setBaseN = false;
- originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN);
- }
- else if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N) {
- final boolean setBaseN = true;
- originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN);
- }
- else if (solidRecalMode == SOLID_RECAL_MODE.REMOVE_REF_BIAS) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases
- solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead);
- }
-
- }
- else {
- throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
- " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
- }
-
- return originalQualScores;
- }
-
- public static boolean checkNoCallColorSpace(final GATKSAMRecord read) {
- if (ReadUtils.isSOLiDRead(read)) {
- final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
- if (attr != null) {
- byte[] colorSpace;
- if (attr instanceof String) {
- colorSpace = ((String) attr).substring(1).getBytes(); // trim off the Sentinel
- }
- else {
- throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
- }
-
- for (byte color : colorSpace) {
- if (color != (byte) '0' && color != (byte) '1' && color != (byte) '2' && color != (byte) '3') {
- return true; // There is a bad color in this SOLiD read and the user wants to skip over it
- }
- }
-
- }
- else {
- throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
- " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
- }
- }
-
- return false; // There aren't any color no calls in this SOLiD read
- }
-
- /**
- * Perform the SET_Q_ZERO solid recalibration. Inconsistent color space bases and their previous base are set to quality zero
- *
- * @param read The SAMRecord to recalibrate
- * @param readBases The bases in the read which have been RC'd if necessary
- * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
- * @param originalQualScores The array of original quality scores to set to zero if needed
- * @param refBases The reference which has been RC'd if necessary
- * @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar
- * @return The byte array of original quality scores some of which might have been set to zero
- */
- private static byte[] solidRecalSetToQZero(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores, final byte[] refBases, final boolean setBaseN) {
-
- final boolean negStrand = read.getReadNegativeStrandFlag();
- for (int iii = 1; iii < originalQualScores.length; iii++) {
- if (inconsistency[iii] == 1) {
- if (readBases[iii] == refBases[iii]) {
- if (negStrand) {
- originalQualScores[originalQualScores.length - (iii + 1)] = (byte) 0;
- }
- else {
- originalQualScores[iii] = (byte) 0;
- }
- if (setBaseN) {
- readBases[iii] = (byte) 'N';
- }
- }
- // Set the prev base to Q0 as well
- if (readBases[iii - 1] == refBases[iii - 1]) {
- if (negStrand) {
- originalQualScores[originalQualScores.length - iii] = (byte) 0;
- }
- else {
- originalQualScores[iii - 1] = (byte) 0;
- }
- if (setBaseN) {
- readBases[iii - 1] = (byte) 'N';
- }
- }
- }
- }
- if (negStrand) {
- readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read
- }
- read.setReadBases(readBases);
-
- return originalQualScores;
- }
-
- /**
- * Peform the REMOVE_REF_BIAS solid recalibration. Look at the color space qualities and probabilistically decide if the base should be change to match the color or left as reference
- *
- * @param read The SAMRecord to recalibrate
- * @param readBases The bases in the read which have been RC'd if necessary
- * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
- * @param colorImpliedBases The bases implied by the color space, RC'd if necessary
- * @param refBases The reference which has been RC'd if necessary
- */
- private static void solidRecalRemoveRefBias(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases, final byte[] refBases) {
-
- final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
- if (attr != null) {
- byte[] colorSpaceQuals;
- if (attr instanceof String) {
- String x = (String) attr;
- colorSpaceQuals = x.getBytes();
- SAMUtils.fastqToPhred(colorSpaceQuals);
- }
- else {
- throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName()));
- }
-
- for (int iii = 1; iii < inconsistency.length - 1; iii++) {
- if (inconsistency[iii] == 1) {
- for (int jjj = iii - 1; jjj <= iii; jjj++) { // Correct this base and the one before it along the direction of the read
- if (jjj == iii || inconsistency[jjj] == 0) { // Don't want to correct the previous base a second time if it was already corrected in the previous step
- if (readBases[jjj] == refBases[jjj]) {
- if (colorSpaceQuals[jjj] == colorSpaceQuals[jjj + 1]) { // Equal evidence for the color implied base and the reference base, so flip a coin
- final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(2);
- if (rand == 0) { // The color implied base won the coin flip
- readBases[jjj] = colorImpliedBases[jjj];
- }
- }
- else {
- final int maxQuality = Math.max((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]);
- final int minQuality = Math.min((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]);
- int diffInQuality = maxQuality - minQuality;
- int numLow = minQuality;
- if (numLow == 0) {
- numLow++;
- diffInQuality++;
- }
- final int numHigh = Math.round(numLow * (float) Math.pow(10.0f, (float) diffInQuality / 10.0f)); // The color with higher quality is exponentially more likely
- final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(numLow + numHigh);
- if (rand >= numLow) { // higher q score won
- if (maxQuality == (int) colorSpaceQuals[jjj]) {
- readBases[jjj] = colorImpliedBases[jjj];
- } // else ref color had higher q score, and won out, so nothing to do here
- }
- else { // lower q score won
- if (minQuality == (int) colorSpaceQuals[jjj]) {
- readBases[jjj] = colorImpliedBases[jjj];
- } // else ref color had lower q score, and won out, so nothing to do here
- }
- }
- }
- }
- }
- }
- }
-
- if (read.getReadNegativeStrandFlag()) {
- readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read
- }
- read.setReadBases(readBases);
- }
- else { // No color space quality tag in file
- throw new UserException.MalformedBAM(read, "REMOVE_REF_BIAS recal mode requires color space qualities but they can't be found for read: " + read.getReadName());
- }
- }
-
- /**
- * Given the base and the color calculate the next base in the sequence
- *
- * @param prevBase The base
- * @param color The color
- * @return The next base in the sequence
- */
- private static byte getNextBaseFromColor(GATKSAMRecord read, final byte prevBase, final byte color) {
- switch (color) {
- case '0':
- return prevBase;
- case '1':
- return performColorOne(prevBase);
- case '2':
- return performColorTwo(prevBase);
- case '3':
- return performColorThree(prevBase);
- default:
- throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLID read, color = " + (char) color +
- " Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias.");
- }
- }
-
- /**
- * Check if this base is inconsistent with its color space. If it is then SOLID inserted the reference here and we should reduce the quality
- *
- * @param read The read which contains the color space to check against
- * @param offset The offset in the read at which to check
- * @return Returns true if the base was inconsistent with the color space
- */
- public static boolean isInconsistentColorSpace(final GATKSAMRecord read, final int offset) {
- final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG);
- if (attr != null) {
- final byte[] inconsistency = (byte[]) attr;
- // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference!
- if (read.getReadNegativeStrandFlag()) { // Negative direction
- return inconsistency[inconsistency.length - offset - 1] != (byte) 0;
- }
- else { // Forward direction
- return inconsistency[offset] != (byte) 0;
- }
-
- // This block of code is for if you want to check both the offset and the next base for color space inconsistency
- //if( read.getReadNegativeStrandFlag() ) { // Negative direction
- // if( offset == 0 ) {
- // return inconsistency[0] != 0;
- // } else {
- // return (inconsistency[inconsistency.length - offset - 1] != 0) || (inconsistency[inconsistency.length - offset] != 0);
- // }
- //} else { // Forward direction
- // if( offset == inconsistency.length - 1 ) {
- // return inconsistency[inconsistency.length - 1] != 0;
- // } else {
- // return (inconsistency[offset] != 0) || (inconsistency[offset + 1] != 0);
- // }
- //}
-
- }
- else { // No inconsistency array, so nothing is inconsistent
- return false;
- }
- }
-
- /**
- * Computes all requested covariates for every offset in the given read
- * by calling covariate.getValues(..).
- *
- * @param gatkRead The read for which to compute covariate values.
- * @param requestedCovariates The list of requested covariates.
- * @return An array of covariate values where result[i][j] is the covariate
- * value for the ith position in the read and the jth covariate in
- * reqeustedCovariates list.
- */
- public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List requestedCovariates) {
- //compute all covariates for this read
- final int numRequestedCovariates = requestedCovariates.size();
- final int readLength = gatkRead.getReadLength();
-
- final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates];
- final Comparable[] tempCovariateValuesHolder = new Comparable[readLength];
-
- for (int i = 0; i < numRequestedCovariates; i++) { // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
- requestedCovariates.get(i).getValues(gatkRead, tempCovariateValuesHolder);
- for (int j = 0; j < readLength; j++)
- covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; // copy values into a 2D array that allows all covar types to be extracted at once for an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types.
- }
-
- return covariateValues_offset_x_covar;
- }
-
- /**
- * Perform a certain transversion (A <-> C or G <-> T) on the base.
- *
- * @param base the base [AaCcGgTt]
- * @return the transversion of the base, or the input base if it's not one of the understood ones
- */
- private static byte performColorOne(byte base) {
- switch (base) {
- case 'A':
- case 'a':
- return 'C';
- case 'C':
- case 'c':
- return 'A';
- case 'G':
- case 'g':
- return 'T';
- case 'T':
- case 't':
- return 'G';
- default:
- return base;
- }
- }
-
- /**
- * Perform a transition (A <-> G or C <-> T) on the base.
- *
- * @param base the base [AaCcGgTt]
- * @return the transition of the base, or the input base if it's not one of the understood ones
- */
- private static byte performColorTwo(byte base) {
- switch (base) {
- case 'A':
- case 'a':
- return 'G';
- case 'C':
- case 'c':
- return 'T';
- case 'G':
- case 'g':
- return 'A';
- case 'T':
- case 't':
- return 'C';
- default:
- return base;
- }
- }
-
- /**
- * Return the complement (A <-> T or C <-> G) of a base.
- *
- * @param base the base [AaCcGgTt]
- * @return the complementary base, or the input base if it's not one of the understood ones
- */
- private static byte performColorThree(byte base) {
- switch (base) {
- case 'A':
- case 'a':
- return 'T';
- case 'C':
- case 'c':
- return 'G';
- case 'G':
- case 'g':
- return 'C';
- case 'T':
- case 't':
- return 'A';
- default:
- return base;
- }
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java
deleted file mode 100755
index aa9098549..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java
+++ /dev/null
@@ -1,118 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-/*
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Nov 3, 2009
- *
- * An individual piece of recalibration data. Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates.
- */
-
-public class RecalDatum extends RecalDatumOptimized {
-
- private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
- private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
-
- //---------------------------------------------------------------------------------------------------------------
- //
- // constructors
- //
- //---------------------------------------------------------------------------------------------------------------
-
- public RecalDatum() {
- numObservations = 0L;
- numMismatches = 0L;
- estimatedQReported = 0.0;
- empiricalQuality = 0.0;
- }
-
- public RecalDatum( final long _numObservations, final long _numMismatches, final double _estimatedQReported, final double _empiricalQuality ) {
- numObservations = _numObservations;
- numMismatches = _numMismatches;
- estimatedQReported = _estimatedQReported;
- empiricalQuality = _empiricalQuality;
- }
-
- public RecalDatum( final RecalDatum copy ) {
- this.numObservations = copy.numObservations;
- this.numMismatches = copy.numMismatches;
- this.estimatedQReported = copy.estimatedQReported;
- this.empiricalQuality = copy.empiricalQuality;
- }
-
- //---------------------------------------------------------------------------------------------------------------
- //
- // increment methods
- //
- //---------------------------------------------------------------------------------------------------------------
-
- public final void combine( final RecalDatum other ) {
- final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors();
- this.increment( other.numObservations, other.numMismatches );
- this.estimatedQReported = -10 * Math.log10(sumErrors / (double)this.numObservations);
- //if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = QualityUtils.MAX_REASONABLE_Q_SCORE; }
- }
-
- //---------------------------------------------------------------------------------------------------------------
- //
- // methods to derive empirical quality score
- //
- //---------------------------------------------------------------------------------------------------------------
-
- public final void calcCombinedEmpiricalQuality( final int smoothing, final int maxQual ) {
- this.empiricalQuality = empiricalQualDouble(smoothing, maxQual); // cache the value so we don't call log over and over again
- }
-
- //---------------------------------------------------------------------------------------------------------------
- //
- // misc. methods
- //
- //---------------------------------------------------------------------------------------------------------------
-
- public final double getEstimatedQReported() {
- return estimatedQReported;
- }
-
- public final double getEmpiricalQuality() {
- return empiricalQuality;
- }
-
- private double calcExpectedErrors() {
- return (double)this.numObservations * qualToErrorProb( estimatedQReported );
- }
-
- private double qualToErrorProb( final double qual ) {
- return Math.pow(10.0, qual / -10.0);
- }
-
-
- @Override
- public String toString() {
- return String.format("%d,%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality()), (byte) Math.floor(getEstimatedQReported()));
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java
deleted file mode 100755
index f04989fa5..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java
+++ /dev/null
@@ -1,135 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import org.broadinstitute.sting.utils.BaseUtils;
-import org.broadinstitute.sting.utils.QualityUtils;
-
-import java.util.List;
-
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Jan 6, 2010
- *
- * An individual piece of recalibration data. Optimized for CountCovariates. Extras added to make TableRecalibration fast have been removed.
- * Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates.
- */
-
-public class RecalDatumOptimized {
-
- protected long numObservations; // number of bases seen in total
- protected long numMismatches; // number of bases seen that didn't match the reference
-
- //---------------------------------------------------------------------------------------------------------------
- //
- // constructors
- //
- //---------------------------------------------------------------------------------------------------------------
-
- public RecalDatumOptimized() {
- numObservations = 0L;
- numMismatches = 0L;
- }
-
- public RecalDatumOptimized( final long _numObservations, final long _numMismatches) {
- numObservations = _numObservations;
- numMismatches = _numMismatches;
- }
-
- public RecalDatumOptimized( final RecalDatumOptimized copy ) {
- this.numObservations = copy.numObservations;
- this.numMismatches = copy.numMismatches;
- }
-
- //---------------------------------------------------------------------------------------------------------------
- //
- // increment methods
- //
- //---------------------------------------------------------------------------------------------------------------
-
- public synchronized final void increment( final long incObservations, final long incMismatches ) {
- numObservations += incObservations;
- numMismatches += incMismatches;
- }
-
- public synchronized final void increment( final RecalDatumOptimized other ) {
- increment( other.numObservations, other.numMismatches );
- }
-
- public synchronized final void increment( final List data ) {
- for ( RecalDatumOptimized other : data ) {
- this.increment( other );
- }
- }
-
- public synchronized final void incrementBaseCounts( final byte curBase, final byte refBase ) {
- increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(refBase) ? 0 : 1 ); // increment takes num observations, then num mismatches
- }
-
- //---------------------------------------------------------------------------------------------------------------
- //
- // methods to derive empirical quality score
- //
- //---------------------------------------------------------------------------------------------------------------
-
- public final double empiricalQualDouble( final int smoothing, final double maxQual ) {
- final double doubleMismatches = (double) ( numMismatches + smoothing );
- final double doubleObservations = (double) ( numObservations + smoothing );
- double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
- if (empiricalQual > maxQual) { empiricalQual = maxQual; }
- return empiricalQual;
- }
- public final double empiricalQualDouble() { return empiricalQualDouble( 0, QualityUtils.MAX_REASONABLE_Q_SCORE ); } // 'default' behavior is to use smoothing value of zero
-
- public final byte empiricalQualByte( final int smoothing ) {
- final double doubleMismatches = (double) ( numMismatches + smoothing );
- final double doubleObservations = (double) ( numObservations + smoothing );
- return QualityUtils.probToQual( 1.0 - doubleMismatches / doubleObservations ); // This is capped at Q40
- }
- public final byte empiricalQualByte() { return empiricalQualByte( 0 ); } // 'default' behavior is to use smoothing value of zero
-
- //---------------------------------------------------------------------------------------------------------------
- //
- // misc. methods
- //
- //---------------------------------------------------------------------------------------------------------------
-
- public final long getNumObservations() {
- return numObservations;
- }
-
- public final long getNumMismatches() {
- return numMismatches;
- }
-
- public final String outputToCSV( ) {
- return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() );
- }
- public final String outputToCSV( final int smoothing ) {
- return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte(smoothing) );
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java
deleted file mode 100755
index 9752b1dee..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import org.broadinstitute.sting.commandline.Argument;
-import org.broadinstitute.sting.commandline.Hidden;
-
-/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Nov 27, 2009
- *
- * A collection of the arguments that are common to both CovariateCounterWalker and TableRecalibrationWalker.
- * This set of arguments will also be passed to the constructor of every Covariate when it is instantiated.
- */
-
-public class RecalibrationArgumentCollection {
-
- //////////////////////////////////
- // Shared Command Line Arguments
- //////////////////////////////////
- @Hidden
- @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
- public String DEFAULT_PLATFORM = null;
- @Hidden
- @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
- public String FORCE_PLATFORM = null;
- @Hidden
- @Argument(fullName = "window_size_nqs", shortName = "nqs", doc = "The window size used by MinimumNQSCovariate for its calculation", required = false)
- public int WINDOW_SIZE = 5;
-
- /**
- * CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the
- * reads which have had the reference inserted because of color space inconsistencies.
- */
- @Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
- public RecalDataManager.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO;
-
- /**
- * CountCovariates and TableRecalibration accept a --solid_nocall_strategy flag which governs how the recalibrator handles
- * no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in
- * their color space tag can not be recalibrated.
- */
- @Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false)
- public RecalDataManager.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
-
- /**
- * The context covariate will use a context of this size to calculate it's covariate value
- */
- @Argument(fullName = "context_size", shortName = "cs", doc = "size of the k-mer context to be used", required = false)
- public int CONTEXT_SIZE = 8;
-
- /**
- * This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
- */
- @Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false)
- public int HOMOPOLYMER_NBACK = 7;
-
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
deleted file mode 100644
index 8eaa0198a..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
+++ /dev/null
@@ -1,584 +0,0 @@
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import net.sf.samtools.*;
-import net.sf.samtools.util.SequenceUtil;
-import org.broadinstitute.sting.commandline.*;
-import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
-import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.*;
-import org.broadinstitute.sting.utils.QualityUtils;
-import org.broadinstitute.sting.utils.Utils;
-import org.broadinstitute.sting.utils.baq.BAQ;
-import org.broadinstitute.sting.utils.classloader.PluginManager;
-import org.broadinstitute.sting.utils.collections.NestedHashMap;
-import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
-import org.broadinstitute.sting.utils.exceptions.UserException;
-import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-import org.broadinstitute.sting.utils.text.TextFormattingUtils;
-import org.broadinstitute.sting.utils.text.XReadLines;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.MissingResourceException;
-import java.util.ResourceBundle;
-import java.util.regex.Pattern;
-
-/**
- * Second pass of the base quality score recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate.
- *
- *
- * This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For each
- * base in each read this walker calculates various user-specified covariates (such as read group, reported quality score,
- * cycle, and dinuc). Using these values as a key in a large hashmap the walker calculates an empirical base quality score
- * and overwrites the quality score currently in the read. This walker then outputs a new bam file with these updated (recalibrated) reads.
- *
- *
- * See the GATK wiki for a tutorial and example recalibration accuracy plots.
- * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
- *
- *
Input
- *
- * The input read data whose base quality scores need to be recalibrated.
- *
- * The recalibration table file in CSV format that was generated by the CountCovariates walker.
- *
- *
- * Output
- *
- * A bam file in which the quality scores in each read have been recalibrated.
- *
- *
- * Examples
- *
- * java -Xmx4g -jar GenomeAnalysisTK.jar \
- * -R resources/Homo_sapiens_assembly18.fasta \
- * -I my_reads.bam \
- * -T TableRecalibration \
- * -o my_reads.recal.bam \
- * -recalFile my_reads.recal_data.csv
- *
- */
-
-@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
-@WalkerName("TableRecalibration")
-@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
-// This walker requires -I input.bam, it also requires -R reference.fasta
-public class TableRecalibrationWalker extends ReadWalker {
-
- public static final String PROGRAM_RECORD_NAME = "GATK TableRecalibration";
-
- /////////////////////////////
- // Shared Arguments
- /////////////////////////////
- @ArgumentCollection
- private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
-
- /////////////////////////////
- // Command Line Arguments
- /////////////////////////////
- /**
- * After the header, data records occur one per line until the end of the file. The first several items on a line are the
- * values of the individual covariates and will change depending on which covariates were specified at runtime. The last
- * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
- * and the raw empirical quality score calculated by phred-scaling the mismatch rate.
- */
- @Input(fullName = "recal_file", shortName = "recalFile", required = true, doc = "Filename for the input covariates table recalibration .csv file")
- public File RECAL_FILE = null;
- /**
- * A new bam file in which the quality scores in each read have been recalibrated. The alignment of the reads is left untouched.
- */
- @Output(doc = "The output recalibrated BAM file", required = true)
- private StingSAMFileWriter OUTPUT_BAM = null;
-
- /**
- * TableRacalibration accepts a --preserve_qscores_less_than / -pQ flag that instructs TableRecalibration to not modify
- * quality scores less than but rather just write them out unmodified in the recalibrated BAM file. This is useful
- * because Solexa writes Q2 and Q3 bases when the machine has really gone wrong. This would be fine in and of itself,
- * but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect,
- * your Q2 and Q3 bins can be elevated to Q8 or Q10, leading to issues downstream. With the default value of 5, all Q0-Q4 bases
- * are unmodified during recalibration, so they don't get inappropriately evaluated.
- */
- @Argument(fullName = "preserve_qscores_less_than", shortName = "pQ", doc = "Bases with quality scores less than this threshold won't be recalibrated. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required = false)
- private int PRESERVE_QSCORES_LESS_THAN = 5;
-
- /**
- * By default TableRecalibration applies a Yates' correction to account for overfitting when it calculates the empirical
- * quality score, in particular, ( # mismatches + 1 ) / ( # observations + 1 ). TableRecalibration accepts a --smoothing / -sm
- * argument which sets how many unobserved counts to add to every bin. Use --smoothing 0 to turn off all smoothing or, for example,
- * --smoothing 15 for a large amount of smoothing.
- */
- @Argument(fullName = "smoothing", shortName = "sm", required = false, doc = "Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
- private int SMOOTHING = 1;
-
- /**
- * Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
- * by capping at the specified value. We've found that Q40 is too low when using a more completely database of known variation like dbSNP build 132 or later.
- */
- @Argument(fullName = "max_quality_score", shortName = "maxQ", required = false, doc = "The integer value at which to cap the quality scores")
- private int MAX_QUALITY_SCORE = 50;
-
- /**
- * By default TableRecalibration emits the OQ field -- so you can go back and look at the original quality scores, rerun
- * the system using the OQ flags, etc, on the output BAM files; to turn off emission of the OQ field use this flag.
- */
- @Argument(fullName = "doNotWriteOriginalQuals", shortName = "noOQs", required = false, doc = "If true, we will not write the original quality (OQ) tag for each read")
- private boolean DO_NOT_WRITE_OQ = false;
-
- /////////////////////////////
- // Debugging-only Arguments
- /////////////////////////////
- @Hidden
- @Argument(fullName = "no_pg_tag", shortName = "noPG", required = false, doc = "Don't output the usual PG tag in the recalibrated bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
- private boolean NO_PG_TAG = false;
- @Hidden
- @Argument(fullName = "fail_with_no_eof_marker", shortName = "requireEOF", required = false, doc = "If no EOF marker is present in the covariates file, exit the program with an exception.")
- private boolean REQUIRE_EOF = false;
- @Hidden
- @Argument(fullName = "skipUQUpdate", shortName = "skipUQUpdate", required = false, doc = "If true, we will skip the UQ updating step for each read, speeding up the calculations")
- private boolean skipUQUpdate = false;
-
- /////////////////////////////
- // Private Member Variables
- /////////////////////////////
- private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
- private final ArrayList requestedCovariates = new ArrayList(); // List of covariates to be used in this calculation
- public static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
- public static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
- public static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
- public static final String EOF_MARKER = "EOF";
- private long numReadsWithMalformedColorSpace = 0;
-
- /////////////////////////////
- // Optimization
- /////////////////////////////
- private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values.
-
- //---------------------------------------------------------------------------------------------------------------
- //
- // initialize
- //
- //---------------------------------------------------------------------------------------------------------------
-
- /**
- * Read in the recalibration table input file.
- * Parse the list of covariate classes used during CovariateCounterWalker.
- * Parse the CSV data and populate the hashmap.
- */
- public void initialize() {
-
- if (RAC.FORCE_PLATFORM != null) {
- RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
- }
-
- // Get a list of all available covariates
- final List> classes = new PluginManager(Covariate.class).getPlugins();
-
- int lineNumber = 0;
- boolean foundAllCovariates = false;
-
- // Read in the data from the csv file and populate the data map and covariates list
- logger.info("Reading in the data from input csv file...");
-
- boolean sawEOF = false;
- try {
- for (String line : new XReadLines(RECAL_FILE)) {
- lineNumber++;
- if (EOF_MARKER.equals(line)) {
- sawEOF = true;
- }
- else if (COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
- ; // Skip over the comment lines, (which start with '#')
- }
- // Read in the covariates that were used from the input file
- else if (COVARIATE_PATTERN.matcher(line).matches()) { // The line string is either specifying a covariate or is giving csv data
- if (foundAllCovariates) {
- throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE);
- }
- else { // Found the covariate list in input file, loop through all of them and instantiate them
- String[] vals = line.split(",");
- for (int iii = 0; iii < vals.length - 3; iii++) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
- boolean foundClass = false;
- for (Class> covClass : classes) {
- if ((vals[iii] + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) {
- foundClass = true;
- try {
- Covariate covariate = (Covariate) covClass.newInstance();
- requestedCovariates.add(covariate);
- } catch (Exception e) {
- throw new DynamicClassResolutionException(covClass, e);
- }
-
- }
- }
-
- if (!foundClass) {
- throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option.");
- }
- }
- }
-
- }
- else { // Found a line of data
- if (!foundAllCovariates) {
- foundAllCovariates = true;
-
- // At this point all the covariates should have been found and initialized
- if (requestedCovariates.size() < 2) {
- throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE);
- }
-
- final boolean createCollapsedTables = true;
-
- // Initialize any covariate member variables using the shared argument collection
- for (Covariate cov : requestedCovariates) {
- cov.initialize(RAC);
- }
- // Initialize the data hashMaps
- dataManager = new RecalDataManager(createCollapsedTables, requestedCovariates.size());
-
- }
- addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap
- }
- }
-
- } catch (FileNotFoundException e) {
- throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
- } catch (NumberFormatException e) {
- throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
- }
- logger.info("...done!");
-
- if (!sawEOF) {
- final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool.";
- if (REQUIRE_EOF)
- throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
- logger.warn(errorMessage);
- }
-
- logger.info("The covariates being used here: ");
- for (Covariate cov : requestedCovariates) {
- logger.info("\t" + cov.getClass().getSimpleName());
- }
-
- if (dataManager == null) {
- throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?");
- }
-
- // Create the tables of empirical quality scores that will be used in the sequential calculation
- logger.info("Generating tables of empirical qualities for use in sequential calculation...");
- dataManager.generateEmpiricalQualities(SMOOTHING, MAX_QUALITY_SCORE);
- logger.info("...done!");
-
- // Take the header of the input SAM file and tweak it by adding in a new programRecord with the version number and list of covariates that were used
- final SAMFileHeader header = getToolkit().getSAMFileHeader().clone();
- if (!NO_PG_TAG) {
- final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME);
- final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText");
- try {
- final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version");
- programRecord.setProgramVersion(version);
- } catch (MissingResourceException e) {
- }
-
- StringBuffer sb = new StringBuffer();
- sb.append(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this));
- sb.append(" Covariates=[");
- for (Covariate cov : requestedCovariates) {
- sb.append(cov.getClass().getSimpleName());
- sb.append(", ");
- }
- sb.setCharAt(sb.length() - 2, ']');
- sb.setCharAt(sb.length() - 1, ' ');
- programRecord.setCommandLine(sb.toString());
-
- List oldRecords = header.getProgramRecords();
- List newRecords = new ArrayList(oldRecords.size() + 1);
- for (SAMProgramRecord record : oldRecords) {
- if (!record.getId().startsWith(PROGRAM_RECORD_NAME))
- newRecords.add(record);
- }
- newRecords.add(programRecord);
- header.setProgramRecords(newRecords);
-
- // Write out the new header
- OUTPUT_BAM.writeHeader(header);
- }
- }
-
- /**
- * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
- *
- * @param line A line of CSV data read from the recalibration table data file
- */
- private void addCSVData(final File file, final String line) {
- final String[] vals = line.split(",");
-
- // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
- if (vals.length != requestedCovariates.size() + 3) { // +3 because of nObservations, nMismatch, and Qempirical
- throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
- " --Perhaps the read group string contains a comma and isn't being parsed correctly.");
- }
-
- final Object[] key = new Object[requestedCovariates.size()];
- Covariate cov;
- int iii;
- for (iii = 0; iii < requestedCovariates.size(); iii++) {
- cov = requestedCovariates.get(iii);
- key[iii] = cov.getValue(vals[iii]);
- }
-
- // Create a new datum using the number of observations, number of mismatches, and reported quality score
- final RecalDatum datum = new RecalDatum(Long.parseLong(vals[iii]), Long.parseLong(vals[iii + 1]), Double.parseDouble(vals[1]), 0.0);
- // Add that datum to all the collapsed tables which will be used in the sequential calculation
- dataManager.addToAllTables(key, datum, PRESERVE_QSCORES_LESS_THAN);
- }
-
- //---------------------------------------------------------------------------------------------------------------
- //
- // map
- //
- //---------------------------------------------------------------------------------------------------------------
-
- /**
- * For each base in the read calculate a new recalibrated quality score and replace the quality scores in the read
- *
- * @param refBases References bases over the length of the read
- * @param read The read to be recalibrated
- * @return The read with quality scores replaced
- */
- public SAMRecord map(ReferenceContext refBases, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
-
- if (read.getReadLength() == 0) { // Some reads have '*' as the SEQ field and samtools returns length zero. We don't touch these reads.
- return read;
- }
-
- RecalDataManager.parseSAMRecord(read, RAC);
-
- byte[] originalQuals = read.getBaseQualities();
- final byte[] recalQuals = originalQuals.clone();
-
- final String platform = read.getReadGroup().getPlatform();
- if (platform.toUpperCase().contains("SOLID") && !(RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING)) {
- if (!(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION)) {
- final boolean badColor = RecalDataManager.checkNoCallColorSpace(read);
- if (badColor) {
- numReadsWithMalformedColorSpace++;
- if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) {
- return read; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them
- }
- else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) {
- read.setReadFailsVendorQualityCheckFlag(true);
- return read;
- }
- }
- }
- originalQuals = RecalDataManager.calcColorSpace(read, originalQuals, RAC.SOLID_RECAL_MODE, refBases == null ? null : refBases.getBases());
- }
-
- //compute all covariate values for this read
- final Comparable[][] covariateValues_offset_x_covar = RecalDataManager.computeCovariates(read, requestedCovariates);
-
- // For each base in the read
- for (int offset = 0; offset < read.getReadLength(); offset++) {
-
- final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
-
- Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
- if (qualityScore == null) {
- qualityScore = performSequentialQualityCalculation(fullCovariateKey);
- qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
- }
-
- recalQuals[offset] = qualityScore;
- }
-
- preserveQScores(originalQuals, recalQuals); // Overwrite the work done if original quality score is too low
-
- read.setBaseQualities(recalQuals); // Overwrite old qualities with new recalibrated qualities
- if (!DO_NOT_WRITE_OQ && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null) { // Save the old qualities if the tag isn't already taken in the read
- try {
- read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, SAMUtils.phredToFastq(originalQuals));
- } catch (IllegalArgumentException e) {
- throw new UserException.MalformedBAM(read, "illegal base quality encountered; " + e.getMessage());
- }
- }
-
- if (!skipUQUpdate && refBases != null && read.getAttribute(SAMTag.UQ.name()) != null) {
- read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, refBases.getBases(), read.getAlignmentStart() - 1, false));
- }
-
- if (RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N && refBases != null && read.getAttribute(SAMTag.NM.name()) != null) {
- read.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(read, refBases.getBases(), read.getAlignmentStart() - 1, false));
- }
-
- return read;
- }
-
- /**
- * Implements a serial recalibration of the reads using the combinational table.
- * First, we perform a positional recalibration, and then a subsequent dinuc correction.
- *
- * Given the full recalibration table, we perform the following preprocessing steps:
- *
- * - calculate the global quality score shift across all data [DeltaQ]
- * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
- * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
- * - The final shift equation is:
- *
- * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
- *
- * @param key The list of Comparables that were calculated from the covariates
- * @return A recalibrated quality score as a byte
- */
- private byte performSequentialQualityCalculation(final Object... key) {
-
- final byte qualFromRead = (byte) Integer.parseInt(key[1].toString());
- final Object[] readGroupCollapsedKey = new Object[1];
- final Object[] qualityScoreCollapsedKey = new Object[2];
- final Object[] covariateCollapsedKey = new Object[3];
-
- // The global quality shift (over the read group only)
- readGroupCollapsedKey[0] = key[0];
- final RecalDatum globalRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(0).get(readGroupCollapsedKey));
- double globalDeltaQ = 0.0;
- if (globalRecalDatum != null) {
- final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
- final double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
- globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
- }
-
- // The shift in quality between reported and empirical
- qualityScoreCollapsedKey[0] = key[0];
- qualityScoreCollapsedKey[1] = key[1];
- final RecalDatum qReportedRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(1).get(qualityScoreCollapsedKey));
- double deltaQReported = 0.0;
- if (qReportedRecalDatum != null) {
- final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
- deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
- }
-
- // The shift in quality due to each covariate by itself in turn
- double deltaQCovariates = 0.0;
- double deltaQCovariateEmpirical;
- covariateCollapsedKey[0] = key[0];
- covariateCollapsedKey[1] = key[1];
- for (int iii = 2; iii < key.length; iii++) {
- covariateCollapsedKey[2] = key[iii]; // The given covariate
- final RecalDatum covariateRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(iii).get(covariateCollapsedKey));
- if (covariateRecalDatum != null) {
- deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
- deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported));
- }
- }
-
- final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
- return QualityUtils.boundQual((int) Math.round(newQuality), (byte) MAX_QUALITY_SCORE);
-
- // Verbose printouts used to validate with old recalibrator
- //if(key.contains(null)) {
- // System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d",
- // qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte));
- //}
- //else {
- // System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d",
- // key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) );
- //}
-
- //return newQualityByte;
- }
-
- /**
- * Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
- *
- * @param originalQuals The list of original base quality scores
- * @param recalQuals A list of the new recalibrated quality scores
- */
- private void preserveQScores(final byte[] originalQuals, final byte[] recalQuals) {
- for (int iii = 0; iii < recalQuals.length; iii++) {
- if (originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN) {
- recalQuals[iii] = originalQuals[iii];
- }
- }
- }
-
- //---------------------------------------------------------------------------------------------------------------
- //
- // reduce
- //
- //---------------------------------------------------------------------------------------------------------------
-
- /**
- * Start the reduce with a handle to the output bam file
- *
- * @return A FileWriter pointing to a new bam file
- */
- public SAMFileWriter reduceInit() {
- return OUTPUT_BAM;
- }
-
- /**
- * Output each read to disk
- *
- * @param read The read to output
- * @param output The FileWriter to write the read to
- * @return The FileWriter
- */
- public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
- if (output != null) {
- output.addAlignment(read);
- }
- return output;
- }
-
- /**
- * Do nothing
- *
- * @param output The SAMFileWriter that outputs the bam file
- */
- public void onTraversalDone(SAMFileWriter output) {
- if (numReadsWithMalformedColorSpace != 0) {
- if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) {
- Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
- "because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
- "for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
- "These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!");
- }
- else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) {
- Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
- "because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
- "for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
- "These reads were completely removed from the output bam file.");
-
- }
- }
- }
-}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java
deleted file mode 100755
index 04c7caf5d..000000000
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java
+++ /dev/null
@@ -1,368 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import org.broadinstitute.sting.WalkerTest;
-import org.broadinstitute.sting.utils.exceptions.UserException;
-import org.testng.annotations.DataProvider;
-import org.testng.annotations.Test;
-
-import java.io.File;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-public class RecalibrationWalkersIntegrationTest extends WalkerTest {
- static HashMap paramsFiles = new HashMap();
- static HashMap paramsFilesSolidIndels = new HashMap();
-
- private static final class CCTest extends TestDataProvider {
- String file, md5;
-
- private CCTest(final String file, final String md5) {
- super(CCTest.class);
- this.file = file;
- this.md5 = md5;
- }
-
- public String toString() {
- return "CCTest: " + file;
- }
- }
-
- @DataProvider(name = "cctestdata")
- public Object[][] createCCTestData() {
-
- new CCTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "ab4940a16ab990181bd8368c76b23853" );
- new CCTest( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "17d4b8001c982a70185e344929cf3941");
- new CCTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "714e65d6cb51ae32221a77ce84cbbcdc" );
- new CCTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "932f0063abb2a23c22ec992ef8d36aa5" );
- return CCTest.getTests(CCTest.class);
- }
-
- @Test(dataProvider = "cctestdata")
- public void testCountCovariates1(CCTest test) {
- testCC(test, "");
- }
-
- @Test(dataProvider = "cctestdata")
- public void testCountCovariates4(CCTest test) {
- testCC(test, " -nt 4");
- }
-
- private final void testCC(CCTest test, String parallelism) {
- String bam = test.file;
- String md5 = test.md5;
-
- WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
- "-R " + b36KGReference +
- " -knownSites " + b36dbSNP129 +
- " -T CountCovariates" +
- " -I " + bam +
- ( bam.equals( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" )
- ? " -L 1:10,800,000-10,810,000" : " -L 1:10,000,000-10,200,000" ) +
- " -cov ReadGroupCovariate" +
- " -cov QualityScoreCovariate" +
- " -cov CycleCovariate" +
- " -cov DinucCovariate" +
- " --solid_recal_mode SET_Q_ZERO" +
- " -recalFile %s" + parallelism,
- 1, // just one output file
- Arrays.asList(md5));
- List result = executeTest("testCountCovariates1" + parallelism, spec).getFirst();
- paramsFiles.put(bam, result.get(0).getAbsolutePath());
- }
-
-
- private static final class TRTest extends TestDataProvider {
- String file, md5;
-
- private TRTest(final String file, final String md5) {
- super(TRTest.class);
- this.file = file;
- this.md5 = md5;
- }
-
- public String toString() {
- return "TRTest: " + file;
- }
- }
-
- @DataProvider(name = "trtestdata")
- public Object[][] createTRTestData() {
- new TRTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "0b7123ae9f4155484b68e4a4f96c5504" );
- new TRTest( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "d04cf1f6df486e45226ebfbf93a188a5");
- new TRTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "74314e5562c1a65547bb0edaacffe602" );
- new TRTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "41c2f82f7789421f3690ed3c35b8f2e4" );
-
- return TRTest.getTests(TRTest.class);
- }
-
- @Test(dataProvider = "trtestdata", dependsOnMethods = "testCountCovariates1")
- public void testTableRecalibrator1(TRTest test) {
- String bam = test.file;
- String md5 = test.md5;
- String paramsFile = paramsFiles.get(bam);
- System.out.printf("PARAMS FOR %s is %s%n", bam, paramsFile);
- if ( paramsFile != null ) {
- WalkerTestSpec spec = new WalkerTestSpec(
- "-R " + b36KGReference +
- " -T TableRecalibration" +
- " -I " + bam +
- ( bam.equals( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" )
- ? " -L 1:10,800,000-10,810,000" : " -L 1:10,100,000-10,300,000" ) +
- " -o %s" +
- " --no_pg_tag" +
- " --solid_recal_mode SET_Q_ZERO" +
- " -recalFile " + paramsFile,
- 1, // just one output file
- Arrays.asList(md5));
- executeTest("testTableRecalibrator1", spec);
- }
- else {
- throw new IllegalStateException("testTableRecalibrator1: paramsFile was null");
- }
- }
-
- @Test
- public void testCountCovariatesUseOriginalQuals() {
- HashMap e = new HashMap();
- e.put( validationDataLocation + "originalQuals.1kg.chr1.1-1K.bam", "0b88d0e8c97e83bdeee2064b6730abff");
-
- for ( Map.Entry entry : e.entrySet() ) {
- String bam = entry.getKey();
- String md5 = entry.getValue();
-
- WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
- "-R " + b36KGReference +
- " -T CountCovariates" +
- " -I " + bam +
- " -L 1:1-1,000" +
- " -standard" +
- " -OQ" +
- " -recalFile %s" +
- " -knownSites " + b36dbSNP129,
- 1, // just one output file
- Arrays.asList(md5));
- executeTest("testCountCovariatesUseOriginalQuals", spec);
- }
- }
-
- @Test(dependsOnMethods = "testCountCovariates1")
- public void testTableRecalibratorMaxQ70() {
- HashMap e = new HashMap();
- e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "0b7123ae9f4155484b68e4a4f96c5504" );
-
- for ( Map.Entry entry : e.entrySet() ) {
- String bam = entry.getKey();
- String md5 = entry.getValue();
- String paramsFile = paramsFiles.get(bam);
- System.out.printf("PARAMS FOR %s is %s%n", bam, paramsFile);
- if ( paramsFile != null ) {
- WalkerTestSpec spec = new WalkerTestSpec(
- "-R " + b36KGReference +
- " -T TableRecalibration" +
- " -I " + bam +
- ( bam.equals( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" )
- ? " -L 1:10,800,000-10,810,000" : " -L 1:10,100,000-10,300,000" ) +
- " -o %s" +
- " --no_pg_tag" +
- " -maxQ 70" +
- " --solid_recal_mode SET_Q_ZERO" +
- " -recalFile " + paramsFile,
- 1, // just one output file
- Arrays.asList(md5));
- executeTest("testTableRecalibratorMaxQ70", spec);
- }
- else {
- throw new IllegalStateException("testTableRecalibratorMaxQ70: paramsFile was null");
- }
- }
- }
-
- @Test
- public void testCountCovariatesSolidIndelsRemoveRefBias() {
- HashMap e = new HashMap();
- e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "8379f24cf5312587a1f92c162ecc220f" );
-
- for ( Map.Entry entry : e.entrySet() ) {
- String bam = entry.getKey();
- String md5 = entry.getValue();
-
- WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
- "-R " + b36KGReference +
- " -knownSites " + b36dbSNP129 +
- " -T CountCovariates" +
- " -I " + bam +
- " -standard" +
- " -U" +
- " -L 1:10,000,000-20,000,000" +
- " --solid_recal_mode REMOVE_REF_BIAS" +
- " -recalFile %s",
- 1, // just one output file
- Arrays.asList(md5));
- List result = executeTest("testCountCovariatesSolidIndelsRemoveRefBias", spec).getFirst();
- paramsFilesSolidIndels.put(bam, result.get(0).getAbsolutePath());
- }
- }
-
- @Test(dependsOnMethods = "testCountCovariatesSolidIndelsRemoveRefBias")
- public void testTableRecalibratorSolidIndelsRemoveRefBias() {
- HashMap e = new HashMap();
- e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "2ad4c17ac3ed380071137e4e53a398a5" );
-
- for ( Map.Entry entry : e.entrySet() ) {
- String bam = entry.getKey();
- String md5 = entry.getValue();
- String paramsFile = paramsFilesSolidIndels.get(bam);
- System.out.printf("PARAMS FOR %s is %s%n", bam, paramsFile);
- if ( paramsFile != null ) {
- WalkerTestSpec spec = new WalkerTestSpec(
- "-R " + b36KGReference +
- " -T TableRecalibration" +
- " -I " + bam +
- " -o %s" +
- " --no_pg_tag" +
- " -U" +
- " -L 1:10,000,000-20,000,000" +
- " --solid_recal_mode REMOVE_REF_BIAS" +
- " -recalFile " + paramsFile,
- 1, // just one output file
- Arrays.asList(md5));
- executeTest("testTableRecalibratorSolidIndelsRemoveRefBias", spec);
- }
- else {
- throw new IllegalStateException("testTableRecalibratorSolidIndelsRemoveRefBias: paramsFile was null");
- }
- }
- }
-
- @Test
- public void testCountCovariatesBED() {
- HashMap e = new HashMap();
- e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "7e973328751d233653530245d404a64d");
-
- for ( Map.Entry entry : e.entrySet() ) {
- String bam = entry.getKey();
- String md5 = entry.getValue();
-
- WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
- "-R " + b36KGReference +
- " -knownSites:bed " + validationDataLocation + "recalibrationTest.bed" +
- " -T CountCovariates" +
- " -I " + bam +
- " -L 1:10,000,000-10,200,000" +
- " -standard" +
- " --solid_recal_mode SET_Q_ZERO" +
- " -recalFile %s",
- 1, // just one output file
- Arrays.asList(md5));
- executeTest("testCountCovariatesBED", spec);
- }
- }
-
- @Test
- public void testCountCovariatesVCFPlusDBsnp() {
- HashMap e = new HashMap();
- e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "fd9e37879069aa6d84436c25e472b9e9");
-
- for ( Map.Entry entry : e.entrySet() ) {
- String bam = entry.getKey();
- String md5 = entry.getValue();
-
- WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
- "-R " + b36KGReference +
- " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf" +
- " -T CountCovariates" +
- " -I " + bam +
- " -knownSites " + b36dbSNP129 +
- " -L 1:10,000,000-10,200,000" +
- " -cov ReadGroupCovariate" +
- " -cov QualityScoreCovariate" +
- " -cov CycleCovariate" +
- " -cov DinucCovariate" +
- " --solid_recal_mode SET_Q_ZERO" +
- " -recalFile %s",
- 1, // just one output file
- Arrays.asList(md5));
- executeTest("testCountCovariatesVCFPlusDBsnp", spec);
- }
- }
-
- @Test
- public void testCountCovariatesNoIndex() {
- HashMap e = new HashMap();
- e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "aac7df368ca589dc0a66d5bd9ad007e3" );
-
- for ( Map.Entry entry : e.entrySet() ) {
- String bam = entry.getKey();
- String md5 = entry.getValue();
-
- WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
- "-R " + b36KGReference +
- " -knownSites " + b36dbSNP129 +
- " -T CountCovariates" +
- " -I " + bam +
- " -cov ReadGroupCovariate" +
- " -cov QualityScoreCovariate" +
- " --solid_recal_mode DO_NOTHING" +
- " -recalFile %s" +
- " -U",
- 1, // just one output file
- Arrays.asList(md5));
- List result = executeTest("testCountCovariatesNoIndex", spec).getFirst();
- paramsFiles.put(bam, result.get(0).getAbsolutePath());
- }
- }
-
- @Test(dependsOnMethods = "testCountCovariatesNoIndex")
- public void testTableRecalibratorNoIndex() {
- HashMap e = new HashMap();
- e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "02249d9933481052df75c58a2a1a8e63" );
-
- for ( Map.Entry entry : e.entrySet() ) {
- String bam = entry.getKey();
- String md5 = entry.getValue();
- String paramsFile = paramsFiles.get(bam);
- System.out.printf("PARAMS FOR %s is %s%n", bam, paramsFile);
- if ( paramsFile != null ) {
- WalkerTestSpec spec = new WalkerTestSpec(
- "-R " + b36KGReference +
- " -T TableRecalibration" +
- " -I " + bam +
- " -o %s" +
- " --no_pg_tag" +
- " --solid_recal_mode DO_NOTHING" +
- " -recalFile " + paramsFile +
- " -U",
- 1, // just one output file
- Arrays.asList(md5));
- executeTest("testTableRecalibratorNoIndex", spec);
- }
- else {
- throw new IllegalStateException("testTableRecalibratorNoIndex: paramsFile was null");
- }
- }
- }
-
- @Test
- public void testCountCovariatesFailWithoutDBSNP() {
- HashMap e = new HashMap();
- e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "");
-
- for ( Map.Entry entry : e.entrySet() ) {
- String bam = entry.getKey();
- WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
- "-R " + b36KGReference +
- " -T CountCovariates" +
- " -I " + bam +
- " -L 1:10,000,000-10,200,000" +
- " -standard" +
- " --solid_recal_mode SET_Q_ZERO" +
- " -recalFile %s",
- 1, // just one output file
- UserException.CommandLineException.class);
- executeTest("testCountCovariatesFailWithoutDBSNP", spec);
- }
- }
-
-}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersLargeScaleTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersLargeScaleTest.java
deleted file mode 100755
index 3f956e3b9..000000000
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersLargeScaleTest.java
+++ /dev/null
@@ -1,80 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.recalibration;
-
-import org.broadinstitute.sting.WalkerTest;
-import org.testng.annotations.Test;
-
-import java.util.ArrayList;
-
-
-public class RecalibrationWalkersLargeScaleTest extends WalkerTest {
-
- private void testCountCovariatesWholeGenomeRunner(String moreArgs) {
- WalkerTestSpec spec = new WalkerTestSpec(
- "-R " + hg18Reference +
- " -T CountCovariates" +
- " -I " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.bam" +
- " -L chr1:1-50,000,000" +
- " -standard" +
- " -OQ" +
- " -knownSites " + GATKDataLocation + "dbsnp_132.hg18.vcf" +
- " -recalFile /dev/null" + moreArgs,
- 0,
- new ArrayList(0));
- executeTest("testCountCovariatesWholeGenome", spec);
- }
-
- private void testCountCovariatesWholeExomeRunner(String moreArgs) {
- WalkerTestSpec spec = new WalkerTestSpec(
- "-R " + hg18Reference +
- " -T CountCovariates" +
- " -I " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.bam" +
- " -L " + evaluationDataLocation + "whole_exome_agilent_designed_120.targets.chr1.interval_list" +
- " -standard" +
- " -OQ" +
- " -knownSites " + GATKDataLocation + "dbsnp_132.hg18.vcf" +
- " -recalFile /dev/null" + moreArgs,
- 0,
- new ArrayList(0));
- executeTest("testCountCovariatesWholeExome", spec);
- }
-
- @Test
- public void testCountCovariatesWholeGenome() { testCountCovariatesWholeGenomeRunner(""); }
- @Test
- public void testCountCovariatesWholeGenomeParallel() { testCountCovariatesWholeGenomeRunner(" -nt 4"); }
-
- @Test
- public void testCountCovariatesWholeExome() { testCountCovariatesWholeExomeRunner(""); }
- @Test
- public void testCountCovariatesWholeExomeParallel() { testCountCovariatesWholeExomeRunner(" -nt 4"); }
-
- @Test
- public void testTableRecalibratorWholeGenome() {
- WalkerTestSpec spec = new WalkerTestSpec(
- "-R " + hg18Reference +
- " -T TableRecalibration" +
- " -I " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.bam" +
- " -L chr1:1-50,000,000" +
- " -OQ" +
- " -recalFile " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.recal.csv" +
- " -o /dev/null",
- 0,
- new ArrayList(0));
- executeTest("testTableRecalibratorWholeGenome", spec);
- }
-
- @Test
- public void testTableRecalibratorWholeExome() {
- WalkerTestSpec spec = new WalkerTestSpec(
- "-R " + hg18Reference +
- " -T TableRecalibration" +
- " -I " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.bam" +
- " -L " + evaluationDataLocation + "whole_exome_agilent_designed_120.targets.chr1.interval_list" +
- " -OQ" +
- " -recalFile " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.recal.csv" +
- " -o /dev/null",
- 0,
- new ArrayList(0));
- executeTest("testTableRecalibratorWholeExome", spec);
- }
-}