From 1d46de6d340ff180735df1b5f2894c2cffa4638d Mon Sep 17 00:00:00 2001 From: rpoplin Date: Mon, 23 Nov 2009 14:58:33 +0000 Subject: [PATCH] The old recalibrator is replaced with the refactored recalibrator. Added a version message to the logger output. These walkers start at version 2.0.0 git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2117 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/recalibration}/Covariate.java | 2 +- .../recalibration/CovariateCounter.java | 148 ---- .../recalibration/CovariateCounterWalker.java | 681 +++++++++------ .../recalibration}/CycleCovariate.java | 2 +- .../walkers/recalibration}/Dinuc.java | 2 +- .../recalibration}/DinucCovariate.java | 2 +- .../MappingQualityCovariate.java | 2 +- .../recalibration}/MinimumNQSCovariate.java | 2 +- .../walkers/recalibration}/NHashMap.java | 2 +- .../recalibration}/PositionCovariate.java | 2 +- .../recalibration}/PrimerRoundCovariate.java | 2 +- .../recalibration}/QualityScoreCovariate.java | 2 +- .../recalibration}/ReadGroupCovariate.java | 2 +- .../walkers/recalibration}/ReadHashDatum.java | 2 +- .../gatk/walkers/recalibration/RecalData.java | 230 ----- .../recalibration/RecalDataManager.java | 315 +++---- .../walkers/recalibration}/RecalDatum.java | 2 +- .../TableRecalibrationWalker.java | 796 ++++++++---------- .../Recalibration/CovariateCounterWalker.java | 484 ----------- .../Recalibration/RecalDataManager.java | 204 ----- .../TableRecalibrationWalker.java | 454 ---------- .../recalibration/CovariateCounterTest.java | 184 ---- .../walkers/recalibration/RecalDataTest.java | 183 ---- .../RecalibrationWalkersIntegrationTest.java | 10 +- ...edRecalibrationWalkersIntegrationTest.java | 71 -- .../sting/utils/ExpandingArrayListTest.java | 1 - .../sting/utils/MergingIteratorTest.java | 1 - 27 files changed, 981 insertions(+), 2807 deletions(-) rename java/src/org/broadinstitute/sting/{playground/gatk/walkers/Recalibration => gatk/walkers/recalibration}/Covariate.java (96%) mode change 100644 => 100755 delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounter.java mode change 100644 => 100755 java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java rename java/src/org/broadinstitute/sting/{playground/gatk/walkers/Recalibration => gatk/walkers/recalibration}/CycleCovariate.java (98%) rename java/src/org/broadinstitute/sting/{playground/gatk/walkers/Recalibration => gatk/walkers/recalibration}/Dinuc.java (94%) rename java/src/org/broadinstitute/sting/{playground/gatk/walkers/Recalibration => gatk/walkers/recalibration}/DinucCovariate.java (97%) rename java/src/org/broadinstitute/sting/{playground/gatk/walkers/Recalibration => gatk/walkers/recalibration}/MappingQualityCovariate.java (96%) rename java/src/org/broadinstitute/sting/{playground/gatk/walkers/Recalibration => gatk/walkers/recalibration}/MinimumNQSCovariate.java (97%) mode change 100644 => 100755 rename java/src/org/broadinstitute/sting/{playground/gatk/walkers/Recalibration => gatk/walkers/recalibration}/NHashMap.java (98%) rename java/src/org/broadinstitute/sting/{playground/gatk/walkers/Recalibration => gatk/walkers/recalibration}/PositionCovariate.java (97%) rename java/src/org/broadinstitute/sting/{playground/gatk/walkers/Recalibration => gatk/walkers/recalibration}/PrimerRoundCovariate.java (97%) rename java/src/org/broadinstitute/sting/{playground/gatk/walkers/Recalibration => gatk/walkers/recalibration}/QualityScoreCovariate.java (96%) mode change 100644 => 100755 rename java/src/org/broadinstitute/sting/{playground/gatk/walkers/Recalibration => gatk/walkers/recalibration}/ReadGroupCovariate.java (96%) rename java/src/org/broadinstitute/sting/{playground/gatk/walkers/Recalibration => gatk/walkers/recalibration}/ReadHashDatum.java (90%) delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalData.java mode change 100755 => 100644 java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java rename java/src/org/broadinstitute/sting/{playground/gatk/walkers/Recalibration => gatk/walkers/recalibration}/RecalDatum.java (98%) mode change 100755 => 100644 java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java delete mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java delete mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java delete mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java delete mode 100755 java/test/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterTest.java delete mode 100755 java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataTest.java delete mode 100755 java/test/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RefactoredRecalibrationWalkersIntegrationTest.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java old mode 100644 new mode 100755 similarity index 96% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java rename to java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java index b296715d6..86af70b21 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; +package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounter.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounter.java deleted file mode 100755 index 157d2a66c..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounter.java +++ /dev/null @@ -1,148 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.recalibration; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.BaseUtils; -import org.apache.log4j.Logger; - -import javax.management.RuntimeErrorException; -import java.util.*; - -public class CovariateCounter { - private boolean collapsePos = false; - private boolean collapseDinuc = false; - private boolean assumeFaultyHeader = false; - - private HashMap data = new HashMap(); - - protected static Logger logger = Logger.getLogger(CovariateCounter.class); - - public CovariateCounter( Set readGroups, boolean collapsePos, boolean collapseDinuc, boolean assumeFaultyHeader ) { - this.collapsePos = collapsePos; - this.collapseDinuc = collapseDinuc; - this.assumeFaultyHeader = assumeFaultyHeader; - - for (String readGroup : readGroups ) { - RecalDataManager manager = makeManager(readGroup); - data.put(readGroup, manager); - } - } - - private RecalDataManager makeManager(final String readGroup) { - return new RecalDataManager(readGroup, ! collapsePos, ! collapseDinuc ); - } - - /** - * Returns the set of readGroup names we are counting covariates for - * @return - */ - public Set getReadGroups() { - return data.keySet(); - } - - public boolean isCollapseDinuc() { - return collapseDinuc; - } - - public boolean isCollapsePos() { - return collapsePos; - } - - /** - * Returns the number of read groups being managed - * @return - */ - public int getNReadGroups() { - return data.size(); - } - - /** - * Get the particular RecalData datum associated with readGroup, at machine pos, with reported - * quality qual, and with the dinuc context of prevBase, base. If an example of such a - * base has been seen before, returns the associated RecalData. If not, it creates one, places it in the - * system so that subsequent requests will return that object, and returns it. - * - * @param readGroup - * @param pos - * @param qual - * @param prevBase - * @param base - * @return - */ - public RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) { - if ( ! data.containsKey(readGroup) ) { - if ( assumeFaultyHeader ) { - logger.info(String.format("Found unexpected read group, but assuming the header was bad, so extending covariates with read group %s", readGroup)); - RecalDataManager manager = makeManager(readGroup); - data.put(readGroup, manager); - } else { - throw new RuntimeException(String.format("Unexpected read group %s found, there's something wrong with your BAM file's header", readGroup)); - } - } - - - byte[] cs = {(byte)prevBase, (byte)base}; - String s = new String(cs); - return data.get(readGroup).expandingGetRecalData(pos, qual, s, true); - } - - /** - * Get a list of all of the RecalData associated with readGroup - * - * @param readGroup - * @return - */ - public List getRecalData(String readGroup) { - return data.get(readGroup).getAll(); - } - - /** - * Updates the recalibration data for the base at offset in the read, associated with readGroup rg. - * Correctly handles machine orientation of the read. I.e., it adds data not by offset in the read - * but by implied machine cycle associated with the offset. - * - * TODO: this whole system is 0-based and therefore inconsisent with the rest of the GATK, where pos is 1-based - * TODO: and offset is 0-based. How very annoying. - * - * @param rg - * @param read - * @param offset - * @param ref - * @return - */ - public int updateDataFromRead( String rg, SAMRecord read, int offset, char ref, boolean useOriginalQuals ) { - if ( offset == 0 ) - throw new RuntimeException("Illegal read offset " + offset + " in read " + read.getReadName()); - - int cycle = offset; - byte[] bases = read.getReadBases(); - byte[] quals = RecalDataManager.getQualsForRecalibration(read, useOriginalQuals); - - char base = (char)bases[offset]; - char prevBase = (char)bases[offset - 1]; - - if (read.getReadNegativeStrandFlag()) { - ref = BaseUtils.simpleComplement(ref); - base = BaseUtils.simpleComplement(base); - prevBase = BaseUtils.simpleComplement((char)bases[offset+1]); - cycle = read.getReadLength() - (offset + 1); - } - - int qual = quals[offset]; - if ( qual > 0 ) { - RecalData datum = getRecalData(rg, cycle, qual, prevBase, base); - if (datum != null) datum.inc(base,ref); - return 1; - } else { - return 0; - } - } - - public void printState() { - for ( String readGroup : getReadGroups() ) { - for ( RecalData datum : getRecalData(readGroup) ) { - if ( datum.N > 0 ) - System.out.println(datum); - } - } - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java old mode 100644 new mode 100755 index c860993f2..2c25b4c83 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -1,292 +1,485 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.RODRecordList; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.Variation; + +import java.io.PrintStream; +import java.io.FileNotFoundException; +import java.util.*; + import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMReadGroupRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.WalkerName; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.*; -import java.util.*; -import java.io.PrintStream; -import java.io.FileNotFoundException; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ -@WalkerName("CountCovariates") +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 3, 2009 + * + * This walker is designed to work as the first pass in a two-pass processing step. + * It does a by-locus traversal operating only at sites that are not in dbSNP. + * We assume that all mismatches we see are therefore errors. + * This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinuc) + * Since there is a large amount of data one can then calculate an empirical probability of error + * given the particular covariates seen at this site, where p(error) = num mismatches / num observations + * The output file is a CSV list of (several covariate values, num observations, num mismatches, empirical quality score) + * The first lines of the output file give the name of the covariate classes that were used for this calculation. + * + * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and must be at the start of the list. + * Note: This walker is designed to be used in conjunction with TableRecalibrationWalker. + */ + +@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file +@WalkerName( "CountCovariates" ) +//@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality // BUGBUG taken out to match old integration tests +@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta public class CovariateCounterWalker extends LocusWalker { - @Argument(fullName="buggyMaxReadLen", doc="If we see a read longer than this, we assume there's a bug and abort", required=false) - public int buggyMaxReadLen = 100000; - @Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, doc="Depreciated output file root -- now use --params to directly specify the file output name") - public String OUTPUT_FILEROOT = null; // going to blow up if specified + @Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false) + private Boolean LIST_ONLY = false; + @Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false) + private String[] COVARIATES = null; + @Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false) + private boolean USE_ORIGINAL_QUALS = false; + @Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false) + private int WINDOW_SIZE = 3; + @Argument(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the outputted covariates table recalibration file") + private String RECAL_FILE = "output.recal_data.csv"; + @Argument(fullName="no_print_header", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file. For debugging purposes only.") + private boolean NO_PRINT_HEADER = false; + @Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. For debugging purposes only.") + private boolean VALIDATE_OLD_RECALIBRATOR = false; + @Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.") + private boolean USE_SLX_PLATFORM = false; - @Argument(fullName="params", shortName="params", required=false, doc="Filename root for the outputted logistic regression training files") - public String params = "output.recal_data.csv"; - - @Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score") - public int MIN_MAPPING_QUALITY = 1; + private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps + private ArrayList requestedCovariates; // A list to hold the covariate objects that were requested + private IdentityHashMap readDatumHashMap; // A hash map that hashes the read object itself into properties commonly pulled out of the read. Done for optimization purposes. + private int sizeOfReadDatumHashMap = 0; - @Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = * for all platforms)") - public List platforms = Collections.singletonList("*"); - //public List platforms = Collections.singletonList("ILLUMINA"); + private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file + private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file + private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file - @Argument(fullName="assumeFaultyHeader", required=false, doc="") - public boolean assumeFaultyHeader = false; + private final String versionNumber = "2.0.0"; // major version, minor version, and build number - //@Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="") - public boolean collapsePos = false; - - //@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="") - public boolean collapseDinuc = false; - - @Argument(fullName = "useOriginalQuals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false) - public boolean useOriginalQuals = false; - - - private CovariateCounter covariateCounter = null; - - private long counted_sites = 0; // number of sites used to count covariates - private long counted_bases = 0; // number of bases used to count covariates - private long skipped_sites = 0; // number of sites skipped because of a dbSNP entry - - // THIS IS A HACK required in order to reproduce the behavior of old (and imperfect) RODIterator and - // hence to pass the integration test. The new iterator this code is now using does see ALL the SNPs, - // whether masked by overlapping indels/other events or not. - //TODO process correctly all the returned dbSNP rods at each location - - - private Pair dbSNP_counts = new Pair(0l, 0l); // mismatch/base counts for dbSNP loci - private Pair novel_counts = new Pair(0l, 0l); // mismatch/base counts for non-dbSNP loci - private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) - private static final int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen) - private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation + //--------------------------------------------------------------------------------------------------------------- + // + // initialize + // + //--------------------------------------------------------------------------------------------------------------- /** - * Initialize the system. Setup the data CovariateCountry for the read groups in our header + * Parse the -cov arguments and create a list of covariates to be used here + * Based on the covariates' estimates for initial capacity allocate the data hashmap */ public void initialize() { - Set readGroups = new HashSet(); - for (SAMReadGroupRecord readGroup : this.getToolkit().getSAMFileHeader().getReadGroups()) { - if( readGroup.getAttribute("PL") == null ) - Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are supported",readGroup.getReadGroupId())); - if( !isSupportedReadGroup(readGroup) ) - continue; - readGroups.add(readGroup.getReadGroupId()); + + logger.info( "CovariateCounterWalker version: " + versionNumber ); + + // Get a list of all available covariates + final List> classes = PackageUtils.getClassesImplementingInterface(Covariate.class); + + // Print and exit if that's what was requested + if ( LIST_ONLY ) { + out.println( "Available covariates:" ); + for( Class covClass : classes ) { + out.println( covClass.getSimpleName() ); + } + out.println(); + + System.exit( 0 ); // early exit here because user requested it } // Warn the user if no dbSNP file was specified - if( this.getToolkit().getArguments().DBSNPFile == null ) { + boolean foundDBSNP = false; + for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) { + if( rod.getName().equalsIgnoreCase( "dbsnp" ) ) { + foundDBSNP = true; + } + } + if( !foundDBSNP ) { Utils.warnUser("This calculation is critically dependent on being able to skip over known variant sites. Are you sure you want to be running without a dbSNP rod specified?"); } - covariateCounter = new CovariateCounter(readGroups, collapsePos, collapseDinuc, assumeFaultyHeader); - logger.info(String.format("Created recalibration data collectors for %d read group(s)", covariateCounter.getNReadGroups())); - } - // -------------------------------------------------------------------------------------------------------------- - // - // map - // - // -------------------------------------------------------------------------------------------------------------- + // Initialize the requested covariates by parsing the -cov argument + requestedCovariates = new ArrayList(); + int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one + if( VALIDATE_OLD_RECALIBRATOR ) { + requestedCovariates.add( new ReadGroupCovariate() ); + requestedCovariates.add( new CycleCovariate() ); // unfortunately order is different here in order to match the old recalibrator exactly + requestedCovariates.add( new QualityScoreCovariate() ); + requestedCovariates.add( new DinucCovariate() ); + } else if( COVARIATES != null ) { + if(COVARIATES[0].equalsIgnoreCase("ALL")) { // the user wants ALL covariates to be used + requestedCovariates.add( new ReadGroupCovariate() ); // first add the required covariates then add the rest by looping over all implementing classes that were found + requestedCovariates.add( new QualityScoreCovariate() ); + for( Class covClass : classes ) { + try { + Covariate covariate = (Covariate)covClass.newInstance(); + estimatedCapacity *= covariate.estimatedNumberOfBins(); + // Some covariates need parameters (user supplied command line arguments) passed to them + if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); } + if( !( covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate ) ) { // these were already added so don't add them again + requestedCovariates.add( covariate ); + } + } catch ( InstantiationException e ) { + throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); + } catch ( IllegalAccessException e ) { + throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) ); + } + } + } else { // The user has specified a list of several covariates + int covNumber = 1; + for( String requestedCovariateString : COVARIATES ) { + boolean foundClass = false; + for( Class covClass : classes ) { + if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class + foundClass = true; + // Read Group Covariate and Quality Score Covariate are required covariates for the recalibration calculation and must begin the list + if( (covNumber == 1 && !requestedCovariateString.equalsIgnoreCase( "ReadGroupCovariate" )) || + (covNumber == 2 && !requestedCovariateString.equalsIgnoreCase( "QualityScoreCovariate" )) ) { + throw new StingException("ReadGroupCovariate and QualityScoreCovariate are required covariates for the recalibration calculation and must begin the list" ); + } + covNumber++; + try { + // Now that we've found a matching class, try to instantiate it + Covariate covariate = (Covariate)covClass.newInstance(); + estimatedCapacity *= covariate.estimatedNumberOfBins(); + // Some covariates need parameters (user supplied command line arguments) passed to them + if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); } + requestedCovariates.add( covariate ); + } catch ( InstantiationException e ) { + throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); + } catch ( IllegalAccessException e ) { + throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) ); + } + } + } - /** - * Walk over each read in the locus pileup and update the covariate counts based on these bases and their - * matching (or not) with ref. dbSNP aware, so avoids sites that are known as SNPs in DBSNP. - * - * @param tracker - * @param ref - * @param context - * @return - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null)); - -// long testpos = 10410913 ; -// if ( ref.getLocus().getStart() == testpos ) { -// System.out.println(rods.size()+" rods:"); -// for ( ReferenceOrderedDatum d : rods.getRecords() ) System.out.println((((rodDbSNP)d).isSNP()?"SNP":"non-SNP")+" at " + d.getLocation()); -// System.exit(1); -// } - - if ( dbsnp == null ) { - // We aren't at a dbSNP position that's a SNP, so update the read - - // if ( ref.getLocus().getStart() == testpos) System.out.println("NOT A SNP INDEED"); - List reads = context.getReads(); - List offsets = context.getOffsets(); - for (int i =0; i < reads.size(); i++ ) { - SAMRecord read = reads.get(i); - - if ( read.getReadLength() > buggyMaxReadLen ) { - throw new RuntimeException("Expectedly long read, please increase maxium read len with maxReadLen parameter: " + read.format()); - } - - SAMReadGroupRecord readGroup = read.getReadGroup(); - - if ( readGroup == null ) { - throw new RuntimeException("No read group annotation found for read " + read.format()); - } - - if ((read.getMappingQuality() >= MIN_MAPPING_QUALITY && isSupportedReadGroup(readGroup) )) { - int offset = offsets.get(i); - if ( offset > 0 && offset < (read.getReadLength() - 1) ) { // skip first and last bases because they suck and they don't have a dinuc count - counted_bases += covariateCounter.updateDataFromRead(readGroup.getReadGroupId(), read, offset, ref.getBase(), useOriginalQuals); + if( !foundClass ) { + throw new StingException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." ); } } } - counted_sites += 1; - updateMismatchCounts(novel_counts, context, ref.getBase()); - } else { - // if ( ref.getLocus().getStart() == testpos) System.out.println("TREATED AS A SNP"); - // out.println(ref.getLocus()+" SNP at "+dbsnp.getLocation() ); - updateMismatchCounts(dbSNP_counts, context, ref.getBase()); - skipped_sites += 1; + } else { // No covariates were specified by the user so add the default, required ones + Utils.warnUser( "Using default set of covariates because none were specified. Using ReadGroupCovariate and QualityScoreCovariate only." ); + requestedCovariates.add( new ReadGroupCovariate() ); + requestedCovariates.add( new QualityScoreCovariate() ); + estimatedCapacity = 300 * 40; } - if ( ++lociSinceLastDbsnpCheck == DBSNP_VALIDATION_CHECK_FREQUENCY ) { - lociSinceLastDbsnpCheck = 0; - validateDbsnpMismatchRate(); - } + logger.info( "The covariates being used here: " ); + logger.info( requestedCovariates ); - return 1; + if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception + dataManager = new RecalDataManager( estimatedCapacity ); + readDatumHashMap = new IdentityHashMap(); } - /** - * Update the mismatch / total_base counts for a given class of loci. - * - * @param counts - * @param context - * @return - */ - private static void updateMismatchCounts(Pair counts, AlignmentContext context, char ref) { - List reads = context.getReads(); - List offsets = context.getOffsets(); - for (int i =0; i < reads.size(); i++ ) { - char readChar = reads.get(i).getReadString().charAt(offsets.get(i)); - int readCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(readChar); - int refCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref); - - if ( readCharBaseIndex != -1 && refCharBaseIndex != -1 ) { - if ( readCharBaseIndex != refCharBaseIndex ) - counts.first++; - counts.second++; - } - } - } - - /** - * Validate the dbSNP mismatch rates. - * - * @return - */ - private void validateDbsnpMismatchRate() { - if ( novel_counts.second == 0 || dbSNP_counts.second == 0 ) - return; - - double fractionMM_novel = (double)novel_counts.first / (double)novel_counts.second; - double fractionMM_dbsnp = (double)dbSNP_counts.first / (double)dbSNP_counts.second; - //System.out.println(String.format("dbSNP rate = %.2f, novel rate = %.2f", fractionMM_dbsnp, fractionMM_novel)); - - if ( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) - Utils.warnUser("The variation rate of dbSNP sites seems suspicious! Please double-check that the correct DBSNP ROD is being used."); - } - - /** - * Check to see whether this read group should be processed. Returns true if the - * read group is in the list of platforms to process or the platform == *, indicating - * that all platforms should be processed. - * - * @param readGroup - * @return - */ - private boolean isSupportedReadGroup( SAMReadGroupRecord readGroup ) { - for( String platform: platforms ) { - platform = platform.trim(); - if( platform.equals("*") || readGroup == null || - readGroup.getAttribute("PL") == null || - readGroup.getAttribute("PL").toString().equalsIgnoreCase(platform) ) - return true; - } - - return false; - } - - // -------------------------------------------------------------------------------------------------------------- + //--------------------------------------------------------------------------------------------------------------- // - // Reduce + // map // - // -------------------------------------------------------------------------------------------------------------- + //--------------------------------------------------------------------------------------------------------------- /** - * Provide an initial value for reduce computations. - * @return Initial value of reduce. + * For each read at this locus get the various covariate values and increment that location in the map based on + * whether or not the base matches the reference at this particular location + * @param tracker The reference metadata tracker + * @param ref The reference context + * @param context The alignment context + * @return Returns 1, but this value isn't used in the reduce step */ - public PrintStream reduceInit() { - try { - if ( OUTPUT_FILEROOT != null ) - throw new RuntimeException("OUTPUT_FILEROOT argument has been removed, please use --params from now on to directly specify the output parameter filename"); - return new PrintStream( params ); - } catch ( FileNotFoundException e ) { - throw new RuntimeException("Couldn't open output file", e); - } - } + public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - public void onTraversalDone(PrintStream recalTableStream) { - out.printf("Writing raw recalibration data..."); - writeRecalTable(recalTableStream); - out.printf("...done%n"); - - //out.printf("Writing logistic recalibration data%n"); - //writeLogisticRecalibrationTable(); - //out.printf("...done%n"); - - recalTableStream.close(); - - } - - /** - * Prints some basic information about the CountCovariates run to the output stream out - * @param out - */ - private void printInfo(PrintStream out) { - //out.printf("# date \"%s\"%n", new Date()); - out.printf("# collapsed_pos %b%n", collapsePos); - out.printf("# collapsed_dinuc %b%n", collapseDinuc); - out.printf("# counted_sites %d%n", counted_sites); - out.printf("# counted_bases %d%n", counted_bases); - out.printf("# skipped_sites %d%n", skipped_sites); - out.printf("# fraction_skipped 1 / %.0f bp%n", (double)counted_sites / skipped_sites); - } - - /** - * Writes out the key recalibration data collected from the reads. Dumps this recalibration data - * as a CVS string to the recalTableOut PrintStream. Emits the data for all read groups into this file. - */ - private void writeRecalTable(PrintStream recalTableStream) { - printInfo(recalTableStream); - - recalTableStream.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp"); - for (String readGroup : new TreeSet(covariateCounter.getReadGroups()) ) { - for ( RecalData datum: RecalData.sort(covariateCounter.getRecalData(readGroup)) ) { - if ( datum.N > 0 ) { - recalTableStream.println(datum.toCSVString(collapsePos)); + // pull out anything passed by -B name,type,file that has the name "dbsnp" + final RODRecordList dbsnpRODs = tracker.getTrackData( "dbsnp", null ); + boolean isSNP = false; + if (dbsnpRODs != null) { + for( ReferenceOrderedDatum rod : dbsnpRODs ) { + if( ((Variation)rod).isSNP() ) { + isSNP = true; // at least one of the rods says this is a snp site + break; } } } + + // Only use data from non-dbsnp sites + // Assume every mismatch at a non-dbsnp site is indicitive of poor quality + if( !isSNP ) { + final List reads = context.getReads(); + final List offsets = context.getOffsets(); + SAMRecord read; + int offset; + String readGroupId; + byte[] quals; + byte[] bases; + byte refBase; + byte prevBase; + String platform; + byte[] colorSpaceQuals; + ReadHashDatum readDatum; + boolean isNegStrand; + int mappingQuality; + int length; + + final int numReads = reads.size(); + // For each read at this locus + for( int iii = 0; iii < numReads; iii++ ) { + read = reads.get(iii); + offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct + + readDatum = readDatumHashMap.get( read ); + if( readDatum == null ) { + + // If the HashMap of read objects has grown too large then throw out the (mostly stale) reads + if( sizeOfReadDatumHashMap > 100000 ) { //BUGBUG: Can I make this number larger? + readDatumHashMap.clear(); + sizeOfReadDatumHashMap = 0; + } + + // This read isn't in the hashMap yet so fill out the datum and add it to the map so that we never have to do the work again + quals = read.getBaseQualities(); + // Check if we need to use the original quality scores instead + if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { + Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); + if ( obj instanceof String ) + quals = QualityUtils.fastqToPhred((String)obj); + else { + throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); + } + } + bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'. Is this true? + isNegStrand = read.getReadNegativeStrandFlag(); + final SAMReadGroupRecord readGroup = read.getReadGroup(); + readGroupId = readGroup.getReadGroupId(); + platform = readGroup.getPlatform(); + mappingQuality = read.getMappingQuality(); + length = bases.length; + if( USE_SLX_PLATFORM ) { + platform = "ILLUMINA"; + } + + readDatum = new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length ); + readDatumHashMap.put( read, readDatum ); + sizeOfReadDatumHashMap++; + } + + if( readDatum.mappingQuality > 0 ) { // BUGBUG: turn this into a read filter after passing the old integration tests + + // skip first and last base because there is no dinuc + // BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests. + if( offset > 0 ) { + if( offset < readDatum.length - 1 ) { + // skip if base quality is zero + if( readDatum.quals[offset] > 0 ) { + + refBase = (byte)ref.getBase(); + prevBase = readDatum.bases[offset-1]; + + // Get the complement base strand if we are a negative strand read + if( readDatum.isNegStrand ) { + prevBase = readDatum.bases[offset+1]; + } + + // skip if this base or the previous one was an 'N' or etc. + if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(readDatum.bases[offset]) ) ) { + + // SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them + colorSpaceQuals = null; + if( readDatum.platform.equalsIgnoreCase("SOLID") ) { + colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG)); + } + if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet + { + // This base finally passed all the checks, so add it to the big hashmap + updateDataFromRead( readDatum, offset, refBase ); + } + } else { + if( VALIDATE_OLD_RECALIBRATOR ) { + countedBases++; // replicating a small bug in the old recalibrator + } + } + } + } else { // at the last base in the read so we can remove it from our IdentityHashMap + readDatumHashMap.remove( read ); + sizeOfReadDatumHashMap--; + } + } + } + } + countedSites++; + + } else { // We skipped over the dbSNP site + skippedSites++; + } + + return 1; // This value isn't actually used anywhere } /** - * Doesn't do anything - * - * @param empty - * @param recalTableStream - * @return + * Major workhorse routine for this walker. + * Loop through the list of requested covariates and pick out the value from the read, offset, and reference + * Using the list of covariate values as a key, pick out the RecalDatum and increment, + * adding one to the number of observations and potentially one to the number of mismatches + * Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls + * because pulling things out of the SAMRecord is an expensive operation. + * @param readDatum The ReadHashDatum holding all the important properties of this read + * @param offset The offset in the read for this locus + * @param refBase The reference base at this locus */ - public PrintStream reduce(Integer empty, PrintStream recalTableStream) { - return recalTableStream; + private void updateDataFromRead(final ReadHashDatum readDatum, final int offset, final byte refBase) { + + List key = new ArrayList(); + + // Loop through the list of requested covariates and pick out the value from the read, offset, and reference + for( Covariate covariate : requestedCovariates ) { + key.add( covariate.getValue( readDatum, offset ) ); + } + + // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap + RecalDatum datum = dataManager.data.get( key ); + if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it + datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method + if( VALIDATE_OLD_RECALIBRATOR ) { + dataManager.data.myPut( key, datum ); + } else { + dataManager.data.put( key, datum ); + } + } + + // Need the bases to determine whether or not we have a mismatch + byte base = readDatum.bases[offset]; + + // Add one to the number of observations and potentially one to the number of mismatches + datum.increment( (char)base, (char)refBase ); // dangerous: if you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad + countedBases++; + } + + + //--------------------------------------------------------------------------------------------------------------- + // + // reduce + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * Initialize the reudce step by creating a PrintStream from the filename specified as an argument to the walker. + * @return returns A PrintStream created from the -rf filename + */ + public PrintStream reduceInit() { + try { + return new PrintStream( RECAL_FILE ); + } catch ( FileNotFoundException e ) { + throw new RuntimeException( "Couldn't open output file: ", e ); + } + } + + /** + * The Reduce method doesn't do anything for this walker. + * @param value Result of the map. This value is immediately ignored. + * @param recalTableStream The PrintStream used to output the CSV data + * @return returns The PrintStream used to output the CSV data + */ + public PrintStream reduce( Integer value, PrintStream recalTableStream ) { + return recalTableStream; // nothing to do here + } + + /** + * Write out the full data hashmap to disk in CSV format + * @param recalTableStream The PrintStream to write out to + */ + public void onTraversalDone( PrintStream recalTableStream ) { + logger.info( "Writing raw recalibration data..." ); + outputToCSV( recalTableStream ); + logger.info( "...done!" ); + + recalTableStream.close(); + } + + /** + * For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format + * @param recalTableStream The PrintStream to write out to + */ + private void outputToCSV( final PrintStream recalTableStream ) { + + if( VALIDATE_OLD_RECALIBRATOR ) { + recalTableStream.printf("# collapsed_pos false%n"); + recalTableStream.printf("# collapsed_dinuc false%n"); + recalTableStream.printf("# counted_sites %d%n", countedSites); + recalTableStream.printf("# counted_bases %d%n", countedBases); + recalTableStream.printf("# skipped_sites %d%n", skippedSites); + recalTableStream.printf("# fraction_skipped 1 / %.0f bp%n", (double)countedSites / skippedSites); + recalTableStream.printf("rg,pos,Qrep,dn,nBases,nMismatches,Qemp%n"); + // For each entry in the data hashmap + for( Pair,RecalDatum> entry : dataManager.data.entrySetSorted4() ) { + // For each Covariate in the key + for( Comparable comp : entry.first ) { + // Output the Covariate's value + recalTableStream.print( comp + "," ); + } + // Output the RecalDatum entry + recalTableStream.println( entry.second.outputToCSV() ); + } + } else { + if( !NO_PRINT_HEADER ) { + recalTableStream.printf("# Counted Sites %d%n", countedSites); + recalTableStream.printf("# Counted Bases %d%n", countedBases); + recalTableStream.printf("# Skipped Sites %d%n", skippedSites); + recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites); + for( Covariate cov : requestedCovariates ) { + // The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name + recalTableStream.println( "@!" + cov.getClass().getSimpleName() ); + } + } + // For each entry in the data hashmap + for( Map.Entry, RecalDatum> entry : dataManager.data.entrySet() ) { + // For each Covariate in the key + for( Comparable comp : entry.getKey() ) { + // Output the Covariate's value + if( NO_PRINT_HEADER && comp instanceof String ) { continue; } // BUGBUG + recalTableStream.print( comp + "," ); + } + // Output the RecalDatum entry + recalTableStream.println( entry.getValue().outputToCSV() ); + } + } } } + diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java similarity index 98% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java rename to java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index 0438b8bd0..1e9f47e49 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; +package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.Utils; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Dinuc.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java similarity index 94% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Dinuc.java rename to java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java index 7b07a0ec2..ace61c6ff 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Dinuc.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; +package org.broadinstitute.sting.gatk.walkers.recalibration; /** * Created by IntelliJ IDEA. diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java similarity index 97% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java rename to java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java index acb6a4128..d9f3c5557 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; +package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java similarity index 96% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java rename to java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java index 1e0d0a9ff..035709262 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; +package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java old mode 100644 new mode 100755 similarity index 97% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java rename to java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java index ca44b2609..13bf5750c --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; +package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/NHashMap.java similarity index 98% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java rename to java/src/org/broadinstitute/sting/gatk/walkers/recalibration/NHashMap.java index 2016c95ca..5580c8ab8 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/NHashMap.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; +package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.StingException; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/PositionCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java similarity index 97% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/PositionCovariate.java rename to java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java index 72002947a..451770af5 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/PositionCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; +package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.Utils; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/PrimerRoundCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java similarity index 97% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/PrimerRoundCovariate.java rename to java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java index 8822415c9..a1e033e03 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/PrimerRoundCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; +package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broadinstitute.sting.utils.StingException; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java old mode 100644 new mode 100755 similarity index 96% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java rename to java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java index 699395b45..894688885 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; +package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadGroupCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java similarity index 96% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadGroupCovariate.java rename to java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java index b22520e37..15cec3c1e 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadGroupCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; +package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadHashDatum.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadHashDatum.java similarity index 90% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadHashDatum.java rename to java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadHashDatum.java index d2a2fde6c..308b09b99 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadHashDatum.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadHashDatum.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; +package org.broadinstitute.sting.gatk.walkers.recalibration; /** * Created by IntelliJ IDEA. diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalData.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalData.java deleted file mode 100755 index dabf68b83..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalData.java +++ /dev/null @@ -1,230 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.recalibration; - -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.BaseUtils; - -import java.util.List; -import java.util.Collections; - -public class RecalData implements Comparable { - // - // context of this data -- read group, position in read, reported quality, and dinuc - // - String readGroup; // the read group where the datum was observed - int pos; // the position of this recalibration datum - int qual; // the reported quality scoree of this datum - String dinuc; // the dinucleotide context of this datum - - // - // empirical quality score data - // - long N; // total number of observed bases - long B; // number of observed mismatches to the reference base - - - public RecalData(int pos, int qual, String readGroup, String dinuc ) { - this.pos = pos; - this.qual = qual; - this.readGroup = readGroup; - this.dinuc = dinuc; - } - - public RecalData( RecalData copy ) { - this.pos = copy.pos; - this.qual = copy.qual; - this.readGroup = copy.readGroup; - this.dinuc = copy.dinuc; - this.N = copy.N; - this.B = copy.B; - } - - public int compareTo(RecalData to) { - int cmpReadGroup = readGroup.compareTo(to.readGroup); - if (cmpReadGroup != 0) return cmpReadGroup; - - int cmpPos = new Integer(pos).compareTo(to.pos); - if (cmpPos != 0) return cmpPos; - - int cmpQual = new Integer(qual).compareTo(to.qual); - if (cmpQual != 0) return cmpQual; - - return dinuc.compareTo(to.dinuc); - } - - /** - * Returns the expected number of mismatching bases for this datum, using the number of bases N and - * the reported qual score. Effectively qual implied error rate * number of bases. - * - * @return - */ - public double getExpectedErrors() { - return QualityUtils.qualToErrorProb((byte)qual) * N; - } - - // - // Increment routines - // - public RecalData inc(long incN, long incB) { - N += incN; - B += incB; - return this; - } - - /** - * Add N and B from other to self - * - * @param other - * @return - */ - public RecalData inc(final RecalData other) { - return inc(other.N, other.B); - } - - /** - * Add all of the Ns and Bs for data to self - * - * @param data - * @return - */ - public RecalData inc(List data) { - for ( RecalData other : data ) { - this.inc(other); - } - return this; - } - - /** - * Count an empirical observation of our current base against the reference base. - * Increments N and, if curBase != ref, also increments B - * - * @param curBase - * @param ref - * @return - */ - public RecalData inc(char curBase, char ref) { - return inc(1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1); - //out.printf("%s %s\n", curBase, ref); - } - - - /** - * Get the integer dinuc index associated with this datum. Can return -1 if dinuc - * contains unexpected bases. Returns 0 if dinuc contains the * operator (not tracking dinucs) - * @return - */ - public int getDinucIndex() { - return dinucIndex(this.dinuc); - } - - /** - * Calculates the empirical quality score for this datum from B and N. Bounds the result - * by QualityUtils.MAX_REASONABLE_Q_SCORE. - * - * @return - */ - public double empiricalQualDouble(final boolean useRawQempirical) { - double doubleB = useRawQempirical ? B : B + 1; - double doubleN = useRawQempirical ? N : N + 1; - double empiricalQual = -10 * Math.log10(doubleB / doubleN); - if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; - return empiricalQual; - } - public double empiricalQualDouble() { return empiricalQualDouble(true); } - - - /** - * As as empiricalQualDouble, but rounded and encoded as a byte for placement in a read quality array of bytes - * @return - */ - public byte empiricalQualByte(final boolean useRawQempirical) { - double doubleB = useRawQempirical ? B : B + 1; - double doubleN = useRawQempirical ? N : N + 1; - return QualityUtils.probToQual(1.0 - doubleB / doubleN); - } - public byte empiricalQualByte() { return empiricalQualByte(true); } - - - public static String headerString() { - return ("pos, rg, dinuc, qual, emp_qual, qual_diff, n, b"); - } - - public String toString() { - return String.format("[rg=%s, pos=%3d, Qrep=%3d, dinuc=%s, Qemp=%2.2f / %2.2f, B=%d, N=%d]", readGroup, pos, qual, dinuc, empiricalQualDouble(), empiricalQualDouble(false), B, N); - //return String.format("%3d,%s,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, readGroup, dinuc, qual, empiricalQual, qual-empiricalQual, N, B); - } - - // - // To and from CSV strings - // - public String toCSVString(boolean collapsedPos) { - // rg,pos,Qrep,dn,nBases,nMismatches,Qemp - return String.format("%s,%s,%d,%s,%d,%d,%d", readGroup, collapsedPos ? "*" : pos, qual, dinuc, N, B, empiricalQualByte()); - } - - /** - * Parses the string s and returns the encoded RecalData in it. Assumes data has been emitted by - * toCSVString() routine. Order is readGroup, dinuc, qual, pos, N, B, Qemp as byte - * @param s - * @return - */ - public static RecalData fromCSVString(String s) throws NumberFormatException { - String[] vals = s.split(","); - String rg = vals[0]; - int pos = vals[1].equals("*") ? 0 : Integer.parseInt(vals[1]); - int qual = Integer.parseInt(vals[2]); - String dinuc = vals[3]; - int N = Integer.parseInt(vals[4]); - int B = Integer.parseInt(vals[5]); - RecalData datum = new RecalData(pos, qual, rg, dinuc); - datum.B = B; - datum.N = N; - - // Checking for badness - if ( pos < 0 ) throw new NumberFormatException("Illegal position detected: " + pos); - if ( B < 0 ) throw new NumberFormatException("Illegal mismatch count detected: " + B); - if ( N < 0 ) throw new NumberFormatException("Illegal base count detected: " + N); - if ( qual < 0 || qual > QualityUtils.MAX_QUAL_SCORE ) throw new NumberFormatException("Illegal qual detected: " + qual); - - return datum; - } - - - public static List sort(List l) { - Collections.sort(l); - return l; - } - - public static int bases2dinucIndex(char prevBase, char base) { - int pbI = BaseUtils.simpleBaseToBaseIndex(prevBase); - int bI = BaseUtils.simpleBaseToBaseIndex(base); - return (pbI == -1 || bI == -1) ? -1 : pbI * 4 + bI; - } - - public final static int NDINUCS = 16; - public static String dinucIndex2bases(int index) { - char data[] = {BaseUtils.baseIndexToSimpleBase(index / 4), BaseUtils.baseIndexToSimpleBase(index % 4)}; - return new String( data ); - } - - public static int dinucIndex(String s) { - return bases2dinucIndex(s.charAt(0), s.charAt(1)); - } - - public static int dinucIndex(byte prevBase, byte base) { - return bases2dinucIndex((char)prevBase, (char)base); - } - - public static double combinedQreported(List l) { - double sumExpectedErrors = 0; - long nBases = 0; - for ( RecalData datum : l ) { - sumExpectedErrors += datum.getExpectedErrors(); - nBases += datum.N; - } - - double q = QualityUtils.phredScaleErrorRate(sumExpectedErrors / nBases); - //System.out.printf("expected errors=%f, nBases = %d, rate=%f, qual=%f%n", - // sumExpectedErrors, nBases, 1 - sumExpectedErrors / nBases, q); - return q; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java old mode 100755 new mode 100644 index c976ef4c8..5ca812ff9 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -1,195 +1,204 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import org.broadinstitute.sting.utils.ExpandingArrayList; import org.broadinstitute.sting.utils.QualityUtils; import java.util.*; -import net.sf.samtools.SAMRecord; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ /** * Created by IntelliJ IDEA. - * User: mdepristo - * Date: Jun 16, 2009 - * Time: 9:55:10 PM - * To change this template use File | Settings | File Templates. + * User: rpoplin + * Date: Nov 6, 2009 + * + * This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions. */ + public class RecalDataManager { - /** The original quality scores are stored in this attribute */ - public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; + public NHashMap data; // the full dataset + private NHashMap dataCollapsedReadGroup; // table where everything except read group has been collapsed + private NHashMap dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed + private ArrayList> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed + public NHashMap dataSumExpectedErrors; // table used to calculate the overall aggregate quality score in which everything except read group is collapsed - ArrayList flattenData = new ArrayList(); - boolean trackPos, trackDinuc; - String readGroup; - int nDinucs, maxReadLen; + private NHashMap dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed + private NHashMap dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed + private ArrayList> dataCollapsedByCovariateDouble; // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed - //RecalData[][][] data = null; - ExpandingArrayList>> data = null; - public RecalDataManager(String readGroup, - //int maxReadLen, int maxQual, int nDinucs, - boolean trackPos, boolean trackDinuc) { - data = new ExpandingArrayList>>(); - //data = new RecalData[maxReadLen+1][maxQual+1][nDinucs]; + public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores + public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams - this.readGroup = readGroup; - this.trackPos = trackPos; - this.trackDinuc = trackDinuc; - //this.maxReadLen = maxReadLen; - //this.nDinucs = nDinucs; + RecalDataManager() { + data = new NHashMap(); } - public int getMaxReadLen() { - return data.size(); - } - - public int getPosIndex(int pos) { - return trackPos ? pos : 0; - } - - public int canonicalPos(int cycle) { - return getPosIndex(cycle); - } - - - public int getDinucIndex(String dinuc) { - return trackDinuc ? RecalData.dinucIndex(dinuc) : 0; - } - - public int getDinucIndex(byte prevBase, byte base) { - return trackDinuc ? RecalData.dinucIndex(prevBase, base) : 0; - } - - public String canonicalDinuc(String dinuc) { - return trackDinuc ? dinuc : "**"; - } - - public void addDatum(RecalData datum) { - if ( ! datum.readGroup.equals(this.readGroup) ) { - throw new RuntimeException(String.format("BUG: adding incorrect read group datum %s to RecalDataManager for %s", datum.readGroup, this.readGroup)); - } - - RecalData prev = getRecalData(datum.pos, datum.qual, datum.dinuc); - if ( prev != null ) { - if ( trackDinuc && trackPos ) - throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.dinuc), datum)); - prev.inc(datum); + RecalDataManager( final int estimatedCapacity, final boolean createCollapsedTables, final int numCovariates ) { + if( createCollapsedTables ) { // initialize all the collapsed tables + dataCollapsedReadGroup = new NHashMap(); + dataCollapsedQualityScore = new NHashMap(); + dataCollapsedByCovariate = new ArrayList>(); + for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate + dataCollapsedByCovariate.add( new NHashMap() ); + } + dataSumExpectedErrors = new NHashMap(); } else { - int posIndex = getPosIndex(datum.pos); - int internalDinucIndex = getDinucIndex(datum.dinuc); - if ( internalDinucIndex == -1 ) return; - - set(posIndex, datum.qual, internalDinucIndex, datum); - flattenData.add(datum); - } + data = new NHashMap( estimatedCapacity, 0.8f); + } } - public RecalData getRecalData(int pos, int qual, String dinuc) { - return expandingGetRecalData(pos, qual, dinuc, false); - } - - public RecalData expandingGetRecalData(int pos, int qual, String dinuc, boolean expandP) { - int posIndex = getPosIndex(pos); - int internalDinucIndex = getDinucIndex(dinuc); - - if ( internalDinucIndex == -1 ) return null; - - RecalData datum = get(posIndex, qual, internalDinucIndex); - if ( datum == null && expandP ) { - //System.out.printf("Allocating %s %d %d %d%n", readGroup, pos, qual, dinuc_index); - datum = new RecalData(posIndex, qual, readGroup, trackDinuc ? dinuc : "**"); - set(posIndex, qual, internalDinucIndex, datum); - flattenData.add(datum); - } - - return datum; + RecalDataManager( final int estimatedCapacity ) { + data = new NHashMap( estimatedCapacity, 0.8f ); // second arg is the 'loading factor', + // a number to monkey around with when optimizing performace of the HashMap } /** - * Returns the RecalData associated with pos, qual, dinuc, or null if not is bound. - * Does not expand the system to corporate a new data point - * @param posIndex - * @param qual - * @param dinucIndex - * @return + * Add the given mapping to all of the collapsed hash tables + * @param key The list of comparables that is the key for this mapping + * @param fullDatum The RecalDatum which is the data for this mapping */ - private RecalData get(int posIndex, int qual, int dinucIndex) { - ExpandingArrayList> d2 = data.get(posIndex); - if (d2 == null) return null; - ExpandingArrayList d3 = d2.get(qual); - return d3 == null ? null : d3.get(dinucIndex); - } + public final void addToAllTables( final List key, final RecalDatum fullDatum ) { - private void set(int posIndex, int qual, int dinucIndex, RecalData datum) { - // grow data if necessary - ExpandingArrayList> d2 = data.get(posIndex); - if (d2 == null) { - d2 = new ExpandingArrayList>(); - data.set(posIndex, d2); + // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around + //data.put(key, thisDatum); // add the mapping to the main table + + // create dataCollapsedReadGroup, the table where everything except read group has been collapsed + ArrayList newKey = new ArrayList(); + newKey.add( key.get(0) ); // make a new key with just the read group + RecalDatum collapsedDatum = dataCollapsedReadGroup.get( newKey ); + if( collapsedDatum == null ) { + dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) ); + } else { + collapsedDatum.increment(fullDatum); } - // Expand d2 if necessary - ExpandingArrayList d3 = d2.get(qual); - if ( d3 == null ) { - d3 = new ExpandingArrayList(); - d2.set(qual, d3); + // create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed + newKey = new ArrayList(); + newKey.add( key.get(0) ); // make a new key with just the read group + Double sumExpectedErrors = dataSumExpectedErrors.get( newKey ); + if( sumExpectedErrors == null ) { + dataSumExpectedErrors.put( newKey, 0.0 ); + } else { + dataSumExpectedErrors.remove( newKey ); + sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * fullDatum.getNumObservations(); + dataSumExpectedErrors.put( newKey, sumExpectedErrors ); } - // set d3 to datum - d3.set(dinucIndex, datum); - } + newKey = new ArrayList(); + // create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed + newKey.add( key.get(0) ); // make a new key with the read group ... + newKey.add( key.get(1) ); // and quality score + collapsedDatum = dataCollapsedQualityScore.get( newKey ); + if( collapsedDatum == null ) { + dataCollapsedQualityScore.put( newKey, new RecalDatum(fullDatum) ); + } else { + collapsedDatum.increment(fullDatum); + } - private RecalData getMatchingDatum(List l, RecalData datum, - boolean combinePos, boolean combineQual, boolean combineDinuc) { - for ( RecalData find : l ) { - if ( (combineQual || find.qual == datum.qual) && - (combinePos || find.pos == datum.pos ) && - (combineDinuc || find.dinuc.equals(datum.dinuc)) ) { - //System.out.printf("Found %s for %s%n", find, datum); - return find; + // create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed + for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { // readGroup and QualityScore aren't counted + newKey = new ArrayList(); + newKey.add( key.get(0) ); // make a new key with the read group ... + newKey.add( key.get(1) ); // and quality score ... + newKey.add( key.get(iii + 2) ); // and the given covariate + collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey ); + if( collapsedDatum == null ) { + dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) ); + } else { + collapsedDatum.increment(fullDatum); } } - RecalData d = new RecalData(combinePos ? 0 : datum.pos, combineQual ? 0 : datum.qual, datum.readGroup, combineDinuc ? "**" : datum.dinuc ); - //System.out.printf("Making new match %s%n", d); - l.add(d); - return d; } - public List combineDinucs() { - return combine(false, false, true); - } + /** + * Loop over all the collapsed tables and turn the recalDatums found there into an empricial quality score + * that will be used in the sequential calculation in TableRecalibrationWalker + * @param numCovariates The number of covariates you have determines the number of tables to create + * @param smoothing The smoothing paramter that goes into empirical quality score calculation + */ + public final void generateEmpiricalQualities( final int numCovariates, final int smoothing ) { - public List combineCycles() { - return combine(true, false, false); - } - - public List combine(boolean ignorePos, boolean ignoreQual, boolean ignoreDinuc) { - List l = new ArrayList(); - for ( RecalData datum : flattenData ) { - RecalData reduced = getMatchingDatum(l, datum, ignorePos, ignoreQual, ignoreDinuc); - //System.out.printf("Combining %s with %s%n", datum, reduced); - reduced.inc(datum); + dataCollapsedReadGroupDouble = new NHashMap(); + dataCollapsedQualityScoreDouble = new NHashMap(); + dataCollapsedByCovariateDouble = new ArrayList>(); + for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate + dataCollapsedByCovariateDouble.add( new NHashMap() ); } - return l; - } - public List getAll() { - return flattenData; - } - - // we can process the OQ field if requested - public static byte[] getQualsForRecalibration( SAMRecord read, boolean useOriginalQuals ) { - byte[] quals = read.getBaseQualities(); - if ( useOriginalQuals && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { - Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG); - if ( obj instanceof String ) - quals = QualityUtils.fastqToPhred((String)obj); - else { - throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); + // Hash the empirical quality scores so we don't have to call Math.log at every base for every read + // Looping over the entrySet is really expensive but worth it + for( Map.Entry,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) { + dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); + } + for( Map.Entry,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) { + dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); + } + for( int iii = 0; iii < numCovariates - 2; iii++ ) { + for( Map.Entry,RecalDatum> entry : dataCollapsedByCovariate.get(iii).entrySet() ) { + dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); } } - return quals; + dataCollapsedQualityScore.clear(); + dataCollapsedByCovariate.clear(); + dataCollapsedQualityScore = null; // will never need this again + dataCollapsedByCovariate = null; // will never need this again + if( data!=null ) { + data.clear(); + data = null; // will never need this again + } + } + + /** + * Get the appropriate collapsed table out of the set of all the tables held by this Object + * @param covariate Which covariate indexes the desired collapsed HashMap + * @return The desired collapsed HashMap + */ + public final NHashMap getCollapsedTable( final int covariate ) { + if( covariate == 0) { + return dataCollapsedReadGroup; // table where everything except read group has been collapsed + } else if( covariate == 1 ) { + return dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed + } else { + return dataCollapsedByCovariate.get( covariate - 2 ); // table where everything except read group, quality score, and given covariate has been collapsed + } + } + + /** + * Get the appropriate collapsed table of emprical quality out of the set of all the tables held by this Object + * @param covariate Which covariate indexes the desired collapsed NHashMap + * @return The desired collapsed NHashMap + */ + public final NHashMap getCollapsedDoubleTable( final int covariate ) { + if( covariate == 0) { + return dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed + } else if( covariate == 1 ) { + return dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed + } else { + return dataCollapsedByCovariateDouble.get( covariate - 2 ); // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed + } } } - diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java similarity index 98% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java rename to java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java index da07c331e..0a73a6a48 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; +package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.QualityUtils; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java old mode 100755 new mode 100644 index 1d7255669..528440ad4 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -1,3 +1,22 @@ +package org.broadinstitute.sting.gatk.walkers.recalibration; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.WalkerName; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.*; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; +import java.io.File; +import java.io.FileNotFoundException; + /* * Copyright (c) 2009 The Broad Institute * @@ -23,311 +42,408 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.walkers.recalibration; +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 3, 2009 + * + * This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.WalkerName; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.cmdLine.Argument; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; + * For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc) + * Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read. + * This walker then outputs a new bam file with these updated (recalibrated) reads. + * + * Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker. + * Note: This walker is designed to be used in conjunction with CovariateCounterWalker. + */ @WalkerName("TableRecalibration") -@Requires({DataSource.READS}) // , DataSource.REFERENCE}) +@Requires({ DataSource.READS, DataSource.REFERENCE }) // This walker requires -I input.bam, it also requires -R reference.fasta, but by not saying @requires REFERENCE_BASES I'm telling the + // GATK to not actually spend time giving me the refBase array since I don't use it public class TableRecalibrationWalker extends ReadWalker { - @Argument(fullName="params", shortName="params", doc="CountCovariates params file", required=true) - public String paramsFile; - @Argument(fullName="outputBam", shortName="outputBam", doc="output BAM file", required=true) - public SAMFileWriter outputBam = null; + @Argument(fullName="recal_file", shortName="recalFile", doc="Input recalibration table file generated by CountCovariates", required=true) + private String RECAL_FILE = null; + @Argument(fullName="output_bam", shortName="outputBam", doc="output BAM file", required=false) + private SAMFileWriter OUTPUT_BAM = null; + @Argument(fullName="preserve_qscores_less_than", shortName="pQ", + doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false) + private int PRESERVE_QSCORES_LESS_THAN = 5; + @Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false) + private boolean USE_ORIGINAL_QUALS = false; + @Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false) + private int WINDOW_SIZE = 3; + @Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points") + private int SMOOTHING = 1; + @Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. For debugging purposes only.") + private boolean VALIDATE_OLD_RECALIBRATOR = false; + @Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.") + private boolean USE_SLX_PLATFORM = false; - @Argument(shortName="rawQempirical", doc="If provided, we will use raw Qempirical scores calculated from the # mismatches and # bases, rather than the more conservative estimate of # mismatches + 1 / # bases + 1", required=false) - public boolean rawQempirical = false; + //public enum RecalibrationMode { + // COMBINATORIAL, + // SEQUENTIAL, + // ERROR + //} - @Argument(fullName="preserveQScoresLessThan", shortName="pQ", doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false) - public int preserveQScoresLessThan = 5; + //@Argument(fullName="recalibrationMode", shortName="mode", doc="Which calculation to use when recalibrating, default is SEQUENTIAL", required=false) + private String MODE_STRING = "SEQUENTIAL"; + //public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes? - @Argument(fullName = "useOriginalQuals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false) - public boolean useOriginalQuals = false; - - // - // Basic static information - // - private static Logger logger = Logger.getLogger(TableRecalibrationWalker.class); - private static String VERSION = "0.2.5"; - - // maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]] - HashMap cache = new HashMap(); - - public enum RecalibrationMode { - COMBINATORIAL, - JAFFE, - BY_POS_ONLY, - BY_DINUC_ONLY, - SEQUENTIAL, - ERROR - } - - @Argument(shortName="mode", doc="", required=false) - public String modeString = RecalibrationMode.SEQUENTIAL.toString(); - public RecalibrationMode mode = RecalibrationMode.ERROR; + private RecalDataManager dataManager; + private ArrayList requestedCovariates; + private ArrayList fullCovariateKey; // the list that will be used over and over again to hold the full set of covariate values + private ArrayList collapsedTableKey; // the key that will be used over and over again to query the collapsed tables private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); - private static Pattern COLLAPSED_POS_PATTERN = Pattern.compile("^#\\s+collapsed_pos\\s+(\\w+)"); - private static Pattern COLLAPSED_DINUC_PATTERN = Pattern.compile("^#\\s+collapsed_dinuc\\s+(\\w+)"); - private static Pattern HEADER_PATTERN = Pattern.compile("^rg.*"); + private static Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*"); + private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*"); + private final String versionNumber = "2.0.0"; // major version, minor version, and build number + + //--------------------------------------------------------------------------------------------------------------- + // + // initialize + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * Read in the recalibration table input file. + * Parse the list of covariate classes used during CovariateCounterWalker. + * Parse the CSV data and populate the hashmap. + */ public void initialize() { - logger.info("TableRecalibrator version: " + VERSION); - - // - // crappy hack until Enum arg types are supported - // - for ( RecalibrationMode potential : RecalibrationMode.values() ) { - if ( potential.toString().equals(modeString)) { - mode = potential; - break; - } - } - if ( mode == RecalibrationMode.ERROR ) - throw new RuntimeException("Unknown mode requested: " + modeString); + logger.info( "TableRecalibrationWalker version: " + versionNumber ); + + // Get a list of all available covariates + List> classes = PackageUtils.getClassesImplementingInterface(Covariate.class); - // - // Walk over the data file, parsing it into recalibration data - // int lineNumber = 0; - try { - logger.info(String.format("Reading data...")); - List data = new ArrayList(); - boolean collapsedPos = false; - boolean collapsedDinuc = false; + boolean foundAllCovariates = false; + int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one - //List lines = new xReadLines(new File(paramsFile)).readLines(); - for ( String line : new xReadLines(new File(paramsFile)) ) { + // Warn the user if a dbSNP file was specified since it isn't being used here + boolean foundDBSNP = false; + for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) { + if( rod.getName().equalsIgnoreCase( "dbsnp" ) ) { + foundDBSNP = true; + } + } + if( foundDBSNP ) { + Utils.warnUser("A dbSNP rod file was specified but TableRecalibrationWalker doesn't make use of it."); + } + + fullCovariateKey = new ArrayList(); + collapsedTableKey = new ArrayList(); + + // Read in the covariates that were used from the input file + requestedCovariates = new ArrayList(); + + // Read in the data from the csv file and populate the map + logger.info( "Reading in the data from input file..." ); + + try { + for ( String line : new xReadLines(new File( RECAL_FILE )) ) { lineNumber++; - if ( HEADER_PATTERN.matcher(line).matches() ) - continue; - if ( COMMENT_PATTERN.matcher(line).matches() ) { - collapsedPos = parseCommentLine(COLLAPSED_POS_PATTERN, line, collapsedPos); - collapsedDinuc = parseCommentLine(COLLAPSED_DINUC_PATTERN, line, collapsedDinuc); + if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) { + ; // skip over the comment lines, (which start with '#') } - else { - data.add(RecalData.fromCSVString(line)); + else if( COVARIATE_PATTERN.matcher(line).matches() ) { // the line string is either specifying a covariate or is giving csv data + if( foundAllCovariates ) { + throw new StingException( "Malformed input recalibration file. Found covariate names intermingled with data. " + RECAL_FILE ); + } else { // found another covariate in input file + boolean foundClass = false; + for( Class covClass : classes ) { + + if( line.equalsIgnoreCase( "@!" + covClass.getSimpleName() ) ) { // the "@!" was added by CovariateCounterWalker as a code to recognize covariate class names + foundClass = true; + try { + Covariate covariate = (Covariate)covClass.newInstance(); + // some covariates need parameters (user supplied command line arguments) passed to them + if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); } + requestedCovariates.add( covariate ); + estimatedCapacity *= covariate.estimatedNumberOfBins(); + + } catch ( InstantiationException e ) { + throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); + } catch ( IllegalAccessException e ) { + throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) ); + } + } + } + + if( !foundClass ) { + throw new StingException( "Malformed input recalibration file. The requested covariate type (" + line + ") isn't a valid covariate option." ); + } + + } + + } else { // found some data + if( !foundAllCovariates ) { + if( VALIDATE_OLD_RECALIBRATOR ) { + requestedCovariates.add( new ReadGroupCovariate() ); + requestedCovariates.add( new QualityScoreCovariate() ); + requestedCovariates.add( new CycleCovariate() ); + requestedCovariates.add( new DinucCovariate() ); + } + foundAllCovariates = true; + if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception + final boolean createCollapsedTables = true; + // Initialize the data hashMaps + dataManager = new RecalDataManager( estimatedCapacity, createCollapsedTables, requestedCovariates.size() ); + + } + addCSVData(line); // parse the line and add the data to the HashMap } } - initializeCache(data, collapsedPos, collapsedDinuc); + } catch ( FileNotFoundException e ) { - Utils.scareUser("Cannot read/find parameters file " + paramsFile); + Utils.scareUser("Can not find input file: " + RECAL_FILE); } catch ( NumberFormatException e ) { - throw new RuntimeException("Error parsing recalibration data at line " + lineNumber, e); + throw new StingException("Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker."); } - } + logger.info( "...done!" ); - private boolean parseCommentLine(Pattern pat, String line, boolean flag) { - Matcher m = pat.matcher(line); - if ( m.matches() ) { - flag = Boolean.parseBoolean(m.group(1)); - } + logger.info( "The covariates being used here: " ); + logger.info( requestedCovariates ); - return flag; - } - - private void initializeCache(List data, boolean collapsedPos, boolean collapsedDinuc ) { - Set readGroups = new HashSet(); - Set dinucs = new HashSet(); - int maxPos = -1; - int maxQReported = -1; - - logger.info(String.format("No. params : %d", data.size())); - // get the maximum data from the file - for ( RecalData datum : data ) { - readGroups.add(datum.readGroup); - dinucs.add(datum.dinuc); - maxPos = Math.max(maxPos, datum.pos); - maxQReported = Math.max(maxQReported, datum.qual); - } - logger.info(String.format("Read groups : %d %s", readGroups.size(), readGroups.toString())); - logger.info(String.format("Dinucs : %d %s", dinucs.size(), dinucs.toString())); - logger.info(String.format("Max pos : %d", maxPos)); - logger.info(String.format("Max Q reported : %d", maxQReported)); - - // check the mode - switch ( mode ) { - case JAFFE: - collapsedPos = collapsedDinuc = true; - break; - case BY_POS_ONLY: - if ( collapsedPos ) - throw new RuntimeException(String.format("Cannot perform position_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc)); - collapsedPos = true; - throw new RuntimeException("Unsupported mode requested, sorry"); - //break; - case BY_DINUC_ONLY: - if ( collapsedDinuc ) - throw new RuntimeException(String.format("Cannot perform dinuc_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc)); - collapsedDinuc = true; - throw new RuntimeException("Unsupported mode requested, sorry"); - //break; - case COMBINATORIAL: - if ( collapsedPos || collapsedDinuc ) - throw new RuntimeException(String.format("Cannot perform combinatorial recalibration -- data is already collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc)); - case SEQUENTIAL: - if ( collapsedPos || collapsedDinuc ) - throw new RuntimeException(String.format("Cannot perform sequential recalibration -- data is already collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc)); - } - - // - // initialize the data structures - // - HashMap managers = new HashMap(); - for ( String readGroup : readGroups ) { - // create a manager for each read group - RecalDataManager manager = new RecalDataManager(readGroup, ! collapsedPos, ! collapsedDinuc); - managers.put(readGroup, manager); - } - - // add the data to their appropriate readGroup managers - for ( RecalData datum : data ) { - managers.get(datum.readGroup).addDatum(datum); - } - - // fill in the table with mapping objects - for ( String readGroup : readGroups ) { - RecalDataManager manager = managers.get(readGroup); - RecalMapping mapper = initializeMapper(manager, rawQempirical, dinucs, maxPos, maxQReported); - logger.info(String.format("Creating mapper for %s of class %s", readGroup, mapper)); - cache.put(readGroup, mapper); + // Create the collapsed tables that are used in the sequential calculation + if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) { + logger.info( "Generating tables of empirical qualities for use in sequential calculation..." ); + dataManager.generateEmpiricalQualities( requestedCovariates.size(), SMOOTHING ); + logger.info( "...done!" ); } } /** - * Returns a new RecalMapping object appropriate for the requested mode in this.mode for the RecalData - * in manager. Uses the dinucs set dinucs, maxPos, and maxQReported to build efficient lookup structures - * for mapping from old qual and features -> new qual - * - * @param manager - * @param dinucs - * @param maxPos - * @param maxQReported - * @return + * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches) + * @param line A line of CSV data read from the recalibration table data file */ - private RecalMapping initializeMapper( RecalDataManager manager, final boolean rawQempirical, - Set dinucs, int maxPos, int maxQReported ) { - switch ( mode ) { - case COMBINATORIAL: - case JAFFE: - return new CombinatorialRecalMapping(manager, rawQempirical, dinucs, maxPos, maxQReported); - case SEQUENTIAL: - return new SerialRecalMapping(manager, rawQempirical, dinucs, maxPos, maxQReported); - default: - throw new RuntimeException("Unimplemented recalibration mode: " + mode); + private void addCSVData(String line) { + String[] vals = line.split(","); + ArrayList key = new ArrayList(); + Covariate cov; + int iii; + for( iii = 0; iii < requestedCovariates.size(); iii++ ) { + cov = requestedCovariates.get( iii ); + if( VALIDATE_OLD_RECALIBRATOR ) { + if( iii == 1 ) { // order is different in the old recalibrator + key.add( cov.getValue( vals[2] ) ); + } else if ( iii == 2 ) { + key.add( cov.getValue( vals[1] ) ); + } else { + key.add( cov.getValue( vals[iii] ) ); + } + } else { + key.add( cov.getValue( vals[iii] ) ); + } } + RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ) ); + dataManager.addToAllTables( key, datum ); + } - public SAMRecord map(char[] ref, SAMRecord read) { + //--------------------------------------------------------------------------------------------------------------- + // + // map + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * For each base in the read calculate a new recalibrated quality score and replace the quality scores in the read + * @param refBases References bases over the length of the read + * @param read The read to be recalibrated + * @return The read with quality scores replaced + */ + public SAMRecord map( char[] refBases, SAMRecord read ) { + + // WARNING: refBases is always null because this walker doesn't have @Requires({DataSource.REFERENCE_BASES}) + // This is done in order to speed up the code + + byte[] originalQuals = read.getBaseQualities(); + // Check if we need to use the original quality scores instead + if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { + Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); + if ( obj instanceof String ) + originalQuals = QualityUtils.fastqToPhred((String)obj); + else { + throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); + } + } + byte[] recalQuals = originalQuals.clone(); + + // These calls are all expensive so only do them once for each read + final SAMReadGroupRecord readGroup = read.getReadGroup(); + final String readGroupId = readGroup.getReadGroupId(); + String platform = readGroup.getPlatform(); + final boolean isNegStrand = read.getReadNegativeStrandFlag(); + if( USE_SLX_PLATFORM ) { + platform = "ILLUMINA"; + } byte[] bases = read.getReadBases(); + int startPos = 1; + int stopPos = bases.length; - - byte[] quals = RecalDataManager.getQualsForRecalibration(read, useOriginalQuals); - - // Since we want machine direction reads not corrected positive strand reads, rev comp any negative strand reads - if (read.getReadNegativeStrandFlag()) { - bases = BaseUtils.simpleReverseComplement(bases); - quals = BaseUtils.reverse(quals); + if( isNegStrand ) { // DinucCovariate is responsible for getting the complement base if needed + startPos = 0; + stopPos = bases.length - 1; } - try { - byte[] recalQuals = recalibrateBasesAndQuals(read.getAttribute("RG").toString(), bases, quals); + ReadHashDatum readDatum = new ReadHashDatum( readGroupId, platform, originalQuals, bases, isNegStrand, read.getMappingQuality(), bases.length ); - // Don't change Q scores below some threshold - preserveQScores(quals, recalQuals, read); + // For each base in the read + for( int iii = startPos; iii < stopPos; iii++ ) { // skip first or last base because there is no dinuc depending on the direction of the read - if (read.getReadNegativeStrandFlag()) { // reverse the quals for the neg strand read - recalQuals = BaseUtils.reverse(recalQuals); - quals = BaseUtils.reverse(quals); - } - - read.setBaseQualities(recalQuals); - if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { - read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(quals)); + // Get the covariate values which make up the key + for( Covariate covariate : requestedCovariates ) { + fullCovariateKey.add( covariate.getValue( readDatum, iii ) ); // offset is zero based so passing iii is correct here } - return read; - } catch ( StingException e ) { - throw new RuntimeException(String.format("Bug found while processing read %s: %s", read.format(), e.getMessage())); + recalQuals[iii] = performSequentialQualityCalculation( fullCovariateKey ); + fullCovariateKey.clear(); } - } - private void preserveQScores( byte[] originalQuals, byte[] recalQuals, SAMRecord read ) { - for ( int i = 0; i < recalQuals.length; i++ ) { - if ( originalQuals[i] < preserveQScoresLessThan ) { - //System.out.printf("Preserving Q%d base at %d in read %s%n", originalQuals[i], i, read.getReadName()); - recalQuals[i] = originalQuals[i]; - } + preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low + + // SOLID bams insert the reference base into the read if the color space quality is zero, so don't change their base quality scores + if( platform.equalsIgnoreCase("SOLID") ) { + byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG)); + if(colorSpaceQuals != null) { preserveBadColorSpaceQualities_SOLID( originalQuals, recalQuals, colorSpaceQuals ); } } + + read.setBaseQualities(recalQuals); // overwrite old qualities with new recalibrated qualities + if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // save the old qualities if the tag isn't already taken in the read + read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals)); + } + + return read; } /** - * Workhorse routine. Given a read group and an array of bases and quals, returns a new set of recalibrated - * qualities for each base/qual in bases and quals. Uses the RecalMapping object associated with readGroup. + * Implements a serial recalibration of the reads using the combinational table. + * First, we perform a positional recalibration, and then a subsequent dinuc correction. * - * @param readGroup - * @param bases - * @param quals - * @return + * Given the full recalibration table, we perform the following preprocessing steps: + * + * - calculate the global quality score shift across all data [DeltaQ] + * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift + * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual + * - The final shift equation is: + * + * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) + * @param key The list of Comparables that were calculated from the covariates + * @return A recalibrated quality score as a byte */ - public byte[] recalibrateBasesAndQuals(final String readGroup, byte[] bases, byte[] quals) throws StingException { - if ( readGroup == null ) { throw new StingException("BUG: read group is null"); } - if ( bases == null ) { throw new StingException("BUG: bases array is null"); } - if ( bases.length == 0) { throw new StingException("BUG: bases array is size 0"); } - if ( quals == null ) { throw new StingException("BUG: quals array is null"); } + private byte performSequentialQualityCalculation( List key ) { - byte[] recalQuals = new byte[quals.length]; - RecalMapping mapper = cache.get(readGroup); - if ( mapper == null ) { - // throw new StingException(String.format("BUG: couldn't find RecalMapping for readgroup %s", readGroup)); - // Edited by ebanks on 8/9/09 - // Actually, there is a valid case when the mapper == null: when all reads in a RG are - // unmapped or have mapping quality == 0. In this case, we don't want to die - nor do we - // want to lose these reads - so we just don't alter the quals - for ( int cycle = 0; cycle < bases.length; cycle++ ) - recalQuals[cycle] = quals[cycle]; - return recalQuals; + String readGroupKeyElement = key.get(0).toString(); + int qualityScoreKeyElement = Integer.parseInt(key.get(1).toString()); + byte qualFromRead = (byte)qualityScoreKeyElement; + + // The global quality shift (over the read group only) + collapsedTableKey.add( readGroupKeyElement ); + RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get(collapsedTableKey); + Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get(collapsedTableKey); + double globalDeltaQ = 0.0; + double aggregrateQreported = 0.0; + if( globalDeltaQDatum != null ) { + aggregrateQreported = QualityUtils.phredScaleErrorRate( dataManager.dataSumExpectedErrors.get(collapsedTableKey) / ((double) globalDeltaQDatum.getNumObservations()) ); + globalDeltaQ = globalDeltaQEmpirical - aggregrateQreported; } - recalQuals[0] = quals[0]; // can't change the first -- no dinuc - for ( int cycle = 1; cycle < bases.length; cycle++ ) { // skip first and last base, qual already set because no dinuc - byte qual = quals[cycle]; - byte newQual = mapper.getNewQual(readGroup, bases[cycle - 1], bases[cycle], cycle, qual); - - if ( newQual <= 0 || newQual > QualityUtils.MAX_REASONABLE_Q_SCORE ) - throw new StingException(String.format("Bug found -- assigning bad quality score %d x %d => %d", cycle, qual, newQual)); - - recalQuals[cycle] = newQual; - //System.out.printf("Mapping %d => %d%n", qual, newQual); + // The shift in quality between reported and empirical + collapsedTableKey.add( qualityScoreKeyElement ); + Double deltaQReportedEmpirical = dataManager.getCollapsedDoubleTable(1).get(collapsedTableKey); + double deltaQReported = 0.0; + if( deltaQReportedEmpirical != null ) { + deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; + } + + // The shift in quality due to each covariate by itself in turn + double deltaQCovariates = 0.0; + Double deltaQCovariateEmpirical; + double deltaQPos = 0.0; + double deltaQDinuc = 0.0; + for( int iii = 2; iii < key.size(); iii++ ) { + collapsedTableKey.add( key.get(iii) ); // the given covariate + deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get(collapsedTableKey); + if( deltaQCovariateEmpirical != null ) { + deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) ); + if( VALIDATE_OLD_RECALIBRATOR ) { + if(iii==2) { deltaQPos = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); } // BUGBUG: only here to validate against the old recalibrator + if(iii==3) { deltaQDinuc = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); } + } + } + collapsedTableKey.remove( 2 ); // this new covariate is always added in at position 2 in the collapsedTableKey list } - return recalQuals; - } + double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; + byte newQualityByte = QualityUtils.boundQual( (int)Math.round(newQuality), QualityUtils.MAX_REASONABLE_Q_SCORE ); - public SAMFileWriter reduceInit() { - return outputBam; + + // Verbose printouts used to validate with old recalibrator + //if(key.contains(null)) { + // System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d", + // qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte)); + //} + //else { + // System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d", + // key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) ); + //} + + collapsedTableKey.clear(); + return newQualityByte; } /** - * Write out the read + * Loop of the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold + * @param originalQuals The list of original base quality scores + * @param recalQuals A list of the new recalibrated quality scores */ - public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) { + private void preserveQScores( final byte[] originalQuals, byte[] recalQuals ) { + for( int iii = 0; iii < recalQuals.length; iii++ ) { + if ( originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN ) { + recalQuals[iii] = originalQuals[iii]; + } + } + } + + /** + * Loop of the list of qualities and overwrite the newly recalibrated score to be the original score if the color space quality is zero + * @param originalQuals The list of original base quality scores + * @param recalQuals A list of the new recalibrated quality scores + * @param colorSpaceQuals The list of color space quality scores for this read + */ + private void preserveBadColorSpaceQualities_SOLID( final byte[] originalQuals, byte[] recalQuals, final byte[] colorSpaceQuals ) { + for( int iii = 0; iii < recalQuals.length; iii++ ) { + if ( colorSpaceQuals[iii] <= 0 ) { //BUGBUG: This isn't exactly correct yet + recalQuals[iii] = originalQuals[iii]; + } + } + } + + //--------------------------------------------------------------------------------------------------------------- + // + // reduce + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * Nothing start the reduce with a new handle to the output bam file + * @return A FileWriter pointing to a new bam file + */ + public SAMFileWriter reduceInit() { + return OUTPUT_BAM; + } + + /** + * Output each read to disk + * @param read The read to output + * @param output The FileWriter to write the read to + * @return The FileWriter + */ + public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) { if ( output != null ) { output.addAlignment(read); } else { @@ -337,191 +453,3 @@ public class TableRecalibrationWalker extends ReadWalker cache; - RecalDataManager manager; - - public CombinatorialRecalMapping(RecalDataManager manager, final boolean useRawQempirical, - Set dinucs, int maxPos, int maxQReported ) { - this.manager = manager; - - // initialize the data structure - cache = new ArrayList(RecalData.NDINUCS); - for ( String dinuc : dinucs ) { - cache.add(new byte[maxPos+1][maxQReported+1]); - } - - for ( RecalData datum : manager.getAll() ) { - //System.out.printf("Adding datum %s%n", datum); - byte [][] table = cache.get(this.manager.getDinucIndex(datum.dinuc)); - int pos = manager.canonicalPos(datum.pos); - if ( table[pos][datum.qual] != 0 ) - throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum)); - table[pos][datum.qual] = datum.empiricalQualByte(useRawQempirical); - //System.out.printf("Binding %d %d => %d%n", pos, datum.qual, datum.empiricalQualByte(useRawQempirical)); - } - } - - public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) { - int pos = manager.canonicalPos(cycle); - int index = this.manager.getDinucIndex(prevBase, base); - byte[][] dataTable = index == -1 ? null : cache.get(index); - - if ( dataTable == null && prevBase != 'N' && base != 'N' ) - throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, (char)prevBase, (char)base)); - - return dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual; - } -} - - -/** - * Implements a serial recalibration of the reads using the combinational table. First, - * we perform a positional recalibration, and then a subsequent dinuc correction. - * - * Given the full recalibration table, we perform the following preprocessing steps: - * - * - calculate the global quality score shift across all data [DeltaQ] - * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift - * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual - * - The final shift equation is: - * - * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) - */ -class SerialRecalMapping implements RecalMapping { - private double globalDeltaQ = 0.0; - private double[][] deltaQPosMap, deltaQDinucMap; - double [] deltaQualMap; - RecalData [][] qPosSupports = null, qDinucSupports = null; - - CombinatorialRecalMapping combiMap; - RecalDataManager manager; - - String dinucToLookAt = null; // "CC"; - int posToLookAt = -1; - int qualToLookAt = 25; - - public SerialRecalMapping(RecalDataManager manager, final boolean useRawQempirical, - Set dinucs, int maxPos, int maxQReported ) { - //combiMap = new CombinatorialRecalMapping(manager, dinucs, maxPos, maxQReported ); - this.manager = manager; - - // initialize the data structure - { - RecalData datum = new RecalData(0, 0, manager.readGroup, "**").inc(manager.getAll()); - double aggregrateQreported = RecalData.combinedQreported(manager.getAll()); - globalDeltaQ = datum.empiricalQualDouble(useRawQempirical) - aggregrateQreported; - //System.out.printf("Global quality score shift is %.2f - %.2f = %.2f%n", datum.empiricalQualDouble(useRawQempirical), aggregrateQreported, globalDeltaQ); - } - - for ( RecalData datum : manager.getAll() ) { - if ( printStateP(datum) ) - System.out.printf("PrintValue: %s%n", datum); - } - - // Jaffe-style - deltaQualMap = new double[maxQReported+1]; - for ( RecalData datum : RecalData.sort(manager.combine(true, false, true)) ) { - deltaQualMap[datum.qual] = datum.empiricalQualDouble(useRawQempirical) - datum.qual - globalDeltaQ; - //System.out.printf("%s => %s%n", datum, deltaQualMap[datum.qual]); - } - - // calculate the delta Q pos array - deltaQPosMap = new double[maxPos+1][maxQReported+1]; - //qPosSupports = new RecalData[maxPos+1][maxQReported+1]; - for ( RecalData datumAtPosQual : manager.combineDinucs() ) { - double offset = globalDeltaQ + deltaQualMap[datumAtPosQual.qual]; - updateCache(qPosSupports, datumAtPosQual, useRawQempirical, deltaQPosMap, datumAtPosQual.pos, datumAtPosQual.qual, offset); - } - - // calculate the delta Q dinuc array - deltaQDinucMap = new double[dinucs.size()+1][maxQReported+1]; - //qDinucSupports = new RecalData[dinucs.size()+1][maxQReported+1]; - for ( RecalData datumAtDinucQual : manager.combineCycles() ) { - double offset = globalDeltaQ + deltaQualMap[datumAtDinucQual.qual]; - updateCache(qDinucSupports, datumAtDinucQual, useRawQempirical, deltaQDinucMap, datumAtDinucQual.getDinucIndex(), datumAtDinucQual.qual, offset); - } - - validateTables(maxPos, maxQReported, dinucs.size(), deltaQPosMap, deltaQDinucMap, useRawQempirical, manager.getAll()); - } - - - private void validateTables(int maxPos, int maxQReported, int nDinucs, - double[][] deltaQPosMap, double[][] deltaQDinucMap, - final boolean useRawQempirical, List data ) { - for ( int i = 0; i < maxPos; i++ ) { - for ( int j = 0; j < maxQReported; j++ ) { - if ( printStateP(i, null, j) ) - System.out.printf("Mapping: pos=%d qual=%2d delta=%.2f based on %s%n", - i, j, deltaQPosMap[i][j], qPosSupports != null ? qPosSupports[i][j] : null); - } - } - - for ( int i = 0; i < nDinucs; i++ ) { - for ( int j = 0; j < maxQReported; j++ ) { - String dinuc = RecalData.dinucIndex2bases(i); - if ( printStateP(0, dinuc, j ) ) - System.out.printf("Mapping: dinuc=%s qual=%2d delta=%.2f based on %s%n", - dinuc, j, deltaQDinucMap[i][j], qDinucSupports != null ? qDinucSupports[i][j] : null); - } - } - - for ( RecalData datum : RecalData.sort(data) ) { - byte newQual = getNewQual(datum.readGroup, (byte)datum.dinuc.charAt(0), (byte)datum.dinuc.charAt(1), datum.pos, (byte)datum.qual); - if ( printStateP( datum ) ) { - System.out.printf("Serial mapping %s => %d => %d vs. %d => delta = %d%n", - datum, newQual, datum.empiricalQualByte(useRawQempirical), datum.empiricalQualByte(), newQual - datum.empiricalQualByte(useRawQempirical)); - } - } - } - - private void updateCache( RecalData[][] supports, RecalData datum, final boolean useRawQempirical, - double[][] table, int i, int j, double meanQ ) { - if ( table[i][j] != 0 ) - throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum)); - double deltaQ = datum.empiricalQualDouble(useRawQempirical) - datum.qual - meanQ; - table[i][j] = deltaQ; - if ( supports != null ) - supports[i][j] = datum; - } - - private boolean printStateP( int cycle, String dinuc, int qual ) { - return posToLookAt != -1 && - ( cycle == 0 || posToLookAt == 0 || cycle == posToLookAt ) && - ( dinucToLookAt == null || dinuc == null || manager.getDinucIndex(dinuc) == -1 || dinucToLookAt.equals(dinuc)) && - ( qualToLookAt == 0 || qual == qualToLookAt ); - } - - private boolean printStateP( RecalData datum ) { - return printStateP(datum.pos, datum.dinuc, datum.qual); - } - - public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) { - int pos = manager.canonicalPos(cycle); - int index = this.manager.getDinucIndex(prevBase, base); - - byte newQualByte = qual; - if ( qual < deltaQualMap.length ) { - // it's possible to see a qual not in the recal table -- i.e., we only see Q28 at a dbSNP site - double deltaQual = deltaQualMap[qual]; - double deltaQPos = pos >= deltaQPosMap.length ? 0.0 : deltaQPosMap[pos][qual]; // == length indices no data for last base - double deltaQDinuc = index == -1 ? 0.0 : deltaQDinucMap[index][qual]; // -1 indices N in prev or current base - double newQual = qual + globalDeltaQ + deltaQual + deltaQPos + deltaQDinuc; - newQualByte = QualityUtils.boundQual((int)Math.round(newQual), QualityUtils.MAX_REASONABLE_Q_SCORE); - - //System.out.println(String.format("%s %c%c %d %d => %d + %.2f + %.2f + %.2f + %.2f = %d", - // readGroup, prevBase, base, cycle, qual, qual, globalDeltaQ, deltaQual, deltaQPos, deltaQDinuc, newQualByte)); - - if ( newQualByte <= 0 && newQualByte >= QualityUtils.MAX_REASONABLE_Q_SCORE ) - throw new RuntimeException(String.format("Illegal base quality score calculated: %s %c%c %d %d => %d + %.2f + %.2f + %.2f = %d", - readGroup, prevBase, base, cycle, qual, qual, globalDeltaQ, deltaQPos, deltaQDinuc, newQualByte)); - } - - return newQualByte; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java deleted file mode 100755 index 2ab232713..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java +++ /dev/null @@ -1,484 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; - -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.RODRecordList; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.genotype.Variation; - -import java.io.PrintStream; -import java.io.FileNotFoundException; -import java.util.*; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMReadGroupRecord; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 3, 2009 - * - * This walker is designed to work as the first pass in a two-pass processing step. - * It does a by-locus traversal operating only at sites that are not in dbSNP. - * We assume that all mismatches we see are therefore errors. - * This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinuc) - * Since there is a large amount of data one can then calculate an empirical probability of error - * given the particular covariates seen at this site, where p(error) = num mismatches / num observations - * The output file is a CSV list of (several covariate values, num observations, num mismatches, empirical quality score) - * The first lines of the output file give the name of the covariate classes that were used for this calculation. - * - * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and must be at the start of the list. - * Note: This walker is designed to be used in conjunction with TableRecalibrationWalker. - */ - -@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file -@WalkerName( "CountCovariatesRefactored" ) -//@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality // BUGBUG taken out to match old integration tests -@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta -public class CovariateCounterWalker extends LocusWalker { - - @Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false) - private Boolean LIST_ONLY = false; - @Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false) - private String[] COVARIATES = null; - @Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false) - private boolean USE_ORIGINAL_QUALS = false; - @Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false) - private int WINDOW_SIZE = 3; - @Argument(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the outputted covariates table recalibration file") - private String RECAL_FILE = "output.recal_data.csv"; - @Argument(fullName="no_print_header", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file. For debugging purposes only.") - private boolean NO_PRINT_HEADER = false; - @Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. For debugging purposes only.") - private boolean VALIDATE_OLD_RECALIBRATOR = false; - @Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.") - private boolean USE_SLX_PLATFORM = false; - - private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps - private ArrayList requestedCovariates; // A list to hold the covariate objects that were requested - private IdentityHashMap readDatumHashMap; // A hash map that hashes the read object itself into properties commonly pulled out of the read. Done for optimization purposes. - private int sizeOfReadDatumHashMap = 0; - - private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file - private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file - private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file - - //private final String versionNumber = "2.0.0"; // major version, minor version, and build number - - //--------------------------------------------------------------------------------------------------------------- - // - // initialize - // - //--------------------------------------------------------------------------------------------------------------- - - /** - * Parse the -cov arguments and create a list of covariates to be used here - * Based on the covariates' estimates for initial capacity allocate the data hashmap - */ - public void initialize() { - - //logger.info( "CovariateCounterWalker version: " + versionNumber ); - // Get a list of all available covariates - final List> classes = PackageUtils.getClassesImplementingInterface(Covariate.class); - - // Print and exit if that's what was requested - if ( LIST_ONLY ) { - out.println( "Available covariates:" ); - for( Class covClass : classes ) { - out.println( covClass.getSimpleName() ); - } - out.println(); - - System.exit( 0 ); // early exit here because user requested it - } - - // Warn the user if no dbSNP file was specified - boolean foundDBSNP = false; - for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) { - if( rod.getName().equalsIgnoreCase( "dbsnp" ) ) { - foundDBSNP = true; - } - } - if( !foundDBSNP ) { - Utils.warnUser("This calculation is critically dependent on being able to skip over known variant sites. Are you sure you want to be running without a dbSNP rod specified?"); - } - - - // Initialize the requested covariates by parsing the -cov argument - requestedCovariates = new ArrayList(); - int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one - if( VALIDATE_OLD_RECALIBRATOR ) { - requestedCovariates.add( new ReadGroupCovariate() ); - requestedCovariates.add( new CycleCovariate() ); // unfortunately order is different here in order to match the old recalibrator exactly - requestedCovariates.add( new QualityScoreCovariate() ); - requestedCovariates.add( new DinucCovariate() ); - } else if( COVARIATES != null ) { - if(COVARIATES[0].equalsIgnoreCase("ALL")) { // the user wants ALL covariates to be used - requestedCovariates.add( new ReadGroupCovariate() ); // first add the required covariates then add the rest by looping over all implementing classes that were found - requestedCovariates.add( new QualityScoreCovariate() ); - for( Class covClass : classes ) { - try { - Covariate covariate = (Covariate)covClass.newInstance(); - estimatedCapacity *= covariate.estimatedNumberOfBins(); - // Some covariates need parameters (user supplied command line arguments) passed to them - if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); } - if( !( covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate ) ) { // these were already added so don't add them again - requestedCovariates.add( covariate ); - } - } catch ( InstantiationException e ) { - throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); - } catch ( IllegalAccessException e ) { - throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) ); - } - } - } else { // The user has specified a list of several covariates - int covNumber = 1; - for( String requestedCovariateString : COVARIATES ) { - boolean foundClass = false; - for( Class covClass : classes ) { - if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class - foundClass = true; - // Read Group Covariate and Quality Score Covariate are required covariates for the recalibration calculation and must begin the list - if( (covNumber == 1 && !requestedCovariateString.equalsIgnoreCase( "ReadGroupCovariate" )) || - (covNumber == 2 && !requestedCovariateString.equalsIgnoreCase( "QualityScoreCovariate" )) ) { - throw new StingException("ReadGroupCovariate and QualityScoreCovariate are required covariates for the recalibration calculation and must begin the list" ); - } - covNumber++; - try { - // Now that we've found a matching class, try to instantiate it - Covariate covariate = (Covariate)covClass.newInstance(); - estimatedCapacity *= covariate.estimatedNumberOfBins(); - // Some covariates need parameters (user supplied command line arguments) passed to them - if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); } - requestedCovariates.add( covariate ); - } catch ( InstantiationException e ) { - throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); - } catch ( IllegalAccessException e ) { - throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) ); - } - } - } - - if( !foundClass ) { - throw new StingException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." ); - } - } - } - } else { // No covariates were specified by the user so add the default, required ones - Utils.warnUser( "Using default set of covariates because none were specified. Using ReadGroupCovariate and QualityScoreCovariate only." ); - requestedCovariates.add( new ReadGroupCovariate() ); - requestedCovariates.add( new QualityScoreCovariate() ); - estimatedCapacity = 300 * 40; - } - - logger.info( "The covariates being used here: " ); - logger.info( requestedCovariates ); - - if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception - dataManager = new RecalDataManager( estimatedCapacity ); - readDatumHashMap = new IdentityHashMap(); - } - - - //--------------------------------------------------------------------------------------------------------------- - // - // map - // - //--------------------------------------------------------------------------------------------------------------- - - /** - * For each read at this locus get the various covariate values and increment that location in the map based on - * whether or not the base matches the reference at this particular location - * @param tracker The reference metadata tracker - * @param ref The reference context - * @param context The alignment context - * @return Returns 1, but this value isn't used in the reduce step - */ - public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - - // pull out anything passed by -B name,type,file that has the name "dbsnp" - final RODRecordList dbsnpRODs = tracker.getTrackData( "dbsnp", null ); - boolean isSNP = false; - if (dbsnpRODs != null) { - for( ReferenceOrderedDatum rod : dbsnpRODs ) { - if( ((Variation)rod).isSNP() ) { - isSNP = true; // at least one of the rods says this is a snp site - break; - } - } - } - - // Only use data from non-dbsnp sites - // Assume every mismatch at a non-dbsnp site is indicitive of poor quality - if( !isSNP ) { - final List reads = context.getReads(); - final List offsets = context.getOffsets(); - SAMRecord read; - int offset; - String readGroupId; - byte[] quals; - byte[] bases; - byte refBase; - byte prevBase; - String platform; - byte[] colorSpaceQuals; - ReadHashDatum readDatum; - boolean isNegStrand; - int mappingQuality; - int length; - - final int numReads = reads.size(); - // For each read at this locus - for( int iii = 0; iii < numReads; iii++ ) { - read = reads.get(iii); - offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct - - readDatum = readDatumHashMap.get( read ); - if( readDatum == null ) { - - // If the HashMap of read objects has grown too large then throw out the (mostly stale) reads - if( sizeOfReadDatumHashMap > 100000 ) { //BUGBUG: Can I make this number larger? - readDatumHashMap.clear(); - sizeOfReadDatumHashMap = 0; - } - - // This read isn't in the hashMap yet so fill out the datum and add it to the map so that we never have to do the work again - quals = read.getBaseQualities(); - // Check if we need to use the original quality scores instead - if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { - Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); - if ( obj instanceof String ) - quals = QualityUtils.fastqToPhred((String)obj); - else { - throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); - } - } - bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'. Is this true? - isNegStrand = read.getReadNegativeStrandFlag(); - final SAMReadGroupRecord readGroup = read.getReadGroup(); - readGroupId = readGroup.getReadGroupId(); - platform = readGroup.getPlatform(); - mappingQuality = read.getMappingQuality(); - length = bases.length; - if( USE_SLX_PLATFORM ) { - platform = "ILLUMINA"; - } - - readDatum = new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length ); - readDatumHashMap.put( read, readDatum ); - sizeOfReadDatumHashMap++; - } - - if( readDatum.mappingQuality > 0 ) { // BUGBUG: turn this into a read filter after passing the old integration tests - - // skip first and last base because there is no dinuc - // BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests. - if( offset > 0 ) { - if( offset < readDatum.length - 1 ) { - // skip if base quality is zero - if( readDatum.quals[offset] > 0 ) { - - refBase = (byte)ref.getBase(); - prevBase = readDatum.bases[offset-1]; - - // Get the complement base strand if we are a negative strand read - if( readDatum.isNegStrand ) { - prevBase = readDatum.bases[offset+1]; - } - - // skip if this base or the previous one was an 'N' or etc. - if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(readDatum.bases[offset]) ) ) { - - // SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them - colorSpaceQuals = null; - if( readDatum.platform.equalsIgnoreCase("SOLID") ) { - colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG)); - } - if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet - { - // This base finally passed all the checks, so add it to the big hashmap - updateDataFromRead( readDatum, offset, refBase ); - } - } else { - if( VALIDATE_OLD_RECALIBRATOR ) { - countedBases++; // replicating a small bug in the old recalibrator - } - } - } - } else { // at the last base in the read so we can remove it from our IdentityHashMap - readDatumHashMap.remove( read ); - sizeOfReadDatumHashMap--; - } - } - } - } - countedSites++; - - } else { // We skipped over the dbSNP site - skippedSites++; - } - - return 1; // This value isn't actually used anywhere - } - - /** - * Major workhorse routine for this walker. - * Loop through the list of requested covariates and pick out the value from the read, offset, and reference - * Using the list of covariate values as a key, pick out the RecalDatum and increment, - * adding one to the number of observations and potentially one to the number of mismatches - * Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls - * because pulling things out of the SAMRecord is an expensive operation. - * @param readDatum The ReadHashDatum holding all the important properties of this read - * @param offset The offset in the read for this locus - * @param refBase The reference base at this locus - */ - private void updateDataFromRead(final ReadHashDatum readDatum, final int offset, final byte refBase) { - - List key = new ArrayList(); - - // Loop through the list of requested covariates and pick out the value from the read, offset, and reference - for( Covariate covariate : requestedCovariates ) { - key.add( covariate.getValue( readDatum, offset ) ); - } - - // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap - RecalDatum datum = dataManager.data.get( key ); - if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it - datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method - if( VALIDATE_OLD_RECALIBRATOR ) { - dataManager.data.myPut( key, datum ); - } else { - dataManager.data.put( key, datum ); - } - } - - // Need the bases to determine whether or not we have a mismatch - byte base = readDatum.bases[offset]; - - // Add one to the number of observations and potentially one to the number of mismatches - datum.increment( (char)base, (char)refBase ); // dangerous: if you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad - countedBases++; - } - - - //--------------------------------------------------------------------------------------------------------------- - // - // reduce - // - //--------------------------------------------------------------------------------------------------------------- - - /** - * Initialize the reudce step by creating a PrintStream from the filename specified as an argument to the walker. - * @return returns A PrintStream created from the -rf filename - */ - public PrintStream reduceInit() { - try { - return new PrintStream( RECAL_FILE ); - } catch ( FileNotFoundException e ) { - throw new RuntimeException( "Couldn't open output file: ", e ); - } - } - - /** - * The Reduce method doesn't do anything for this walker. - * @param value Result of the map. This value is immediately ignored. - * @param recalTableStream The PrintStream used to output the CSV data - * @return returns The PrintStream used to output the CSV data - */ - public PrintStream reduce( Integer value, PrintStream recalTableStream ) { - return recalTableStream; // nothing to do here - } - - /** - * Write out the full data hashmap to disk in CSV format - * @param recalTableStream The PrintStream to write out to - */ - public void onTraversalDone( PrintStream recalTableStream ) { - logger.info( "Writing raw recalibration data..." ); - outputToCSV( recalTableStream ); - logger.info( "...done!" ); - - recalTableStream.close(); - } - - /** - * For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format - * @param recalTableStream The PrintStream to write out to - */ - private void outputToCSV( final PrintStream recalTableStream ) { - - if( VALIDATE_OLD_RECALIBRATOR ) { - recalTableStream.printf("# collapsed_pos false%n"); - recalTableStream.printf("# collapsed_dinuc false%n"); - recalTableStream.printf("# counted_sites %d%n", countedSites); - recalTableStream.printf("# counted_bases %d%n", countedBases); - recalTableStream.printf("# skipped_sites %d%n", skippedSites); - recalTableStream.printf("# fraction_skipped 1 / %.0f bp%n", (double)countedSites / skippedSites); - recalTableStream.printf("rg,pos,Qrep,dn,nBases,nMismatches,Qemp%n"); - // For each entry in the data hashmap - for( Pair,RecalDatum> entry : dataManager.data.entrySetSorted4() ) { - // For each Covariate in the key - for( Comparable comp : entry.first ) { - // Output the Covariate's value - recalTableStream.print( comp + "," ); - } - // Output the RecalDatum entry - recalTableStream.println( entry.second.outputToCSV() ); - } - } else { - if( !NO_PRINT_HEADER ) { - recalTableStream.printf("# Counted Sites %d%n", countedSites); - recalTableStream.printf("# Counted Bases %d%n", countedBases); - recalTableStream.printf("# Skipped Sites %d%n", skippedSites); - recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites); - for( Covariate cov : requestedCovariates ) { - // The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name - recalTableStream.println( "@!" + cov.getClass().getSimpleName() ); - } - } - // For each entry in the data hashmap - for( Map.Entry, RecalDatum> entry : dataManager.data.entrySet() ) { - // For each Covariate in the key - for( Comparable comp : entry.getKey() ) { - // Output the Covariate's value - if( NO_PRINT_HEADER && comp instanceof String ) { continue; } // BUGBUG - recalTableStream.print( comp + "," ); - } - // Output the RecalDatum entry - recalTableStream.println( entry.getValue().outputToCSV() ); - } - } - } -} - diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java deleted file mode 100755 index 693fedad9..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java +++ /dev/null @@ -1,204 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; - -import org.broadinstitute.sting.utils.QualityUtils; - -import java.util.*; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 6, 2009 - * - * This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions. - */ - -public class RecalDataManager { - public NHashMap data; // the full dataset - private NHashMap dataCollapsedReadGroup; // table where everything except read group has been collapsed - private NHashMap dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed - private ArrayList> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed - public NHashMap dataSumExpectedErrors; // table used to calculate the overall aggregate quality score in which everything except read group is collapsed - - private NHashMap dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed - private NHashMap dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed - private ArrayList> dataCollapsedByCovariateDouble; // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed - - - public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores - public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams - - RecalDataManager() { - data = new NHashMap(); - } - - RecalDataManager( final int estimatedCapacity, final boolean createCollapsedTables, final int numCovariates ) { - if( createCollapsedTables ) { // initialize all the collapsed tables - dataCollapsedReadGroup = new NHashMap(); - dataCollapsedQualityScore = new NHashMap(); - dataCollapsedByCovariate = new ArrayList>(); - for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate - dataCollapsedByCovariate.add( new NHashMap() ); - } - dataSumExpectedErrors = new NHashMap(); - } else { - data = new NHashMap( estimatedCapacity, 0.8f); - } - } - - RecalDataManager( final int estimatedCapacity ) { - data = new NHashMap( estimatedCapacity, 0.8f ); // second arg is the 'loading factor', - // a number to monkey around with when optimizing performace of the HashMap - } - - /** - * Add the given mapping to all of the collapsed hash tables - * @param key The list of comparables that is the key for this mapping - * @param fullDatum The RecalDatum which is the data for this mapping - */ - public final void addToAllTables( final List key, final RecalDatum fullDatum ) { - - // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around - //data.put(key, thisDatum); // add the mapping to the main table - - // create dataCollapsedReadGroup, the table where everything except read group has been collapsed - ArrayList newKey = new ArrayList(); - newKey.add( key.get(0) ); // make a new key with just the read group - RecalDatum collapsedDatum = dataCollapsedReadGroup.get( newKey ); - if( collapsedDatum == null ) { - dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) ); - } else { - collapsedDatum.increment(fullDatum); - } - - // create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed - newKey = new ArrayList(); - newKey.add( key.get(0) ); // make a new key with just the read group - Double sumExpectedErrors = dataSumExpectedErrors.get( newKey ); - if( sumExpectedErrors == null ) { - dataSumExpectedErrors.put( newKey, 0.0 ); - } else { - dataSumExpectedErrors.remove( newKey ); - sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * fullDatum.getNumObservations(); - dataSumExpectedErrors.put( newKey, sumExpectedErrors ); - } - - newKey = new ArrayList(); - // create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed - newKey.add( key.get(0) ); // make a new key with the read group ... - newKey.add( key.get(1) ); // and quality score - collapsedDatum = dataCollapsedQualityScore.get( newKey ); - if( collapsedDatum == null ) { - dataCollapsedQualityScore.put( newKey, new RecalDatum(fullDatum) ); - } else { - collapsedDatum.increment(fullDatum); - } - - // create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed - for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { // readGroup and QualityScore aren't counted - newKey = new ArrayList(); - newKey.add( key.get(0) ); // make a new key with the read group ... - newKey.add( key.get(1) ); // and quality score ... - newKey.add( key.get(iii + 2) ); // and the given covariate - collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey ); - if( collapsedDatum == null ) { - dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) ); - } else { - collapsedDatum.increment(fullDatum); - } - } - } - - /** - * Loop over all the collapsed tables and turn the recalDatums found there into an empricial quality score - * that will be used in the sequential calculation in TableRecalibrationWalker - * @param numCovariates The number of covariates you have determines the number of tables to create - * @param smoothing The smoothing paramter that goes into empirical quality score calculation - */ - public final void generateEmpiricalQualities( final int numCovariates, final int smoothing ) { - - dataCollapsedReadGroupDouble = new NHashMap(); - dataCollapsedQualityScoreDouble = new NHashMap(); - dataCollapsedByCovariateDouble = new ArrayList>(); - for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate - dataCollapsedByCovariateDouble.add( new NHashMap() ); - } - - // Hash the empirical quality scores so we don't have to call Math.log at every base for every read - // Looping over the entrySet is really expensive but worth it - for( Map.Entry,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) { - dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); - } - for( Map.Entry,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) { - dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); - } - for( int iii = 0; iii < numCovariates - 2; iii++ ) { - for( Map.Entry,RecalDatum> entry : dataCollapsedByCovariate.get(iii).entrySet() ) { - dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); - } - } - - dataCollapsedQualityScore.clear(); - dataCollapsedByCovariate.clear(); - dataCollapsedQualityScore = null; // will never need this again - dataCollapsedByCovariate = null; // will never need this again - if( data!=null ) { - data.clear(); - data = null; // will never need this again - } - } - - /** - * Get the appropriate collapsed table out of the set of all the tables held by this Object - * @param covariate Which covariate indexes the desired collapsed HashMap - * @return The desired collapsed HashMap - */ - public final NHashMap getCollapsedTable( final int covariate ) { - if( covariate == 0) { - return dataCollapsedReadGroup; // table where everything except read group has been collapsed - } else if( covariate == 1 ) { - return dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed - } else { - return dataCollapsedByCovariate.get( covariate - 2 ); // table where everything except read group, quality score, and given covariate has been collapsed - } - } - - /** - * Get the appropriate collapsed table of emprical quality out of the set of all the tables held by this Object - * @param covariate Which covariate indexes the desired collapsed NHashMap - * @return The desired collapsed NHashMap - */ - public final NHashMap getCollapsedDoubleTable( final int covariate ) { - if( covariate == 0) { - return dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed - } else if( covariate == 1 ) { - return dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed - } else { - return dataCollapsedByCovariateDouble.get( covariate - 2 ); // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java deleted file mode 100755 index 63d15af6f..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java +++ /dev/null @@ -1,454 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMReadGroupRecord; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.walkers.WalkerName; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.*; - -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Pattern; -import java.io.File; -import java.io.FileNotFoundException; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 3, 2009 - * - * This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. - - * For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc) - * Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read. - * This walker then outputs a new bam file with these updated (recalibrated) reads. - * - * Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker. - * Note: This walker is designed to be used in conjunction with CovariateCounterWalker. - */ - -@WalkerName("TableRecalibrationRefactored") -@Requires({ DataSource.READS, DataSource.REFERENCE }) // This walker requires -I input.bam, it also requires -R reference.fasta, but by not saying @requires REFERENCE_BASES I'm telling the - // GATK to not actually spend time giving me the refBase array since I don't use it -public class TableRecalibrationWalker extends ReadWalker { - - @Argument(fullName="recal_file", shortName="recalFile", doc="Input recalibration table file generated by CountCovariates", required=true) - private String RECAL_FILE = null; - @Argument(fullName="output_bam", shortName="outputBam", doc="output BAM file", required=false) - private SAMFileWriter OUTPUT_BAM = null; - @Argument(fullName="preserve_qscores_less_than", shortName="pQ", - doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false) - private int PRESERVE_QSCORES_LESS_THAN = 5; - @Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false) - private boolean USE_ORIGINAL_QUALS = false; - @Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false) - private int WINDOW_SIZE = 3; - @Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points") - private int SMOOTHING = 1; - @Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. For debugging purposes only.") - private boolean VALIDATE_OLD_RECALIBRATOR = false; - @Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.") - private boolean USE_SLX_PLATFORM = false; - - //public enum RecalibrationMode { - // COMBINATORIAL, - // SEQUENTIAL, - // ERROR - //} - - //@Argument(fullName="recalibrationMode", shortName="mode", doc="Which calculation to use when recalibrating, default is SEQUENTIAL", required=false) - private String MODE_STRING = "SEQUENTIAL"; - //public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes? - - private RecalDataManager dataManager; - private ArrayList requestedCovariates; - private ArrayList fullCovariateKey; // the list that will be used over and over again to hold the full set of covariate values - private ArrayList collapsedTableKey; // the key that will be used over and over again to query the collapsed tables - - private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); - private static Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*"); - private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*"); - - //private final String versionNumber = "2.0.0"; // major version, minor version, and build number - - //--------------------------------------------------------------------------------------------------------------- - // - // initialize - // - //--------------------------------------------------------------------------------------------------------------- - - /** - * Read in the recalibration table input file. - * Parse the list of covariate classes used during CovariateCounterWalker. - * Parse the CSV data and populate the hashmap. - */ - public void initialize() { - - //logger.info( "TableRecalibrationWalker version: " + versionNumber ); - // Get a list of all available covariates - List> classes = PackageUtils.getClassesImplementingInterface(Covariate.class); - - int lineNumber = 0; - boolean foundAllCovariates = false; - int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one - - // Warn the user if a dbSNP file was specified since it isn't being used here - boolean foundDBSNP = false; - for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) { - if( rod.getName().equalsIgnoreCase( "dbsnp" ) ) { - foundDBSNP = true; - } - } - if( foundDBSNP ) { - Utils.warnUser("A dbSNP rod file was specified but TableRecalibrationWalker doesn't make use of it."); - } - - fullCovariateKey = new ArrayList(); - collapsedTableKey = new ArrayList(); - - // Read in the covariates that were used from the input file - requestedCovariates = new ArrayList(); - - // Read in the data from the csv file and populate the map - logger.info( "Reading in the data from input file..." ); - - try { - for ( String line : new xReadLines(new File( RECAL_FILE )) ) { - lineNumber++; - if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) { - ; // skip over the comment lines, (which start with '#') - } - else if( COVARIATE_PATTERN.matcher(line).matches() ) { // the line string is either specifying a covariate or is giving csv data - if( foundAllCovariates ) { - throw new StingException( "Malformed input recalibration file. Found covariate names intermingled with data. " + RECAL_FILE ); - } else { // found another covariate in input file - boolean foundClass = false; - for( Class covClass : classes ) { - - if( line.equalsIgnoreCase( "@!" + covClass.getSimpleName() ) ) { // the "@!" was added by CovariateCounterWalker as a code to recognize covariate class names - foundClass = true; - try { - Covariate covariate = (Covariate)covClass.newInstance(); - // some covariates need parameters (user supplied command line arguments) passed to them - if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); } - requestedCovariates.add( covariate ); - estimatedCapacity *= covariate.estimatedNumberOfBins(); - - } catch ( InstantiationException e ) { - throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); - } catch ( IllegalAccessException e ) { - throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) ); - } - } - } - - if( !foundClass ) { - throw new StingException( "Malformed input recalibration file. The requested covariate type (" + line + ") isn't a valid covariate option." ); - } - - } - - } else { // found some data - if( !foundAllCovariates ) { - if( VALIDATE_OLD_RECALIBRATOR ) { - requestedCovariates.add( new ReadGroupCovariate() ); - requestedCovariates.add( new QualityScoreCovariate() ); - requestedCovariates.add( new CycleCovariate() ); - requestedCovariates.add( new DinucCovariate() ); - } - foundAllCovariates = true; - if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception - final boolean createCollapsedTables = true; - // Initialize the data hashMaps - dataManager = new RecalDataManager( estimatedCapacity, createCollapsedTables, requestedCovariates.size() ); - - } - addCSVData(line); // parse the line and add the data to the HashMap - } - } - - } catch ( FileNotFoundException e ) { - Utils.scareUser("Can not find input file: " + RECAL_FILE); - } catch ( NumberFormatException e ) { - throw new StingException("Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker."); - } - logger.info( "...done!" ); - - logger.info( "The covariates being used here: " ); - logger.info( requestedCovariates ); - - // Create the collapsed tables that are used in the sequential calculation - if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) { - logger.info( "Generating tables of empirical qualities for use in sequential calculation..." ); - dataManager.generateEmpiricalQualities( requestedCovariates.size(), SMOOTHING ); - logger.info( "...done!" ); - } - } - - /** - * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches) - * @param line A line of CSV data read from the recalibration table data file - */ - private void addCSVData(String line) { - String[] vals = line.split(","); - ArrayList key = new ArrayList(); - Covariate cov; - int iii; - for( iii = 0; iii < requestedCovariates.size(); iii++ ) { - cov = requestedCovariates.get( iii ); - if( VALIDATE_OLD_RECALIBRATOR ) { - if( iii == 1 ) { // order is different in the old recalibrator - key.add( cov.getValue( vals[2] ) ); - } else if ( iii == 2 ) { - key.add( cov.getValue( vals[1] ) ); - } else { - key.add( cov.getValue( vals[iii] ) ); - } - } else { - key.add( cov.getValue( vals[iii] ) ); - } - } - RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ) ); - dataManager.addToAllTables( key, datum ); - - } - - //--------------------------------------------------------------------------------------------------------------- - // - // map - // - //--------------------------------------------------------------------------------------------------------------- - - /** - * For each base in the read calculate a new recalibrated quality score and replace the quality scores in the read - * @param refBases References bases over the length of the read - * @param read The read to be recalibrated - * @return The read with quality scores replaced - */ - public SAMRecord map( char[] refBases, SAMRecord read ) { - - // WARNING: refBases is always null because this walker doesn't have @Requires({DataSource.REFERENCE_BASES}) - // This is done in order to speed up the code - - byte[] originalQuals = read.getBaseQualities(); - // Check if we need to use the original quality scores instead - if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { - Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); - if ( obj instanceof String ) - originalQuals = QualityUtils.fastqToPhred((String)obj); - else { - throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); - } - } - byte[] recalQuals = originalQuals.clone(); - - // These calls are all expensive so only do them once for each read - final SAMReadGroupRecord readGroup = read.getReadGroup(); - final String readGroupId = readGroup.getReadGroupId(); - String platform = readGroup.getPlatform(); - final boolean isNegStrand = read.getReadNegativeStrandFlag(); - if( USE_SLX_PLATFORM ) { - platform = "ILLUMINA"; - } - byte[] bases = read.getReadBases(); - int startPos = 1; - int stopPos = bases.length; - - if( isNegStrand ) { // DinucCovariate is responsible for getting the complement base if needed - startPos = 0; - stopPos = bases.length - 1; - } - - ReadHashDatum readDatum = new ReadHashDatum( readGroupId, platform, originalQuals, bases, isNegStrand, read.getMappingQuality(), bases.length ); - - // For each base in the read - for( int iii = startPos; iii < stopPos; iii++ ) { // skip first or last base because there is no dinuc depending on the direction of the read - - // Get the covariate values which make up the key - for( Covariate covariate : requestedCovariates ) { - fullCovariateKey.add( covariate.getValue( readDatum, iii ) ); // offset is zero based so passing iii is correct here - } - - recalQuals[iii] = performSequentialQualityCalculation( fullCovariateKey ); - fullCovariateKey.clear(); - } - - preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low - - // SOLID bams insert the reference base into the read if the color space quality is zero, so don't change their base quality scores - if( platform.equalsIgnoreCase("SOLID") ) { - byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG)); - if(colorSpaceQuals != null) { preserveBadColorSpaceQualities_SOLID( originalQuals, recalQuals, colorSpaceQuals ); } - } - - read.setBaseQualities(recalQuals); // overwrite old qualities with new recalibrated qualities - if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // save the old qualities if the tag isn't already taken in the read - read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals)); - } - - return read; - } - - /** - * Implements a serial recalibration of the reads using the combinational table. - * First, we perform a positional recalibration, and then a subsequent dinuc correction. - * - * Given the full recalibration table, we perform the following preprocessing steps: - * - * - calculate the global quality score shift across all data [DeltaQ] - * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift - * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual - * - The final shift equation is: - * - * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) - * @param key The list of Comparables that were calculated from the covariates - * @return A recalibrated quality score as a byte - */ - private byte performSequentialQualityCalculation( List key ) { - - String readGroupKeyElement = key.get(0).toString(); - int qualityScoreKeyElement = Integer.parseInt(key.get(1).toString()); - byte qualFromRead = (byte)qualityScoreKeyElement; - - // The global quality shift (over the read group only) - collapsedTableKey.add( readGroupKeyElement ); - RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get(collapsedTableKey); - Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get(collapsedTableKey); - double globalDeltaQ = 0.0; - double aggregrateQreported = 0.0; - if( globalDeltaQDatum != null ) { - aggregrateQreported = QualityUtils.phredScaleErrorRate( dataManager.dataSumExpectedErrors.get(collapsedTableKey) / ((double) globalDeltaQDatum.getNumObservations()) ); - globalDeltaQ = globalDeltaQEmpirical - aggregrateQreported; - } - - // The shift in quality between reported and empirical - collapsedTableKey.add( qualityScoreKeyElement ); - Double deltaQReportedEmpirical = dataManager.getCollapsedDoubleTable(1).get(collapsedTableKey); - double deltaQReported = 0.0; - if( deltaQReportedEmpirical != null ) { - deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; - } - - // The shift in quality due to each covariate by itself in turn - double deltaQCovariates = 0.0; - Double deltaQCovariateEmpirical; - double deltaQPos = 0.0; - double deltaQDinuc = 0.0; - for( int iii = 2; iii < key.size(); iii++ ) { - collapsedTableKey.add( key.get(iii) ); // the given covariate - deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get(collapsedTableKey); - if( deltaQCovariateEmpirical != null ) { - deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) ); - if( VALIDATE_OLD_RECALIBRATOR ) { - if(iii==2) { deltaQPos = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); } // BUGBUG: only here to validate against the old recalibrator - if(iii==3) { deltaQDinuc = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); } - } - } - collapsedTableKey.remove( 2 ); // this new covariate is always added in at position 2 in the collapsedTableKey list - } - - double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; - byte newQualityByte = QualityUtils.boundQual( (int)Math.round(newQuality), QualityUtils.MAX_REASONABLE_Q_SCORE ); - - - // Verbose printouts used to validate with old recalibrator - //if(key.contains(null)) { - // System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d", - // qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte)); - //} - //else { - // System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d", - // key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) ); - //} - - collapsedTableKey.clear(); - return newQualityByte; - } - - /** - * Loop of the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold - * @param originalQuals The list of original base quality scores - * @param recalQuals A list of the new recalibrated quality scores - */ - private void preserveQScores( final byte[] originalQuals, byte[] recalQuals ) { - for( int iii = 0; iii < recalQuals.length; iii++ ) { - if ( originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN ) { - recalQuals[iii] = originalQuals[iii]; - } - } - } - - /** - * Loop of the list of qualities and overwrite the newly recalibrated score to be the original score if the color space quality is zero - * @param originalQuals The list of original base quality scores - * @param recalQuals A list of the new recalibrated quality scores - * @param colorSpaceQuals The list of color space quality scores for this read - */ - private void preserveBadColorSpaceQualities_SOLID( final byte[] originalQuals, byte[] recalQuals, final byte[] colorSpaceQuals ) { - for( int iii = 0; iii < recalQuals.length; iii++ ) { - if ( colorSpaceQuals[iii] <= 0 ) { //BUGBUG: This isn't exactly correct yet - recalQuals[iii] = originalQuals[iii]; - } - } - } - - //--------------------------------------------------------------------------------------------------------------- - // - // reduce - // - //--------------------------------------------------------------------------------------------------------------- - - /** - * Nothing start the reduce with a new handle to the output bam file - * @return A FileWriter pointing to a new bam file - */ - public SAMFileWriter reduceInit() { - return OUTPUT_BAM; - } - - /** - * Output each read to disk - * @param read The read to output - * @param output The FileWriter to write the read to - * @return The FileWriter - */ - public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) { - if ( output != null ) { - output.addAlignment(read); - } else { - out.println(read.format()); - } - - return output; - } -} diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterTest.java deleted file mode 100755 index ded6fd535..000000000 --- a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterTest.java +++ /dev/null @@ -1,184 +0,0 @@ -// our package -package org.broadinstitute.sting.gatk.walkers.recalibration; - -import java.util.*; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMFileHeader; - -// the imports for unit testing. -import org.junit.Assert; -import org.junit.Test; -import org.junit.Before; -import org.junit.BeforeClass; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.ArtificialSAMFileReader; -import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; - -import java.util.HashSet; -import java.io.FileNotFoundException; -import java.io.File; - -/** - * Basic unit test for RecalData - */ -public class CovariateCounterTest extends BaseTest { - String readGroup1 = "rg1"; - String readGroup2 = "rg2"; - Set readGroups = new HashSet(); - - SAMFileHeader header; - - SAMRecord read1, read2, read3; - - byte bases1[] = {'a', 't', 'c', 'g', 'a'}; - byte quals1[] = {1, 2, 3, 4, 5}; - byte quals3[] = {1, 2, 5, 5, 5}; - byte bases2[] = {'t', 'c', 'g', 'a', 't'}; - byte quals2[] = {2, 2, 4, 5, 2}; - - /* - public CovariateCounter( Set readGroups, boolean collapsePos, boolean collapseDinuc ) { - public Set getReadGroups() { - public boolean isCollapseDinuc() { - public boolean isCollapsePos() { - public int getNReadGroups() { - private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) { - public List getRecalData(String readGroup) { - public int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) { - */ - - /** - * The fasta, for comparison. - */ - protected static IndexedFastaSequenceFile sequenceFile = null; - - CovariateCounter c; - - /** - * Initialize the fasta. - */ - @BeforeClass - public static void initialize() throws FileNotFoundException { - sequenceFile = new IndexedFastaSequenceFile( new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") ); - GenomeLocParser.setupRefContigOrdering(sequenceFile); - - } - - @Before - public void initializeBefore() { - header = ArtificialSAMUtils.createArtificialSamHeader(2,0,247249719); - readGroups.addAll(Arrays.asList(readGroup1, readGroup2)); - ArtificialSAMUtils.createDefaultReadGroup( header, readGroup1, "sample1" ); - ArtificialSAMUtils.createDefaultReadGroup( header, readGroup2, "sample2" ); - c = new CovariateCounter( readGroups, false, false, false ); - - read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1, bases1, quals1); - read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",1,1, bases2, quals2); - read3 = ArtificialSAMUtils.createArtificialRead(header,"read3",1,1, bases1, quals3); - } - - - @Test - public void testCovariateCounterSetup() { - Assert.assertEquals("Number of read groups is wrong", c.getNReadGroups(), 2); - Assert.assertEquals("Read group identities are wrong", c.getReadGroups(), readGroups); - Assert.assertEquals("Incorrectly collapsed counter", c.isCollapseDinuc(), false); - Assert.assertEquals("Incorrectly collapsed counter", c.isCollapsePos(), false); - } - - @Test - public void testOneRead() { - for ( int i = 1; i < read1.getReadBases().length; i++ ) - c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false); - c.printState(); - - Assert.assertEquals("Incorrect mapping to recal bin", c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N, 0); - for ( int i = 1; i < bases1.length; i++ ) { - RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]); - System.out.printf("%s%n", datum); - Assert.assertNotNull("Incorrect mapping to recal bin", datum); - Assert.assertEquals("Bad mismatch count", datum.B, 0); - Assert.assertEquals("Bad base count", datum.N, 1); - Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]); - Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]); - Assert.assertEquals("Qual is bad", datum.qual, quals1[i]); - } - } - - @Test - public void testTwoReads() { - for ( int i = 1; i < read1.getReadBases().length; i++ ) - c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false); - for ( int i = 1; i < read2.getReadBases().length; i++ ) - c.updateDataFromRead(readGroup2, read2, i, (char)read2.getReadBases()[i], false); - c.printState(); - - Assert.assertEquals("Incorrect mapping to recal bin", c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N, 0); - for ( int i = 1; i < bases1.length; i++ ) { - RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]); - System.out.printf("%s%n", datum); - Assert.assertNotNull("Incorrect mapping to recal bin", datum); - Assert.assertEquals("Bad mismatch count", datum.B, 0); - Assert.assertEquals("Bad base count", datum.N, 1); - Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]); - Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]); - Assert.assertEquals("Qual is bad", datum.qual, quals1[i]); - } - } - - @Test - public void testTwoReadsSameGroup() { - for ( int i = 1; i < read1.getReadBases().length; i++ ) - c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false); - for ( int i = 1; i < read2.getReadBases().length; i++ ) - c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false); - c.printState(); - - for ( int i = 1; i < bases1.length; i++ ) { - RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]); - System.out.printf("%s%n", datum); - Assert.assertNotNull("Incorrect mapping to recal bin", datum); - Assert.assertEquals("Bad mismatch count", datum.B, 0); - Assert.assertEquals("Bad base count", datum.N, 2); - Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]); - Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]); - Assert.assertEquals("Qual is bad", datum.qual, quals1[i]); - } - } - - @Test - public void testTwoReadsSameGroupNotIdentical() { - for ( int i = 1; i < read1.getReadBases().length; i++ ) - c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false); - for ( int i = 1; i < read3.getReadBases().length; i++ ) - c.updateDataFromRead(readGroup1, read3, i, (char)read3.getReadBases()[i], false); - c.printState(); - - for ( int i = 1; i < bases1.length; i++ ) { - RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]); - System.out.printf("%s%n", datum); - Assert.assertNotNull("Incorrect mapping to recal bin", datum); - Assert.assertEquals("Bad mismatch count", datum.B, 0); - Assert.assertEquals("Bad base count", datum.N, quals1[i] == quals3[i] ? 2 : 1); - Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]); - Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]); - Assert.assertEquals("Qual is bad", datum.qual, quals1[i]); - } - } - - @Test (expected = RuntimeException.class) - public void testBadReadOffset() { - byte bases[] = {'a', 't', 'c', 'g', 'a'}; - byte quals[] = {1, 2, 3, 4, 5}; - - SAMRecord read = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1, bases, quals); - - c.updateDataFromRead(readGroup1, read, 0, (char)bases[0], false); - } - -} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataTest.java deleted file mode 100755 index 4d6098345..000000000 --- a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataTest.java +++ /dev/null @@ -1,183 +0,0 @@ -// our package -package org.broadinstitute.sting.gatk.walkers.recalibration; - - -// the imports for unit testing. - -import org.junit.Assert; -import org.junit.Test; -import org.junit.Before; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.QualityUtils; -import java.util.HashSet; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Basic unit test for RecalData - */ -public class RecalDataTest extends BaseTest { - RecalData datum, starDatum; - - /** - * Tests that we got a string parameter in correctly - */ - @Before - public void before() { - datum = new RecalData(2, 3, "0", "AA"); - datum.B = datum.N = 1; - starDatum = new RecalData(2, 3, "0", "**"); - starDatum.B = starDatum.N = 1; - } - - - @Test - public void testBasic() { - logger.warn("Executing testIsBetween"); - - Assert.assertTrue(datum.N == 1); - Assert.assertTrue(datum.B == 1); - Assert.assertTrue(datum.pos == 2); - Assert.assertTrue(datum.qual == 3); - Assert.assertTrue(datum.readGroup.equals("0")); - Assert.assertTrue(datum.dinuc.equals("AA")); - - Assert.assertTrue(starDatum.N == 1); - Assert.assertTrue(starDatum.B == 1); - Assert.assertTrue(starDatum.pos == 2); - Assert.assertTrue(starDatum.qual == 3); - Assert.assertTrue(starDatum.readGroup.equals("0")); - Assert.assertTrue(starDatum.dinuc.equals("**")); - } - - @Test - public void testInc() { - logger.warn("Executing testInc"); - - datum.inc(1L, 0L); - Assert.assertTrue(datum.N == 2); - Assert.assertTrue(datum.B == 1); - - datum.inc('A', 'A'); - Assert.assertTrue(datum.N == 3); - Assert.assertTrue(datum.B == 1); - - datum.inc('A', 'C'); - Assert.assertTrue(datum.N == 4); - Assert.assertTrue(datum.B == 2); - } - - - @Test - public void testEmpQual() { - logger.warn("Executing testEmpQual"); - - datum.B = 1; - datum.N = 1; - Assert.assertEquals(datum.empiricalQualDouble(), 0.0, 1e-5); - Assert.assertEquals(datum.empiricalQualByte(), 1); - - datum.B = 1; - datum.N = 2; - Assert.assertEquals(datum.empiricalQualDouble(), 3.0103, 1e-5); - Assert.assertEquals(datum.empiricalQualByte(), 3); - - datum.B = 1; - datum.N = 3; - Assert.assertEquals(datum.empiricalQualDouble(), 4.771213, 1e-5); - Assert.assertEquals(datum.empiricalQualByte(), 5); - - datum.B = 1; - datum.B = 2; - Assert.assertEquals(datum.empiricalQualDouble(), 1.760913, 1e-5); - Assert.assertEquals(datum.empiricalQualByte(), 2); - - datum.B = 1; - datum.N = 10; - Assert.assertEquals(datum.empiricalQualDouble(), 10.0, 1e-5); - Assert.assertEquals(datum.empiricalQualByte(), 10); - - datum.B = 1; - datum.N = 100; - Assert.assertEquals(datum.empiricalQualDouble(), 20.0, 1e-5); - Assert.assertEquals(datum.empiricalQualByte(), 20); - - datum.B = 1; - datum.N = 1000; - Assert.assertEquals(datum.empiricalQualDouble(), 30.0, 1e-5); - Assert.assertEquals(datum.empiricalQualByte(), 30); - - datum.B = 0; - datum.N = 1000; - Assert.assertEquals(datum.empiricalQualDouble(), QualityUtils.MAX_REASONABLE_Q_SCORE, 1e-5); - Assert.assertEquals(datum.empiricalQualByte(), QualityUtils.MAX_REASONABLE_Q_SCORE); - } - - - public void testtoCSVString() { - logger.warn("Executing testtoCSVString"); - - Assert.assertEquals(datum.toCSVString(false), "0,AA,3,2,1,1,0"); - Assert.assertEquals(datum.toCSVString(true), "0,AA,3,*,1,1,0"); - } - - - public void testFromCSVString() { - logger.warn("Executing testFromCSVString"); - - Assert.assertEquals(RecalData.fromCSVString("0,AA,3,2,1,1,0").toCSVString(false), datum.toCSVString(false)); - Assert.assertEquals(RecalData.fromCSVString("0,AA,3,*,1,1,0").toCSVString(false), datum.toCSVString(true)); - Assert.assertEquals(RecalData.fromCSVString("0,**,3,2,1,1,0").toCSVString(false), starDatum.toCSVString(false)); - Assert.assertEquals(RecalData.fromCSVString("0,**,3,*,1,1,0").toCSVString(false), starDatum.toCSVString(true)); - } - - public void testDinucIndex() { - logger.warn("Executing testDinucIndex"); - - HashSet indices = new HashSet(); - HashSet unknownBytes = new HashSet(); - byte bases[] = {'A', 'C', 'G', 'T', '*', 'N'}; - unknownBytes.add((byte)'*'); - unknownBytes.add((byte)'N'); - - for ( int i = 0; i < bases.length; i++ ) { - for ( int j = 0; j < bases.length; j++ ) { - byte[] bp = {bases[i], bases[j]}; - String s = new String(bp); - int index = RecalData.dinucIndex(s); - indices.add(index); - Assert.assertEquals(index, RecalData.dinucIndex(bases[i], bases[j])); - if ( index != -1 ) { - Assert.assertEquals(RecalData.dinucIndex2bases(index), s); - } else { - Assert.assertTrue(unknownBytes.contains(bp[0]) || unknownBytes.contains(bp[1]) ); - } - } - } - Assert.assertEquals(indices.size(), 17); - } - -} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java index fa2159b80..3adea5588 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java @@ -29,7 +29,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { " -T CountCovariates" + " -I " + bam + " -L 1:10,000,000-11,000,000" + - " --params %s", + " --validate_old_recalibrator" + + " --use_slx_platform" + + " -recalFile %s", 1, // just one output file Arrays.asList(md5)); List result = executeTest("testCountCovariates1", spec).getFirst(); @@ -56,8 +58,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { " -T TableRecalibration" + " -I " + bam + " -L 1:10,000,000-20,000,000" + - " --outputBam %s" + - " --params " + paramsFile, + " --validate_old_recalibrator" + + " --use_slx_platform" + + " -outputBam %s" + + " -recalFile " + paramsFile, 1, // just one output file Arrays.asList(md5)); executeTest("testTableRecalibrator1", spec); diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RefactoredRecalibrationWalkersIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RefactoredRecalibrationWalkersIntegrationTest.java deleted file mode 100755 index 0fb7c475c..000000000 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RefactoredRecalibrationWalkersIntegrationTest.java +++ /dev/null @@ -1,71 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; - -import org.broadinstitute.sting.WalkerTest; -import org.junit.Test; - -import java.util.HashMap; -import java.util.Map; -import java.util.Arrays; -import java.util.List; -import java.io.File; - -public class RefactoredRecalibrationWalkersIntegrationTest extends WalkerTest { - static HashMap paramsFiles = new HashMap(); - - @Test - public void testCountCovariatesR() { - HashMap e = new HashMap(); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.SLX.SRP000031.2009_06.chr1.10_20mb.bam", "7be0b7a624d22187e712131f12aae546" ); - //e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "" ); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "dc1a1b6e99f4a47525cc1dce7b6eb1dc" ); - - for ( Map.Entry entry : e.entrySet() ) { - String bam = entry.getKey(); - String md5 = entry.getValue(); - - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R /broad/1KG/reference/human_b36_both.fasta" + - " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" + - " -T CountCovariatesRefactored" + - " -I " + bam + - " -L 1:10,000,000-11,000,000" + - " --validate_old_recalibrator" + - " --use_slx_platform" + - " -recalFile %s", - 1, // just one output file - Arrays.asList(md5)); - List result = executeTest("testCountCovariatesR", spec).getFirst(); - paramsFiles.put(bam, result.get(0).getAbsolutePath()); - } - } - - @Test - public void testTableRecalibratorR() { - HashMap e = new HashMap(); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.SLX.SRP000031.2009_06.chr1.10_20mb.bam", "3ace4b9b8495429cc32e7008145f4996" ); - //e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "" ); - //e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "" ); - - for ( Map.Entry entry : e.entrySet() ) { - String bam = entry.getKey(); - String md5 = entry.getValue(); - String paramsFile = paramsFiles.get(bam); - System.out.printf("PARAMS FOR %s is %s%n", bam, paramsFile); - if ( paramsFile != null ) { - WalkerTestSpec spec = new WalkerTestSpec( - "-R /broad/1KG/reference/human_b36_both.fasta" + - " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" + - " -T TableRecalibrationRefactored" + - " -I " + bam + - " -L 1:10,000,000-20,000,000" + - " --validate_old_recalibrator" + - " --use_slx_platform" + - " -outputBam %s" + - " -recalFile " + paramsFile, - 1, // just one output file - Arrays.asList(md5)); - executeTest("testTableRecalibratorR", spec); - } - } - } -} diff --git a/java/test/org/broadinstitute/sting/utils/ExpandingArrayListTest.java b/java/test/org/broadinstitute/sting/utils/ExpandingArrayListTest.java index 3f0d91760..6a9ab005f 100755 --- a/java/test/org/broadinstitute/sting/utils/ExpandingArrayListTest.java +++ b/java/test/org/broadinstitute/sting/utils/ExpandingArrayListTest.java @@ -8,7 +8,6 @@ import org.junit.Assert; import org.junit.Test; import org.junit.Before; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.walkers.recalibration.RecalData; import org.broadinstitute.sting.utils.QualityUtils; import java.util.HashSet; import java.util.Arrays; diff --git a/java/test/org/broadinstitute/sting/utils/MergingIteratorTest.java b/java/test/org/broadinstitute/sting/utils/MergingIteratorTest.java index a35e8d102..997b91d16 100644 --- a/java/test/org/broadinstitute/sting/utils/MergingIteratorTest.java +++ b/java/test/org/broadinstitute/sting/utils/MergingIteratorTest.java @@ -8,7 +8,6 @@ import org.junit.Assert; import org.junit.Test; import org.junit.Before; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.walkers.recalibration.RecalData; import org.broadinstitute.sting.utils.QualityUtils; import java.util.*;