The old recalibrator is replaced with the refactored recalibrator. Added a version message to the logger output. These walkers start at version 2.0.0
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2117 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
dfe7d69471
commit
1d46de6d34
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
|
|
@ -1,148 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import javax.management.RuntimeErrorException;
|
||||
import java.util.*;
|
||||
|
||||
public class CovariateCounter {
|
||||
private boolean collapsePos = false;
|
||||
private boolean collapseDinuc = false;
|
||||
private boolean assumeFaultyHeader = false;
|
||||
|
||||
private HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();
|
||||
|
||||
protected static Logger logger = Logger.getLogger(CovariateCounter.class);
|
||||
|
||||
public CovariateCounter( Set<String> readGroups, boolean collapsePos, boolean collapseDinuc, boolean assumeFaultyHeader ) {
|
||||
this.collapsePos = collapsePos;
|
||||
this.collapseDinuc = collapseDinuc;
|
||||
this.assumeFaultyHeader = assumeFaultyHeader;
|
||||
|
||||
for (String readGroup : readGroups ) {
|
||||
RecalDataManager manager = makeManager(readGroup);
|
||||
data.put(readGroup, manager);
|
||||
}
|
||||
}
|
||||
|
||||
private RecalDataManager makeManager(final String readGroup) {
|
||||
return new RecalDataManager(readGroup, ! collapsePos, ! collapseDinuc );
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the set of readGroup names we are counting covariates for
|
||||
* @return
|
||||
*/
|
||||
public Set<String> getReadGroups() {
|
||||
return data.keySet();
|
||||
}
|
||||
|
||||
public boolean isCollapseDinuc() {
|
||||
return collapseDinuc;
|
||||
}
|
||||
|
||||
public boolean isCollapsePos() {
|
||||
return collapsePos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of read groups being managed
|
||||
* @return
|
||||
*/
|
||||
public int getNReadGroups() {
|
||||
return data.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the particular RecalData datum associated with readGroup, at machine pos, with reported
|
||||
* quality qual, and with the dinuc context of prevBase, base. If an example of such a
|
||||
* base has been seen before, returns the associated RecalData. If not, it creates one, places it in the
|
||||
* system so that subsequent requests will return that object, and returns it.
|
||||
*
|
||||
* @param readGroup
|
||||
* @param pos
|
||||
* @param qual
|
||||
* @param prevBase
|
||||
* @param base
|
||||
* @return
|
||||
*/
|
||||
public RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
|
||||
if ( ! data.containsKey(readGroup) ) {
|
||||
if ( assumeFaultyHeader ) {
|
||||
logger.info(String.format("Found unexpected read group, but assuming the header was bad, so extending covariates with read group %s", readGroup));
|
||||
RecalDataManager manager = makeManager(readGroup);
|
||||
data.put(readGroup, manager);
|
||||
} else {
|
||||
throw new RuntimeException(String.format("Unexpected read group %s found, there's something wrong with your BAM file's header", readGroup));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
byte[] cs = {(byte)prevBase, (byte)base};
|
||||
String s = new String(cs);
|
||||
return data.get(readGroup).expandingGetRecalData(pos, qual, s, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a list of all of the RecalData associated with readGroup
|
||||
*
|
||||
* @param readGroup
|
||||
* @return
|
||||
*/
|
||||
public List<RecalData> getRecalData(String readGroup) {
|
||||
return data.get(readGroup).getAll();
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates the recalibration data for the base at offset in the read, associated with readGroup rg.
|
||||
* Correctly handles machine orientation of the read. I.e., it adds data not by offset in the read
|
||||
* but by implied machine cycle associated with the offset.
|
||||
*
|
||||
* TODO: this whole system is 0-based and therefore inconsisent with the rest of the GATK, where pos is 1-based
|
||||
* TODO: and offset is 0-based. How very annoying.
|
||||
*
|
||||
* @param rg
|
||||
* @param read
|
||||
* @param offset
|
||||
* @param ref
|
||||
* @return
|
||||
*/
|
||||
public int updateDataFromRead( String rg, SAMRecord read, int offset, char ref, boolean useOriginalQuals ) {
|
||||
if ( offset == 0 )
|
||||
throw new RuntimeException("Illegal read offset " + offset + " in read " + read.getReadName());
|
||||
|
||||
int cycle = offset;
|
||||
byte[] bases = read.getReadBases();
|
||||
byte[] quals = RecalDataManager.getQualsForRecalibration(read, useOriginalQuals);
|
||||
|
||||
char base = (char)bases[offset];
|
||||
char prevBase = (char)bases[offset - 1];
|
||||
|
||||
if (read.getReadNegativeStrandFlag()) {
|
||||
ref = BaseUtils.simpleComplement(ref);
|
||||
base = BaseUtils.simpleComplement(base);
|
||||
prevBase = BaseUtils.simpleComplement((char)bases[offset+1]);
|
||||
cycle = read.getReadLength() - (offset + 1);
|
||||
}
|
||||
|
||||
int qual = quals[offset];
|
||||
if ( qual > 0 ) {
|
||||
RecalData datum = getRecalData(rg, cycle, qual, prevBase, base);
|
||||
if (datum != null) datum.inc(base,ref);
|
||||
return 1;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
public void printState() {
|
||||
for ( String readGroup : getReadGroups() ) {
|
||||
for ( RecalData datum : getRecalData(readGroup) ) {
|
||||
if ( datum.N > 0 )
|
||||
System.out.println(datum);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,292 +1,485 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RODRecordList;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.PrintStream;
|
||||
import java.io.FileNotFoundException;
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
@WalkerName("CountCovariates")
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 3, 2009
|
||||
*
|
||||
* This walker is designed to work as the first pass in a two-pass processing step.
|
||||
* It does a by-locus traversal operating only at sites that are not in dbSNP.
|
||||
* We assume that all mismatches we see are therefore errors.
|
||||
* This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinuc)
|
||||
* Since there is a large amount of data one can then calculate an empirical probability of error
|
||||
* given the particular covariates seen at this site, where p(error) = num mismatches / num observations
|
||||
* The output file is a CSV list of (several covariate values, num observations, num mismatches, empirical quality score)
|
||||
* The first lines of the output file give the name of the covariate classes that were used for this calculation.
|
||||
*
|
||||
* Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and must be at the start of the list.
|
||||
* Note: This walker is designed to be used in conjunction with TableRecalibrationWalker.
|
||||
*/
|
||||
|
||||
@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file
|
||||
@WalkerName( "CountCovariates" )
|
||||
//@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality // BUGBUG taken out to match old integration tests
|
||||
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
|
||||
public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||
@Argument(fullName="buggyMaxReadLen", doc="If we see a read longer than this, we assume there's a bug and abort", required=false)
|
||||
public int buggyMaxReadLen = 100000;
|
||||
|
||||
@Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, doc="Depreciated output file root -- now use --params to directly specify the file output name")
|
||||
public String OUTPUT_FILEROOT = null; // going to blow up if specified
|
||||
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
|
||||
private Boolean LIST_ONLY = false;
|
||||
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false)
|
||||
private String[] COVARIATES = null;
|
||||
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
|
||||
private boolean USE_ORIGINAL_QUALS = false;
|
||||
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
|
||||
private int WINDOW_SIZE = 3;
|
||||
@Argument(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the outputted covariates table recalibration file")
|
||||
private String RECAL_FILE = "output.recal_data.csv";
|
||||
@Argument(fullName="no_print_header", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file. For debugging purposes only.")
|
||||
private boolean NO_PRINT_HEADER = false;
|
||||
@Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. For debugging purposes only.")
|
||||
private boolean VALIDATE_OLD_RECALIBRATOR = false;
|
||||
@Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.")
|
||||
private boolean USE_SLX_PLATFORM = false;
|
||||
|
||||
@Argument(fullName="params", shortName="params", required=false, doc="Filename root for the outputted logistic regression training files")
|
||||
public String params = "output.recal_data.csv";
|
||||
|
||||
@Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score")
|
||||
public int MIN_MAPPING_QUALITY = 1;
|
||||
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
private ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
|
||||
private IdentityHashMap<SAMRecord, ReadHashDatum> readDatumHashMap; // A hash map that hashes the read object itself into properties commonly pulled out of the read. Done for optimization purposes.
|
||||
private int sizeOfReadDatumHashMap = 0;
|
||||
|
||||
@Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = * for all platforms)")
|
||||
public List<String> platforms = Collections.singletonList("*");
|
||||
//public List<String> platforms = Collections.singletonList("ILLUMINA");
|
||||
private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
|
||||
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
|
||||
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
|
||||
|
||||
@Argument(fullName="assumeFaultyHeader", required=false, doc="")
|
||||
public boolean assumeFaultyHeader = false;
|
||||
private final String versionNumber = "2.0.0"; // major version, minor version, and build number
|
||||
|
||||
//@Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="")
|
||||
public boolean collapsePos = false;
|
||||
|
||||
//@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
|
||||
public boolean collapseDinuc = false;
|
||||
|
||||
@Argument(fullName = "useOriginalQuals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
|
||||
public boolean useOriginalQuals = false;
|
||||
|
||||
|
||||
private CovariateCounter covariateCounter = null;
|
||||
|
||||
private long counted_sites = 0; // number of sites used to count covariates
|
||||
private long counted_bases = 0; // number of bases used to count covariates
|
||||
private long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
|
||||
|
||||
// THIS IS A HACK required in order to reproduce the behavior of old (and imperfect) RODIterator and
|
||||
// hence to pass the integration test. The new iterator this code is now using does see ALL the SNPs,
|
||||
// whether masked by overlapping indels/other events or not.
|
||||
//TODO process correctly all the returned dbSNP rods at each location
|
||||
|
||||
|
||||
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0l, 0l); // mismatch/base counts for dbSNP loci
|
||||
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0l, 0l); // mismatch/base counts for non-dbSNP loci
|
||||
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
||||
private static final int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen)
|
||||
private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Initialize the system. Setup the data CovariateCountry for the read groups in our header
|
||||
* Parse the -cov arguments and create a list of covariates to be used here
|
||||
* Based on the covariates' estimates for initial capacity allocate the data hashmap
|
||||
*/
|
||||
public void initialize() {
|
||||
Set<String> readGroups = new HashSet<String>();
|
||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getSAMFileHeader().getReadGroups()) {
|
||||
if( readGroup.getAttribute("PL") == null )
|
||||
Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are supported",readGroup.getReadGroupId()));
|
||||
if( !isSupportedReadGroup(readGroup) )
|
||||
continue;
|
||||
readGroups.add(readGroup.getReadGroupId());
|
||||
|
||||
logger.info( "CovariateCounterWalker version: " + versionNumber );
|
||||
|
||||
// Get a list of all available covariates
|
||||
final List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);
|
||||
|
||||
// Print and exit if that's what was requested
|
||||
if ( LIST_ONLY ) {
|
||||
out.println( "Available covariates:" );
|
||||
for( Class<?> covClass : classes ) {
|
||||
out.println( covClass.getSimpleName() );
|
||||
}
|
||||
out.println();
|
||||
|
||||
System.exit( 0 ); // early exit here because user requested it
|
||||
}
|
||||
|
||||
// Warn the user if no dbSNP file was specified
|
||||
if( this.getToolkit().getArguments().DBSNPFile == null ) {
|
||||
boolean foundDBSNP = false;
|
||||
for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
|
||||
if( rod.getName().equalsIgnoreCase( "dbsnp" ) ) {
|
||||
foundDBSNP = true;
|
||||
}
|
||||
}
|
||||
if( !foundDBSNP ) {
|
||||
Utils.warnUser("This calculation is critically dependent on being able to skip over known variant sites. Are you sure you want to be running without a dbSNP rod specified?");
|
||||
}
|
||||
|
||||
covariateCounter = new CovariateCounter(readGroups, collapsePos, collapseDinuc, assumeFaultyHeader);
|
||||
logger.info(String.format("Created recalibration data collectors for %d read group(s)", covariateCounter.getNReadGroups()));
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
// Initialize the requested covariates by parsing the -cov argument
|
||||
requestedCovariates = new ArrayList<Covariate>();
|
||||
int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
requestedCovariates.add( new ReadGroupCovariate() );
|
||||
requestedCovariates.add( new CycleCovariate() ); // unfortunately order is different here in order to match the old recalibrator exactly
|
||||
requestedCovariates.add( new QualityScoreCovariate() );
|
||||
requestedCovariates.add( new DinucCovariate() );
|
||||
} else if( COVARIATES != null ) {
|
||||
if(COVARIATES[0].equalsIgnoreCase("ALL")) { // the user wants ALL covariates to be used
|
||||
requestedCovariates.add( new ReadGroupCovariate() ); // first add the required covariates then add the rest by looping over all implementing classes that were found
|
||||
requestedCovariates.add( new QualityScoreCovariate() );
|
||||
for( Class<?> covClass : classes ) {
|
||||
try {
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
estimatedCapacity *= covariate.estimatedNumberOfBins();
|
||||
// Some covariates need parameters (user supplied command line arguments) passed to them
|
||||
if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
|
||||
if( !( covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate ) ) { // these were already added so don't add them again
|
||||
requestedCovariates.add( covariate );
|
||||
}
|
||||
} catch ( InstantiationException e ) {
|
||||
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
|
||||
} catch ( IllegalAccessException e ) {
|
||||
throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
|
||||
}
|
||||
}
|
||||
} else { // The user has specified a list of several covariates
|
||||
int covNumber = 1;
|
||||
for( String requestedCovariateString : COVARIATES ) {
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : classes ) {
|
||||
if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class
|
||||
foundClass = true;
|
||||
// Read Group Covariate and Quality Score Covariate are required covariates for the recalibration calculation and must begin the list
|
||||
if( (covNumber == 1 && !requestedCovariateString.equalsIgnoreCase( "ReadGroupCovariate" )) ||
|
||||
(covNumber == 2 && !requestedCovariateString.equalsIgnoreCase( "QualityScoreCovariate" )) ) {
|
||||
throw new StingException("ReadGroupCovariate and QualityScoreCovariate are required covariates for the recalibration calculation and must begin the list" );
|
||||
}
|
||||
covNumber++;
|
||||
try {
|
||||
// Now that we've found a matching class, try to instantiate it
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
estimatedCapacity *= covariate.estimatedNumberOfBins();
|
||||
// Some covariates need parameters (user supplied command line arguments) passed to them
|
||||
if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
|
||||
requestedCovariates.add( covariate );
|
||||
} catch ( InstantiationException e ) {
|
||||
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
|
||||
} catch ( IllegalAccessException e ) {
|
||||
throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Walk over each read in the locus pileup and update the covariate counts based on these bases and their
|
||||
* matching (or not) with ref. dbSNP aware, so avoids sites that are known as SNPs in DBSNP.
|
||||
*
|
||||
* @param tracker
|
||||
* @param ref
|
||||
* @param context
|
||||
* @return
|
||||
*/
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
||||
rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
|
||||
|
||||
// long testpos = 10410913 ;
|
||||
// if ( ref.getLocus().getStart() == testpos ) {
|
||||
// System.out.println(rods.size()+" rods:");
|
||||
// for ( ReferenceOrderedDatum d : rods.getRecords() ) System.out.println((((rodDbSNP)d).isSNP()?"SNP":"non-SNP")+" at " + d.getLocation());
|
||||
// System.exit(1);
|
||||
// }
|
||||
|
||||
if ( dbsnp == null ) {
|
||||
// We aren't at a dbSNP position that's a SNP, so update the read
|
||||
|
||||
// if ( ref.getLocus().getStart() == testpos) System.out.println("NOT A SNP INDEED");
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
for (int i =0; i < reads.size(); i++ ) {
|
||||
SAMRecord read = reads.get(i);
|
||||
|
||||
if ( read.getReadLength() > buggyMaxReadLen ) {
|
||||
throw new RuntimeException("Expectedly long read, please increase maxium read len with maxReadLen parameter: " + read.format());
|
||||
}
|
||||
|
||||
SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
|
||||
if ( readGroup == null ) {
|
||||
throw new RuntimeException("No read group annotation found for read " + read.format());
|
||||
}
|
||||
|
||||
if ((read.getMappingQuality() >= MIN_MAPPING_QUALITY && isSupportedReadGroup(readGroup) )) {
|
||||
int offset = offsets.get(i);
|
||||
if ( offset > 0 && offset < (read.getReadLength() - 1) ) { // skip first and last bases because they suck and they don't have a dinuc count
|
||||
counted_bases += covariateCounter.updateDataFromRead(readGroup.getReadGroupId(), read, offset, ref.getBase(), useOriginalQuals);
|
||||
if( !foundClass ) {
|
||||
throw new StingException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." );
|
||||
}
|
||||
}
|
||||
}
|
||||
counted_sites += 1;
|
||||
updateMismatchCounts(novel_counts, context, ref.getBase());
|
||||
} else {
|
||||
// if ( ref.getLocus().getStart() == testpos) System.out.println("TREATED AS A SNP");
|
||||
// out.println(ref.getLocus()+" SNP at "+dbsnp.getLocation() );
|
||||
updateMismatchCounts(dbSNP_counts, context, ref.getBase());
|
||||
skipped_sites += 1;
|
||||
} else { // No covariates were specified by the user so add the default, required ones
|
||||
Utils.warnUser( "Using default set of covariates because none were specified. Using ReadGroupCovariate and QualityScoreCovariate only." );
|
||||
requestedCovariates.add( new ReadGroupCovariate() );
|
||||
requestedCovariates.add( new QualityScoreCovariate() );
|
||||
estimatedCapacity = 300 * 40;
|
||||
}
|
||||
|
||||
if ( ++lociSinceLastDbsnpCheck == DBSNP_VALIDATION_CHECK_FREQUENCY ) {
|
||||
lociSinceLastDbsnpCheck = 0;
|
||||
validateDbsnpMismatchRate();
|
||||
}
|
||||
logger.info( "The covariates being used here: " );
|
||||
logger.info( requestedCovariates );
|
||||
|
||||
return 1;
|
||||
if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception
|
||||
dataManager = new RecalDataManager( estimatedCapacity );
|
||||
readDatumHashMap = new IdentityHashMap<SAMRecord, ReadHashDatum>();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Update the mismatch / total_base counts for a given class of loci.
|
||||
*
|
||||
* @param counts
|
||||
* @param context
|
||||
* @return
|
||||
*/
|
||||
private static void updateMismatchCounts(Pair<Long, Long> counts, AlignmentContext context, char ref) {
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
for (int i =0; i < reads.size(); i++ ) {
|
||||
char readChar = reads.get(i).getReadString().charAt(offsets.get(i));
|
||||
int readCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(readChar);
|
||||
int refCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref);
|
||||
|
||||
if ( readCharBaseIndex != -1 && refCharBaseIndex != -1 ) {
|
||||
if ( readCharBaseIndex != refCharBaseIndex )
|
||||
counts.first++;
|
||||
counts.second++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate the dbSNP mismatch rates.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
private void validateDbsnpMismatchRate() {
|
||||
if ( novel_counts.second == 0 || dbSNP_counts.second == 0 )
|
||||
return;
|
||||
|
||||
double fractionMM_novel = (double)novel_counts.first / (double)novel_counts.second;
|
||||
double fractionMM_dbsnp = (double)dbSNP_counts.first / (double)dbSNP_counts.second;
|
||||
//System.out.println(String.format("dbSNP rate = %.2f, novel rate = %.2f", fractionMM_dbsnp, fractionMM_novel));
|
||||
|
||||
if ( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel )
|
||||
Utils.warnUser("The variation rate of dbSNP sites seems suspicious! Please double-check that the correct DBSNP ROD is being used.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Check to see whether this read group should be processed. Returns true if the
|
||||
* read group is in the list of platforms to process or the platform == *, indicating
|
||||
* that all platforms should be processed.
|
||||
*
|
||||
* @param readGroup
|
||||
* @return
|
||||
*/
|
||||
private boolean isSupportedReadGroup( SAMReadGroupRecord readGroup ) {
|
||||
for( String platform: platforms ) {
|
||||
platform = platform.trim();
|
||||
if( platform.equals("*") || readGroup == null ||
|
||||
readGroup.getAttribute("PL") == null ||
|
||||
readGroup.getAttribute("PL").toString().equalsIgnoreCase(platform) )
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Reduce
|
||||
// map
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Provide an initial value for reduce computations.
|
||||
* @return Initial value of reduce.
|
||||
* For each read at this locus get the various covariate values and increment that location in the map based on
|
||||
* whether or not the base matches the reference at this particular location
|
||||
* @param tracker The reference metadata tracker
|
||||
* @param ref The reference context
|
||||
* @param context The alignment context
|
||||
* @return Returns 1, but this value isn't used in the reduce step
|
||||
*/
|
||||
public PrintStream reduceInit() {
|
||||
try {
|
||||
if ( OUTPUT_FILEROOT != null )
|
||||
throw new RuntimeException("OUTPUT_FILEROOT argument has been removed, please use --params from now on to directly specify the output parameter filename");
|
||||
return new PrintStream( params );
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new RuntimeException("Couldn't open output file", e);
|
||||
}
|
||||
}
|
||||
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
public void onTraversalDone(PrintStream recalTableStream) {
|
||||
out.printf("Writing raw recalibration data...");
|
||||
writeRecalTable(recalTableStream);
|
||||
out.printf("...done%n");
|
||||
|
||||
//out.printf("Writing logistic recalibration data%n");
|
||||
//writeLogisticRecalibrationTable();
|
||||
//out.printf("...done%n");
|
||||
|
||||
recalTableStream.close();
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Prints some basic information about the CountCovariates run to the output stream out
|
||||
* @param out
|
||||
*/
|
||||
private void printInfo(PrintStream out) {
|
||||
//out.printf("# date \"%s\"%n", new Date());
|
||||
out.printf("# collapsed_pos %b%n", collapsePos);
|
||||
out.printf("# collapsed_dinuc %b%n", collapseDinuc);
|
||||
out.printf("# counted_sites %d%n", counted_sites);
|
||||
out.printf("# counted_bases %d%n", counted_bases);
|
||||
out.printf("# skipped_sites %d%n", skipped_sites);
|
||||
out.printf("# fraction_skipped 1 / %.0f bp%n", (double)counted_sites / skipped_sites);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes out the key recalibration data collected from the reads. Dumps this recalibration data
|
||||
* as a CVS string to the recalTableOut PrintStream. Emits the data for all read groups into this file.
|
||||
*/
|
||||
private void writeRecalTable(PrintStream recalTableStream) {
|
||||
printInfo(recalTableStream);
|
||||
|
||||
recalTableStream.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp");
|
||||
for (String readGroup : new TreeSet<String>(covariateCounter.getReadGroups()) ) {
|
||||
for ( RecalData datum: RecalData.sort(covariateCounter.getRecalData(readGroup)) ) {
|
||||
if ( datum.N > 0 ) {
|
||||
recalTableStream.println(datum.toCSVString(collapsePos));
|
||||
// pull out anything passed by -B name,type,file that has the name "dbsnp"
|
||||
final RODRecordList<ReferenceOrderedDatum> dbsnpRODs = tracker.getTrackData( "dbsnp", null );
|
||||
boolean isSNP = false;
|
||||
if (dbsnpRODs != null) {
|
||||
for( ReferenceOrderedDatum rod : dbsnpRODs ) {
|
||||
if( ((Variation)rod).isSNP() ) {
|
||||
isSNP = true; // at least one of the rods says this is a snp site
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Only use data from non-dbsnp sites
|
||||
// Assume every mismatch at a non-dbsnp site is indicitive of poor quality
|
||||
if( !isSNP ) {
|
||||
final List<SAMRecord> reads = context.getReads();
|
||||
final List<Integer> offsets = context.getOffsets();
|
||||
SAMRecord read;
|
||||
int offset;
|
||||
String readGroupId;
|
||||
byte[] quals;
|
||||
byte[] bases;
|
||||
byte refBase;
|
||||
byte prevBase;
|
||||
String platform;
|
||||
byte[] colorSpaceQuals;
|
||||
ReadHashDatum readDatum;
|
||||
boolean isNegStrand;
|
||||
int mappingQuality;
|
||||
int length;
|
||||
|
||||
final int numReads = reads.size();
|
||||
// For each read at this locus
|
||||
for( int iii = 0; iii < numReads; iii++ ) {
|
||||
read = reads.get(iii);
|
||||
offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct
|
||||
|
||||
readDatum = readDatumHashMap.get( read );
|
||||
if( readDatum == null ) {
|
||||
|
||||
// If the HashMap of read objects has grown too large then throw out the (mostly stale) reads
|
||||
if( sizeOfReadDatumHashMap > 100000 ) { //BUGBUG: Can I make this number larger?
|
||||
readDatumHashMap.clear();
|
||||
sizeOfReadDatumHashMap = 0;
|
||||
}
|
||||
|
||||
// This read isn't in the hashMap yet so fill out the datum and add it to the map so that we never have to do the work again
|
||||
quals = read.getBaseQualities();
|
||||
// Check if we need to use the original quality scores instead
|
||||
if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
|
||||
if ( obj instanceof String )
|
||||
quals = QualityUtils.fastqToPhred((String)obj);
|
||||
else {
|
||||
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
}
|
||||
bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'. Is this true?
|
||||
isNegStrand = read.getReadNegativeStrandFlag();
|
||||
final SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
readGroupId = readGroup.getReadGroupId();
|
||||
platform = readGroup.getPlatform();
|
||||
mappingQuality = read.getMappingQuality();
|
||||
length = bases.length;
|
||||
if( USE_SLX_PLATFORM ) {
|
||||
platform = "ILLUMINA";
|
||||
}
|
||||
|
||||
readDatum = new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length );
|
||||
readDatumHashMap.put( read, readDatum );
|
||||
sizeOfReadDatumHashMap++;
|
||||
}
|
||||
|
||||
if( readDatum.mappingQuality > 0 ) { // BUGBUG: turn this into a read filter after passing the old integration tests
|
||||
|
||||
// skip first and last base because there is no dinuc
|
||||
// BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests.
|
||||
if( offset > 0 ) {
|
||||
if( offset < readDatum.length - 1 ) {
|
||||
// skip if base quality is zero
|
||||
if( readDatum.quals[offset] > 0 ) {
|
||||
|
||||
refBase = (byte)ref.getBase();
|
||||
prevBase = readDatum.bases[offset-1];
|
||||
|
||||
// Get the complement base strand if we are a negative strand read
|
||||
if( readDatum.isNegStrand ) {
|
||||
prevBase = readDatum.bases[offset+1];
|
||||
}
|
||||
|
||||
// skip if this base or the previous one was an 'N' or etc.
|
||||
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(readDatum.bases[offset]) ) ) {
|
||||
|
||||
// SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them
|
||||
colorSpaceQuals = null;
|
||||
if( readDatum.platform.equalsIgnoreCase("SOLID") ) {
|
||||
colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
|
||||
}
|
||||
if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet
|
||||
{
|
||||
// This base finally passed all the checks, so add it to the big hashmap
|
||||
updateDataFromRead( readDatum, offset, refBase );
|
||||
}
|
||||
} else {
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
countedBases++; // replicating a small bug in the old recalibrator
|
||||
}
|
||||
}
|
||||
}
|
||||
} else { // at the last base in the read so we can remove it from our IdentityHashMap
|
||||
readDatumHashMap.remove( read );
|
||||
sizeOfReadDatumHashMap--;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
countedSites++;
|
||||
|
||||
} else { // We skipped over the dbSNP site
|
||||
skippedSites++;
|
||||
}
|
||||
|
||||
return 1; // This value isn't actually used anywhere
|
||||
}
|
||||
|
||||
/**
|
||||
* Doesn't do anything
|
||||
*
|
||||
* @param empty
|
||||
* @param recalTableStream
|
||||
* @return
|
||||
* Major workhorse routine for this walker.
|
||||
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||
* Using the list of covariate values as a key, pick out the RecalDatum and increment,
|
||||
* adding one to the number of observations and potentially one to the number of mismatches
|
||||
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
|
||||
* because pulling things out of the SAMRecord is an expensive operation.
|
||||
* @param readDatum The ReadHashDatum holding all the important properties of this read
|
||||
* @param offset The offset in the read for this locus
|
||||
* @param refBase The reference base at this locus
|
||||
*/
|
||||
public PrintStream reduce(Integer empty, PrintStream recalTableStream) {
|
||||
return recalTableStream;
|
||||
private void updateDataFromRead(final ReadHashDatum readDatum, final int offset, final byte refBase) {
|
||||
|
||||
List<Comparable> key = new ArrayList<Comparable>();
|
||||
|
||||
// Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||
for( Covariate covariate : requestedCovariates ) {
|
||||
key.add( covariate.getValue( readDatum, offset ) );
|
||||
}
|
||||
|
||||
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
||||
RecalDatum datum = dataManager.data.get( key );
|
||||
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
|
||||
datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
dataManager.data.myPut( key, datum );
|
||||
} else {
|
||||
dataManager.data.put( key, datum );
|
||||
}
|
||||
}
|
||||
|
||||
// Need the bases to determine whether or not we have a mismatch
|
||||
byte base = readDatum.bases[offset];
|
||||
|
||||
// Add one to the number of observations and potentially one to the number of mismatches
|
||||
datum.increment( (char)base, (char)refBase ); // dangerous: if you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad
|
||||
countedBases++;
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Initialize the reudce step by creating a PrintStream from the filename specified as an argument to the walker.
|
||||
* @return returns A PrintStream created from the -rf filename
|
||||
*/
|
||||
public PrintStream reduceInit() {
|
||||
try {
|
||||
return new PrintStream( RECAL_FILE );
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new RuntimeException( "Couldn't open output file: ", e );
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The Reduce method doesn't do anything for this walker.
|
||||
* @param value Result of the map. This value is immediately ignored.
|
||||
* @param recalTableStream The PrintStream used to output the CSV data
|
||||
* @return returns The PrintStream used to output the CSV data
|
||||
*/
|
||||
public PrintStream reduce( Integer value, PrintStream recalTableStream ) {
|
||||
return recalTableStream; // nothing to do here
|
||||
}
|
||||
|
||||
/**
|
||||
* Write out the full data hashmap to disk in CSV format
|
||||
* @param recalTableStream The PrintStream to write out to
|
||||
*/
|
||||
public void onTraversalDone( PrintStream recalTableStream ) {
|
||||
logger.info( "Writing raw recalibration data..." );
|
||||
outputToCSV( recalTableStream );
|
||||
logger.info( "...done!" );
|
||||
|
||||
recalTableStream.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format
|
||||
* @param recalTableStream The PrintStream to write out to
|
||||
*/
|
||||
private void outputToCSV( final PrintStream recalTableStream ) {
|
||||
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
recalTableStream.printf("# collapsed_pos false%n");
|
||||
recalTableStream.printf("# collapsed_dinuc false%n");
|
||||
recalTableStream.printf("# counted_sites %d%n", countedSites);
|
||||
recalTableStream.printf("# counted_bases %d%n", countedBases);
|
||||
recalTableStream.printf("# skipped_sites %d%n", skippedSites);
|
||||
recalTableStream.printf("# fraction_skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
|
||||
recalTableStream.printf("rg,pos,Qrep,dn,nBases,nMismatches,Qemp%n");
|
||||
// For each entry in the data hashmap
|
||||
for( Pair<List<? extends Comparable>,RecalDatum> entry : dataManager.data.entrySetSorted4() ) {
|
||||
// For each Covariate in the key
|
||||
for( Comparable comp : entry.first ) {
|
||||
// Output the Covariate's value
|
||||
recalTableStream.print( comp + "," );
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
recalTableStream.println( entry.second.outputToCSV() );
|
||||
}
|
||||
} else {
|
||||
if( !NO_PRINT_HEADER ) {
|
||||
recalTableStream.printf("# Counted Sites %d%n", countedSites);
|
||||
recalTableStream.printf("# Counted Bases %d%n", countedBases);
|
||||
recalTableStream.printf("# Skipped Sites %d%n", skippedSites);
|
||||
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
// The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name
|
||||
recalTableStream.println( "@!" + cov.getClass().getSimpleName() );
|
||||
}
|
||||
}
|
||||
// For each entry in the data hashmap
|
||||
for( Map.Entry<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySet() ) {
|
||||
// For each Covariate in the key
|
||||
for( Comparable comp : entry.getKey() ) {
|
||||
// Output the Covariate's value
|
||||
if( NO_PRINT_HEADER && comp instanceof String ) { continue; } // BUGBUG
|
||||
recalTableStream.print( comp + "," );
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
recalTableStream.println( entry.getValue().outputToCSV() );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -1,230 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Collections;
|
||||
|
||||
public class RecalData implements Comparable<RecalData> {
|
||||
//
|
||||
// context of this data -- read group, position in read, reported quality, and dinuc
|
||||
//
|
||||
String readGroup; // the read group where the datum was observed
|
||||
int pos; // the position of this recalibration datum
|
||||
int qual; // the reported quality scoree of this datum
|
||||
String dinuc; // the dinucleotide context of this datum
|
||||
|
||||
//
|
||||
// empirical quality score data
|
||||
//
|
||||
long N; // total number of observed bases
|
||||
long B; // number of observed mismatches to the reference base
|
||||
|
||||
|
||||
public RecalData(int pos, int qual, String readGroup, String dinuc ) {
|
||||
this.pos = pos;
|
||||
this.qual = qual;
|
||||
this.readGroup = readGroup;
|
||||
this.dinuc = dinuc;
|
||||
}
|
||||
|
||||
public RecalData( RecalData copy ) {
|
||||
this.pos = copy.pos;
|
||||
this.qual = copy.qual;
|
||||
this.readGroup = copy.readGroup;
|
||||
this.dinuc = copy.dinuc;
|
||||
this.N = copy.N;
|
||||
this.B = copy.B;
|
||||
}
|
||||
|
||||
public int compareTo(RecalData to) {
|
||||
int cmpReadGroup = readGroup.compareTo(to.readGroup);
|
||||
if (cmpReadGroup != 0) return cmpReadGroup;
|
||||
|
||||
int cmpPos = new Integer(pos).compareTo(to.pos);
|
||||
if (cmpPos != 0) return cmpPos;
|
||||
|
||||
int cmpQual = new Integer(qual).compareTo(to.qual);
|
||||
if (cmpQual != 0) return cmpQual;
|
||||
|
||||
return dinuc.compareTo(to.dinuc);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the expected number of mismatching bases for this datum, using the number of bases N and
|
||||
* the reported qual score. Effectively qual implied error rate * number of bases.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double getExpectedErrors() {
|
||||
return QualityUtils.qualToErrorProb((byte)qual) * N;
|
||||
}
|
||||
|
||||
//
|
||||
// Increment routines
|
||||
//
|
||||
public RecalData inc(long incN, long incB) {
|
||||
N += incN;
|
||||
B += incB;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add N and B from other to self
|
||||
*
|
||||
* @param other
|
||||
* @return
|
||||
*/
|
||||
public RecalData inc(final RecalData other) {
|
||||
return inc(other.N, other.B);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add all of the Ns and Bs for data to self
|
||||
*
|
||||
* @param data
|
||||
* @return
|
||||
*/
|
||||
public RecalData inc(List<RecalData> data) {
|
||||
for ( RecalData other : data ) {
|
||||
this.inc(other);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Count an empirical observation of our current base against the reference base.
|
||||
* Increments N and, if curBase != ref, also increments B
|
||||
*
|
||||
* @param curBase
|
||||
* @param ref
|
||||
* @return
|
||||
*/
|
||||
public RecalData inc(char curBase, char ref) {
|
||||
return inc(1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1);
|
||||
//out.printf("%s %s\n", curBase, ref);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the integer dinuc index associated with this datum. Can return -1 if dinuc
|
||||
* contains unexpected bases. Returns 0 if dinuc contains the * operator (not tracking dinucs)
|
||||
* @return
|
||||
*/
|
||||
public int getDinucIndex() {
|
||||
return dinucIndex(this.dinuc);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the empirical quality score for this datum from B and N. Bounds the result
|
||||
* by QualityUtils.MAX_REASONABLE_Q_SCORE.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double empiricalQualDouble(final boolean useRawQempirical) {
|
||||
double doubleB = useRawQempirical ? B : B + 1;
|
||||
double doubleN = useRawQempirical ? N : N + 1;
|
||||
double empiricalQual = -10 * Math.log10(doubleB / doubleN);
|
||||
if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE;
|
||||
return empiricalQual;
|
||||
}
|
||||
public double empiricalQualDouble() { return empiricalQualDouble(true); }
|
||||
|
||||
|
||||
/**
|
||||
* As as empiricalQualDouble, but rounded and encoded as a byte for placement in a read quality array of bytes
|
||||
* @return
|
||||
*/
|
||||
public byte empiricalQualByte(final boolean useRawQempirical) {
|
||||
double doubleB = useRawQempirical ? B : B + 1;
|
||||
double doubleN = useRawQempirical ? N : N + 1;
|
||||
return QualityUtils.probToQual(1.0 - doubleB / doubleN);
|
||||
}
|
||||
public byte empiricalQualByte() { return empiricalQualByte(true); }
|
||||
|
||||
|
||||
public static String headerString() {
|
||||
return ("pos, rg, dinuc, qual, emp_qual, qual_diff, n, b");
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("[rg=%s, pos=%3d, Qrep=%3d, dinuc=%s, Qemp=%2.2f / %2.2f, B=%d, N=%d]", readGroup, pos, qual, dinuc, empiricalQualDouble(), empiricalQualDouble(false), B, N);
|
||||
//return String.format("%3d,%s,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, readGroup, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);
|
||||
}
|
||||
|
||||
//
|
||||
// To and from CSV strings
|
||||
//
|
||||
public String toCSVString(boolean collapsedPos) {
|
||||
// rg,pos,Qrep,dn,nBases,nMismatches,Qemp
|
||||
return String.format("%s,%s,%d,%s,%d,%d,%d", readGroup, collapsedPos ? "*" : pos, qual, dinuc, N, B, empiricalQualByte());
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the string s and returns the encoded RecalData in it. Assumes data has been emitted by
|
||||
* toCSVString() routine. Order is readGroup, dinuc, qual, pos, N, B, Qemp as byte
|
||||
* @param s
|
||||
* @return
|
||||
*/
|
||||
public static RecalData fromCSVString(String s) throws NumberFormatException {
|
||||
String[] vals = s.split(",");
|
||||
String rg = vals[0];
|
||||
int pos = vals[1].equals("*") ? 0 : Integer.parseInt(vals[1]);
|
||||
int qual = Integer.parseInt(vals[2]);
|
||||
String dinuc = vals[3];
|
||||
int N = Integer.parseInt(vals[4]);
|
||||
int B = Integer.parseInt(vals[5]);
|
||||
RecalData datum = new RecalData(pos, qual, rg, dinuc);
|
||||
datum.B = B;
|
||||
datum.N = N;
|
||||
|
||||
// Checking for badness
|
||||
if ( pos < 0 ) throw new NumberFormatException("Illegal position detected: " + pos);
|
||||
if ( B < 0 ) throw new NumberFormatException("Illegal mismatch count detected: " + B);
|
||||
if ( N < 0 ) throw new NumberFormatException("Illegal base count detected: " + N);
|
||||
if ( qual < 0 || qual > QualityUtils.MAX_QUAL_SCORE ) throw new NumberFormatException("Illegal qual detected: " + qual);
|
||||
|
||||
return datum;
|
||||
}
|
||||
|
||||
|
||||
public static List<RecalData> sort(List<RecalData> l) {
|
||||
Collections.sort(l);
|
||||
return l;
|
||||
}
|
||||
|
||||
public static int bases2dinucIndex(char prevBase, char base) {
|
||||
int pbI = BaseUtils.simpleBaseToBaseIndex(prevBase);
|
||||
int bI = BaseUtils.simpleBaseToBaseIndex(base);
|
||||
return (pbI == -1 || bI == -1) ? -1 : pbI * 4 + bI;
|
||||
}
|
||||
|
||||
public final static int NDINUCS = 16;
|
||||
public static String dinucIndex2bases(int index) {
|
||||
char data[] = {BaseUtils.baseIndexToSimpleBase(index / 4), BaseUtils.baseIndexToSimpleBase(index % 4)};
|
||||
return new String( data );
|
||||
}
|
||||
|
||||
public static int dinucIndex(String s) {
|
||||
return bases2dinucIndex(s.charAt(0), s.charAt(1));
|
||||
}
|
||||
|
||||
public static int dinucIndex(byte prevBase, byte base) {
|
||||
return bases2dinucIndex((char)prevBase, (char)base);
|
||||
}
|
||||
|
||||
public static double combinedQreported(List<RecalData> l) {
|
||||
double sumExpectedErrors = 0;
|
||||
long nBases = 0;
|
||||
for ( RecalData datum : l ) {
|
||||
sumExpectedErrors += datum.getExpectedErrors();
|
||||
nBases += datum.N;
|
||||
}
|
||||
|
||||
double q = QualityUtils.phredScaleErrorRate(sumExpectedErrors / nBases);
|
||||
//System.out.printf("expected errors=%f, nBases = %d, rate=%f, qual=%f%n",
|
||||
// sumExpectedErrors, nBases, 1 - sumExpectedErrors / nBases, q);
|
||||
return q;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,195 +1,204 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mdepristo
|
||||
* Date: Jun 16, 2009
|
||||
* Time: 9:55:10 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
* User: rpoplin
|
||||
* Date: Nov 6, 2009
|
||||
*
|
||||
* This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions.
|
||||
*/
|
||||
|
||||
public class RecalDataManager {
|
||||
/** The original quality scores are stored in this attribute */
|
||||
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ";
|
||||
public NHashMap<RecalDatum> data; // the full dataset
|
||||
private NHashMap<RecalDatum> dataCollapsedReadGroup; // table where everything except read group has been collapsed
|
||||
private NHashMap<RecalDatum> dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed
|
||||
private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
public NHashMap<Double> dataSumExpectedErrors; // table used to calculate the overall aggregate quality score in which everything except read group is collapsed
|
||||
|
||||
ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
|
||||
boolean trackPos, trackDinuc;
|
||||
String readGroup;
|
||||
int nDinucs, maxReadLen;
|
||||
private NHashMap<Double> dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed
|
||||
private NHashMap<Double> dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed
|
||||
private ArrayList<NHashMap<Double>> dataCollapsedByCovariateDouble; // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
|
||||
|
||||
//RecalData[][][] data = null;
|
||||
ExpandingArrayList<ExpandingArrayList<ExpandingArrayList<RecalData>>> data = null;
|
||||
|
||||
public RecalDataManager(String readGroup,
|
||||
//int maxReadLen, int maxQual, int nDinucs,
|
||||
boolean trackPos, boolean trackDinuc) {
|
||||
data = new ExpandingArrayList<ExpandingArrayList<ExpandingArrayList<RecalData>>>();
|
||||
//data = new RecalData[maxReadLen+1][maxQual+1][nDinucs];
|
||||
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores
|
||||
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams
|
||||
|
||||
this.readGroup = readGroup;
|
||||
this.trackPos = trackPos;
|
||||
this.trackDinuc = trackDinuc;
|
||||
//this.maxReadLen = maxReadLen;
|
||||
//this.nDinucs = nDinucs;
|
||||
RecalDataManager() {
|
||||
data = new NHashMap<RecalDatum>();
|
||||
}
|
||||
|
||||
public int getMaxReadLen() {
|
||||
return data.size();
|
||||
}
|
||||
|
||||
public int getPosIndex(int pos) {
|
||||
return trackPos ? pos : 0;
|
||||
}
|
||||
|
||||
public int canonicalPos(int cycle) {
|
||||
return getPosIndex(cycle);
|
||||
}
|
||||
|
||||
|
||||
public int getDinucIndex(String dinuc) {
|
||||
return trackDinuc ? RecalData.dinucIndex(dinuc) : 0;
|
||||
}
|
||||
|
||||
public int getDinucIndex(byte prevBase, byte base) {
|
||||
return trackDinuc ? RecalData.dinucIndex(prevBase, base) : 0;
|
||||
}
|
||||
|
||||
public String canonicalDinuc(String dinuc) {
|
||||
return trackDinuc ? dinuc : "**";
|
||||
}
|
||||
|
||||
public void addDatum(RecalData datum) {
|
||||
if ( ! datum.readGroup.equals(this.readGroup) ) {
|
||||
throw new RuntimeException(String.format("BUG: adding incorrect read group datum %s to RecalDataManager for %s", datum.readGroup, this.readGroup));
|
||||
}
|
||||
|
||||
RecalData prev = getRecalData(datum.pos, datum.qual, datum.dinuc);
|
||||
if ( prev != null ) {
|
||||
if ( trackDinuc && trackPos )
|
||||
throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.dinuc), datum));
|
||||
prev.inc(datum);
|
||||
RecalDataManager( final int estimatedCapacity, final boolean createCollapsedTables, final int numCovariates ) {
|
||||
if( createCollapsedTables ) { // initialize all the collapsed tables
|
||||
dataCollapsedReadGroup = new NHashMap<RecalDatum>();
|
||||
dataCollapsedQualityScore = new NHashMap<RecalDatum>();
|
||||
dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>();
|
||||
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
|
||||
dataCollapsedByCovariate.add( new NHashMap<RecalDatum>() );
|
||||
}
|
||||
dataSumExpectedErrors = new NHashMap<Double>();
|
||||
} else {
|
||||
int posIndex = getPosIndex(datum.pos);
|
||||
int internalDinucIndex = getDinucIndex(datum.dinuc);
|
||||
if ( internalDinucIndex == -1 ) return;
|
||||
|
||||
set(posIndex, datum.qual, internalDinucIndex, datum);
|
||||
flattenData.add(datum);
|
||||
}
|
||||
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f);
|
||||
}
|
||||
}
|
||||
|
||||
public RecalData getRecalData(int pos, int qual, String dinuc) {
|
||||
return expandingGetRecalData(pos, qual, dinuc, false);
|
||||
}
|
||||
|
||||
public RecalData expandingGetRecalData(int pos, int qual, String dinuc, boolean expandP) {
|
||||
int posIndex = getPosIndex(pos);
|
||||
int internalDinucIndex = getDinucIndex(dinuc);
|
||||
|
||||
if ( internalDinucIndex == -1 ) return null;
|
||||
|
||||
RecalData datum = get(posIndex, qual, internalDinucIndex);
|
||||
if ( datum == null && expandP ) {
|
||||
//System.out.printf("Allocating %s %d %d %d%n", readGroup, pos, qual, dinuc_index);
|
||||
datum = new RecalData(posIndex, qual, readGroup, trackDinuc ? dinuc : "**");
|
||||
set(posIndex, qual, internalDinucIndex, datum);
|
||||
flattenData.add(datum);
|
||||
}
|
||||
|
||||
return datum;
|
||||
RecalDataManager( final int estimatedCapacity ) {
|
||||
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f ); // second arg is the 'loading factor',
|
||||
// a number to monkey around with when optimizing performace of the HashMap
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the RecalData associated with pos, qual, dinuc, or null if not is bound.
|
||||
* Does not expand the system to corporate a new data point
|
||||
* @param posIndex
|
||||
* @param qual
|
||||
* @param dinucIndex
|
||||
* @return
|
||||
* Add the given mapping to all of the collapsed hash tables
|
||||
* @param key The list of comparables that is the key for this mapping
|
||||
* @param fullDatum The RecalDatum which is the data for this mapping
|
||||
*/
|
||||
private RecalData get(int posIndex, int qual, int dinucIndex) {
|
||||
ExpandingArrayList<ExpandingArrayList<RecalData>> d2 = data.get(posIndex);
|
||||
if (d2 == null) return null;
|
||||
ExpandingArrayList<RecalData> d3 = d2.get(qual);
|
||||
return d3 == null ? null : d3.get(dinucIndex);
|
||||
}
|
||||
public final void addToAllTables( final List<? extends Comparable> key, final RecalDatum fullDatum ) {
|
||||
|
||||
private void set(int posIndex, int qual, int dinucIndex, RecalData datum) {
|
||||
// grow data if necessary
|
||||
ExpandingArrayList<ExpandingArrayList<RecalData>> d2 = data.get(posIndex);
|
||||
if (d2 == null) {
|
||||
d2 = new ExpandingArrayList<ExpandingArrayList<RecalData>>();
|
||||
data.set(posIndex, d2);
|
||||
// The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
|
||||
//data.put(key, thisDatum); // add the mapping to the main table
|
||||
|
||||
// create dataCollapsedReadGroup, the table where everything except read group has been collapsed
|
||||
ArrayList<Comparable> newKey = new ArrayList<Comparable>();
|
||||
newKey.add( key.get(0) ); // make a new key with just the read group
|
||||
RecalDatum collapsedDatum = dataCollapsedReadGroup.get( newKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) );
|
||||
} else {
|
||||
collapsedDatum.increment(fullDatum);
|
||||
}
|
||||
|
||||
// Expand d2 if necessary
|
||||
ExpandingArrayList<RecalData> d3 = d2.get(qual);
|
||||
if ( d3 == null ) {
|
||||
d3 = new ExpandingArrayList<RecalData>();
|
||||
d2.set(qual, d3);
|
||||
// create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed
|
||||
newKey = new ArrayList<Comparable>();
|
||||
newKey.add( key.get(0) ); // make a new key with just the read group
|
||||
Double sumExpectedErrors = dataSumExpectedErrors.get( newKey );
|
||||
if( sumExpectedErrors == null ) {
|
||||
dataSumExpectedErrors.put( newKey, 0.0 );
|
||||
} else {
|
||||
dataSumExpectedErrors.remove( newKey );
|
||||
sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * fullDatum.getNumObservations();
|
||||
dataSumExpectedErrors.put( newKey, sumExpectedErrors );
|
||||
}
|
||||
|
||||
// set d3 to datum
|
||||
d3.set(dinucIndex, datum);
|
||||
}
|
||||
newKey = new ArrayList<Comparable>();
|
||||
// create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
|
||||
newKey.add( key.get(0) ); // make a new key with the read group ...
|
||||
newKey.add( key.get(1) ); // and quality score
|
||||
collapsedDatum = dataCollapsedQualityScore.get( newKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedQualityScore.put( newKey, new RecalDatum(fullDatum) );
|
||||
} else {
|
||||
collapsedDatum.increment(fullDatum);
|
||||
}
|
||||
|
||||
private RecalData getMatchingDatum(List<RecalData> l, RecalData datum,
|
||||
boolean combinePos, boolean combineQual, boolean combineDinuc) {
|
||||
for ( RecalData find : l ) {
|
||||
if ( (combineQual || find.qual == datum.qual) &&
|
||||
(combinePos || find.pos == datum.pos ) &&
|
||||
(combineDinuc || find.dinuc.equals(datum.dinuc)) ) {
|
||||
//System.out.printf("Found %s for %s%n", find, datum);
|
||||
return find;
|
||||
// create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { // readGroup and QualityScore aren't counted
|
||||
newKey = new ArrayList<Comparable>();
|
||||
newKey.add( key.get(0) ); // make a new key with the read group ...
|
||||
newKey.add( key.get(1) ); // and quality score ...
|
||||
newKey.add( key.get(iii + 2) ); // and the given covariate
|
||||
collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) );
|
||||
} else {
|
||||
collapsedDatum.increment(fullDatum);
|
||||
}
|
||||
}
|
||||
RecalData d = new RecalData(combinePos ? 0 : datum.pos, combineQual ? 0 : datum.qual, datum.readGroup, combineDinuc ? "**" : datum.dinuc );
|
||||
//System.out.printf("Making new match %s%n", d);
|
||||
l.add(d);
|
||||
return d;
|
||||
}
|
||||
|
||||
public List<RecalData> combineDinucs() {
|
||||
return combine(false, false, true);
|
||||
}
|
||||
/**
|
||||
 * Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score
|
||||
* that will be used in the sequential calculation in TableRecalibrationWalker
|
||||
* @param numCovariates The number of covariates you have determines the number of tables to create
|
||||
 * @param smoothing The smoothing parameter that goes into empirical quality score calculation
|
||||
*/
|
||||
public final void generateEmpiricalQualities( final int numCovariates, final int smoothing ) {
|
||||
|
||||
public List<RecalData> combineCycles() {
|
||||
return combine(true, false, false);
|
||||
}
|
||||
|
||||
public List<RecalData> combine(boolean ignorePos, boolean ignoreQual, boolean ignoreDinuc) {
|
||||
List<RecalData> l = new ArrayList<RecalData>();
|
||||
for ( RecalData datum : flattenData ) {
|
||||
RecalData reduced = getMatchingDatum(l, datum, ignorePos, ignoreQual, ignoreDinuc);
|
||||
//System.out.printf("Combining %s with %s%n", datum, reduced);
|
||||
reduced.inc(datum);
|
||||
dataCollapsedReadGroupDouble = new NHashMap<Double>();
|
||||
dataCollapsedQualityScoreDouble = new NHashMap<Double>();
|
||||
dataCollapsedByCovariateDouble = new ArrayList<NHashMap<Double>>();
|
||||
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
|
||||
dataCollapsedByCovariateDouble.add( new NHashMap<Double>() );
|
||||
}
|
||||
return l;
|
||||
}
|
||||
|
||||
public List<RecalData> getAll() {
|
||||
return flattenData;
|
||||
}
|
||||
|
||||
// we can process the OQ field if requested
|
||||
public static byte[] getQualsForRecalibration( SAMRecord read, boolean useOriginalQuals ) {
|
||||
byte[] quals = read.getBaseQualities();
|
||||
if ( useOriginalQuals && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG);
|
||||
if ( obj instanceof String )
|
||||
quals = QualityUtils.fastqToPhred((String)obj);
|
||||
else {
|
||||
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
// Hash the empirical quality scores so we don't have to call Math.log at every base for every read
|
||||
// Looping over the entrySet is really expensive but worth it
|
||||
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) {
|
||||
dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
|
||||
}
|
||||
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) {
|
||||
dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
|
||||
}
|
||||
for( int iii = 0; iii < numCovariates - 2; iii++ ) {
|
||||
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedByCovariate.get(iii).entrySet() ) {
|
||||
dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
|
||||
}
|
||||
}
|
||||
|
||||
return quals;
|
||||
dataCollapsedQualityScore.clear();
|
||||
dataCollapsedByCovariate.clear();
|
||||
dataCollapsedQualityScore = null; // will never need this again
|
||||
dataCollapsedByCovariate = null; // will never need this again
|
||||
if( data!=null ) {
|
||||
data.clear();
|
||||
data = null; // will never need this again
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the appropriate collapsed table out of the set of all the tables held by this Object
|
||||
* @param covariate Which covariate indexes the desired collapsed HashMap
|
||||
* @return The desired collapsed HashMap
|
||||
*/
|
||||
public final NHashMap<RecalDatum> getCollapsedTable( final int covariate ) {
|
||||
if( covariate == 0) {
|
||||
return dataCollapsedReadGroup; // table where everything except read group has been collapsed
|
||||
} else if( covariate == 1 ) {
|
||||
return dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed
|
||||
} else {
|
||||
return dataCollapsedByCovariate.get( covariate - 2 ); // table where everything except read group, quality score, and given covariate has been collapsed
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
 * Get the appropriate collapsed table of empirical quality out of the set of all the tables held by this Object
|
||||
* @param covariate Which covariate indexes the desired collapsed NHashMap<Double>
|
||||
* @return The desired collapsed NHashMap<Double>
|
||||
*/
|
||||
public final NHashMap<Double> getCollapsedDoubleTable( final int covariate ) {
|
||||
if( covariate == 0) {
|
||||
return dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed
|
||||
} else if( covariate == 1 ) {
|
||||
return dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed
|
||||
} else {
|
||||
return dataCollapsedByCovariateDouble.get( covariate - 2 ); // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
|
@ -1,3 +1,22 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
|
|
@ -23,311 +42,408 @@
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 3, 2009
|
||||
*
|
||||
* This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal.
|
||||
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
* For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc)
|
||||
* Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read.
|
||||
* This walker then outputs a new bam file with these updated (recalibrated) reads.
|
||||
*
|
||||
* Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker.
|
||||
* Note: This walker is designed to be used in conjunction with CovariateCounterWalker.
|
||||
*/
|
||||
|
||||
@WalkerName("TableRecalibration")
|
||||
@Requires({DataSource.READS}) // , DataSource.REFERENCE})
|
||||
@Requires({ DataSource.READS, DataSource.REFERENCE }) // This walker requires -I input.bam, it also requires -R reference.fasta, but by not saying @requires REFERENCE_BASES I'm telling the
|
||||
// GATK to not actually spend time giving me the refBase array since I don't use it
|
||||
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||
@Argument(fullName="params", shortName="params", doc="CountCovariates params file", required=true)
|
||||
public String paramsFile;
|
||||
|
||||
@Argument(fullName="outputBam", shortName="outputBam", doc="output BAM file", required=true)
|
||||
public SAMFileWriter outputBam = null;
|
||||
@Argument(fullName="recal_file", shortName="recalFile", doc="Input recalibration table file generated by CountCovariates", required=true)
|
||||
private String RECAL_FILE = null;
|
||||
@Argument(fullName="output_bam", shortName="outputBam", doc="output BAM file", required=false)
|
||||
private SAMFileWriter OUTPUT_BAM = null;
|
||||
@Argument(fullName="preserve_qscores_less_than", shortName="pQ",
|
||||
doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
|
||||
private int PRESERVE_QSCORES_LESS_THAN = 5;
|
||||
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
|
||||
private boolean USE_ORIGINAL_QUALS = false;
|
||||
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
|
||||
private int WINDOW_SIZE = 3;
|
||||
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
|
||||
private int SMOOTHING = 1;
|
||||
@Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. For debugging purposes only.")
|
||||
private boolean VALIDATE_OLD_RECALIBRATOR = false;
|
||||
@Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.")
|
||||
private boolean USE_SLX_PLATFORM = false;
|
||||
|
||||
@Argument(shortName="rawQempirical", doc="If provided, we will use raw Qempirical scores calculated from the # mismatches and # bases, rather than the more conservative estimate of # mismatches + 1 / # bases + 1", required=false)
|
||||
public boolean rawQempirical = false;
|
||||
//public enum RecalibrationMode {
|
||||
// COMBINATORIAL,
|
||||
// SEQUENTIAL,
|
||||
// ERROR
|
||||
//}
|
||||
|
||||
@Argument(fullName="preserveQScoresLessThan", shortName="pQ", doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
|
||||
public int preserveQScoresLessThan = 5;
|
||||
//@Argument(fullName="recalibrationMode", shortName="mode", doc="Which calculation to use when recalibrating, default is SEQUENTIAL", required=false)
|
||||
private String MODE_STRING = "SEQUENTIAL";
|
||||
//public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes?
|
||||
|
||||
@Argument(fullName = "useOriginalQuals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
|
||||
public boolean useOriginalQuals = false;
|
||||
|
||||
//
|
||||
// Basic static information
|
||||
//
|
||||
private static Logger logger = Logger.getLogger(TableRecalibrationWalker.class);
|
||||
private static String VERSION = "0.2.5";
|
||||
|
||||
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
|
||||
HashMap<String, RecalMapping> cache = new HashMap<String, RecalMapping>();
|
||||
|
||||
public enum RecalibrationMode {
|
||||
COMBINATORIAL,
|
||||
JAFFE,
|
||||
BY_POS_ONLY,
|
||||
BY_DINUC_ONLY,
|
||||
SEQUENTIAL,
|
||||
ERROR
|
||||
}
|
||||
|
||||
@Argument(shortName="mode", doc="", required=false)
|
||||
public String modeString = RecalibrationMode.SEQUENTIAL.toString();
|
||||
public RecalibrationMode mode = RecalibrationMode.ERROR;
|
||||
private RecalDataManager dataManager;
|
||||
private ArrayList<Covariate> requestedCovariates;
|
||||
private ArrayList<Comparable> fullCovariateKey; // the list that will be used over and over again to hold the full set of covariate values
|
||||
private ArrayList<Comparable> collapsedTableKey; // the key that will be used over and over again to query the collapsed tables
|
||||
|
||||
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static Pattern COLLAPSED_POS_PATTERN = Pattern.compile("^#\\s+collapsed_pos\\s+(\\w+)");
|
||||
private static Pattern COLLAPSED_DINUC_PATTERN = Pattern.compile("^#\\s+collapsed_dinuc\\s+(\\w+)");
|
||||
private static Pattern HEADER_PATTERN = Pattern.compile("^rg.*");
|
||||
private static Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*");
|
||||
|
||||
private final String versionNumber = "2.0.0"; // major version, minor version, and build number
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Read in the recalibration table input file.
|
||||
* Parse the list of covariate classes used during CovariateCounterWalker.
|
||||
* Parse the CSV data and populate the hashmap.
|
||||
*/
|
||||
public void initialize() {
|
||||
logger.info("TableRecalibrator version: " + VERSION);
|
||||
|
||||
//
|
||||
// crappy hack until Enum arg types are supported
|
||||
//
|
||||
for ( RecalibrationMode potential : RecalibrationMode.values() ) {
|
||||
if ( potential.toString().equals(modeString)) {
|
||||
mode = potential;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( mode == RecalibrationMode.ERROR )
|
||||
throw new RuntimeException("Unknown mode requested: " + modeString);
|
||||
|
||||
logger.info( "TableRecalibrationWalker version: " + versionNumber );
|
||||
|
||||
// Get a list of all available covariates
|
||||
List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);
|
||||
|
||||
//
|
||||
// Walk over the data file, parsing it into recalibration data
|
||||
//
|
||||
int lineNumber = 0;
|
||||
try {
|
||||
logger.info(String.format("Reading data..."));
|
||||
List<RecalData> data = new ArrayList<RecalData>();
|
||||
boolean collapsedPos = false;
|
||||
boolean collapsedDinuc = false;
|
||||
boolean foundAllCovariates = false;
|
||||
int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one
|
||||
|
||||
//List<String> lines = new xReadLines(new File(paramsFile)).readLines();
|
||||
for ( String line : new xReadLines(new File(paramsFile)) ) {
|
||||
// Warn the user if a dbSNP file was specified since it isn't being used here
|
||||
boolean foundDBSNP = false;
|
||||
for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
|
||||
if( rod.getName().equalsIgnoreCase( "dbsnp" ) ) {
|
||||
foundDBSNP = true;
|
||||
}
|
||||
}
|
||||
if( foundDBSNP ) {
|
||||
Utils.warnUser("A dbSNP rod file was specified but TableRecalibrationWalker doesn't make use of it.");
|
||||
}
|
||||
|
||||
fullCovariateKey = new ArrayList<Comparable>();
|
||||
collapsedTableKey = new ArrayList<Comparable>();
|
||||
|
||||
// Read in the covariates that were used from the input file
|
||||
requestedCovariates = new ArrayList<Covariate>();
|
||||
|
||||
// Read in the data from the csv file and populate the map
|
||||
logger.info( "Reading in the data from input file..." );
|
||||
|
||||
try {
|
||||
for ( String line : new xReadLines(new File( RECAL_FILE )) ) {
|
||||
lineNumber++;
|
||||
if ( HEADER_PATTERN.matcher(line).matches() )
|
||||
continue;
|
||||
if ( COMMENT_PATTERN.matcher(line).matches() ) {
|
||||
collapsedPos = parseCommentLine(COLLAPSED_POS_PATTERN, line, collapsedPos);
|
||||
collapsedDinuc = parseCommentLine(COLLAPSED_DINUC_PATTERN, line, collapsedDinuc);
|
||||
if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
|
||||
; // skip over the comment lines, (which start with '#')
|
||||
}
|
||||
else {
|
||||
data.add(RecalData.fromCSVString(line));
|
||||
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // the line string is either specifying a covariate or is giving csv data
|
||||
if( foundAllCovariates ) {
|
||||
throw new StingException( "Malformed input recalibration file. Found covariate names intermingled with data. " + RECAL_FILE );
|
||||
} else { // found another covariate in input file
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : classes ) {
|
||||
|
||||
if( line.equalsIgnoreCase( "@!" + covClass.getSimpleName() ) ) { // the "@!" was added by CovariateCounterWalker as a code to recognize covariate class names
|
||||
foundClass = true;
|
||||
try {
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
// some covariates need parameters (user supplied command line arguments) passed to them
|
||||
if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
|
||||
requestedCovariates.add( covariate );
|
||||
estimatedCapacity *= covariate.estimatedNumberOfBins();
|
||||
|
||||
} catch ( InstantiationException e ) {
|
||||
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
|
||||
} catch ( IllegalAccessException e ) {
|
||||
throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if( !foundClass ) {
|
||||
throw new StingException( "Malformed input recalibration file. The requested covariate type (" + line + ") isn't a valid covariate option." );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} else { // found some data
|
||||
if( !foundAllCovariates ) {
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
requestedCovariates.add( new ReadGroupCovariate() );
|
||||
requestedCovariates.add( new QualityScoreCovariate() );
|
||||
requestedCovariates.add( new CycleCovariate() );
|
||||
requestedCovariates.add( new DinucCovariate() );
|
||||
}
|
||||
foundAllCovariates = true;
|
||||
if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception
|
||||
final boolean createCollapsedTables = true;
|
||||
// Initialize the data hashMaps
|
||||
dataManager = new RecalDataManager( estimatedCapacity, createCollapsedTables, requestedCovariates.size() );
|
||||
|
||||
}
|
||||
addCSVData(line); // parse the line and add the data to the HashMap
|
||||
}
|
||||
}
|
||||
initializeCache(data, collapsedPos, collapsedDinuc);
|
||||
|
||||
} catch ( FileNotFoundException e ) {
|
||||
Utils.scareUser("Cannot read/find parameters file " + paramsFile);
|
||||
Utils.scareUser("Can not find input file: " + RECAL_FILE);
|
||||
} catch ( NumberFormatException e ) {
|
||||
throw new RuntimeException("Error parsing recalibration data at line " + lineNumber, e);
|
||||
throw new StingException("Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
|
||||
}
|
||||
}
|
||||
logger.info( "...done!" );
|
||||
|
||||
private boolean parseCommentLine(Pattern pat, String line, boolean flag) {
|
||||
Matcher m = pat.matcher(line);
|
||||
if ( m.matches() ) {
|
||||
flag = Boolean.parseBoolean(m.group(1));
|
||||
}
|
||||
logger.info( "The covariates being used here: " );
|
||||
logger.info( requestedCovariates );
|
||||
|
||||
return flag;
|
||||
}
|
||||
|
||||
private void initializeCache(List<RecalData> data, boolean collapsedPos, boolean collapsedDinuc ) {
|
||||
Set<String> readGroups = new HashSet<String>();
|
||||
Set<String> dinucs = new HashSet<String>();
|
||||
int maxPos = -1;
|
||||
int maxQReported = -1;
|
||||
|
||||
logger.info(String.format("No. params : %d", data.size()));
|
||||
// get the maximum data from the file
|
||||
for ( RecalData datum : data ) {
|
||||
readGroups.add(datum.readGroup);
|
||||
dinucs.add(datum.dinuc);
|
||||
maxPos = Math.max(maxPos, datum.pos);
|
||||
maxQReported = Math.max(maxQReported, datum.qual);
|
||||
}
|
||||
logger.info(String.format("Read groups : %d %s", readGroups.size(), readGroups.toString()));
|
||||
logger.info(String.format("Dinucs : %d %s", dinucs.size(), dinucs.toString()));
|
||||
logger.info(String.format("Max pos : %d", maxPos));
|
||||
logger.info(String.format("Max Q reported : %d", maxQReported));
|
||||
|
||||
// check the mode
|
||||
switch ( mode ) {
|
||||
case JAFFE:
|
||||
collapsedPos = collapsedDinuc = true;
|
||||
break;
|
||||
case BY_POS_ONLY:
|
||||
if ( collapsedPos )
|
||||
throw new RuntimeException(String.format("Cannot perform position_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
|
||||
collapsedPos = true;
|
||||
throw new RuntimeException("Unsupported mode requested, sorry");
|
||||
//break;
|
||||
case BY_DINUC_ONLY:
|
||||
if ( collapsedDinuc )
|
||||
throw new RuntimeException(String.format("Cannot perform dinuc_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
|
||||
collapsedDinuc = true;
|
||||
throw new RuntimeException("Unsupported mode requested, sorry");
|
||||
//break;
|
||||
case COMBINATORIAL:
|
||||
if ( collapsedPos || collapsedDinuc )
|
||||
throw new RuntimeException(String.format("Cannot perform combinatorial recalibration -- data is already collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
|
||||
case SEQUENTIAL:
|
||||
if ( collapsedPos || collapsedDinuc )
|
||||
throw new RuntimeException(String.format("Cannot perform sequential recalibration -- data is already collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
|
||||
}
|
||||
|
||||
//
|
||||
// initialize the data structures
|
||||
//
|
||||
HashMap<String, RecalDataManager> managers = new HashMap<String, RecalDataManager>();
|
||||
for ( String readGroup : readGroups ) {
|
||||
// create a manager for each read group
|
||||
RecalDataManager manager = new RecalDataManager(readGroup, ! collapsedPos, ! collapsedDinuc);
|
||||
managers.put(readGroup, manager);
|
||||
}
|
||||
|
||||
// add the data to their appropriate readGroup managers
|
||||
for ( RecalData datum : data ) {
|
||||
managers.get(datum.readGroup).addDatum(datum);
|
||||
}
|
||||
|
||||
// fill in the table with mapping objects
|
||||
for ( String readGroup : readGroups ) {
|
||||
RecalDataManager manager = managers.get(readGroup);
|
||||
RecalMapping mapper = initializeMapper(manager, rawQempirical, dinucs, maxPos, maxQReported);
|
||||
logger.info(String.format("Creating mapper for %s of class %s", readGroup, mapper));
|
||||
cache.put(readGroup, mapper);
|
||||
// Create the collapsed tables that are used in the sequential calculation
|
||||
if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
|
||||
logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
|
||||
dataManager.generateEmpiricalQualities( requestedCovariates.size(), SMOOTHING );
|
||||
logger.info( "...done!" );
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a new RecalMapping object appropriate for the requested mode in this.mode for the RecalData
|
||||
* in manager. Uses the dinucs set dinucs, maxPos, and maxQReported to build efficient lookup structures
|
||||
* for mapping from old qual and features -> new qual
|
||||
*
|
||||
* @param manager
|
||||
* @param dinucs
|
||||
* @param maxPos
|
||||
* @param maxQReported
|
||||
* @return
|
||||
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
|
||||
* @param line A line of CSV data read from the recalibration table data file
|
||||
*/
|
||||
private RecalMapping initializeMapper( RecalDataManager manager, final boolean rawQempirical,
|
||||
Set<String> dinucs, int maxPos, int maxQReported ) {
|
||||
switch ( mode ) {
|
||||
case COMBINATORIAL:
|
||||
case JAFFE:
|
||||
return new CombinatorialRecalMapping(manager, rawQempirical, dinucs, maxPos, maxQReported);
|
||||
case SEQUENTIAL:
|
||||
return new SerialRecalMapping(manager, rawQempirical, dinucs, maxPos, maxQReported);
|
||||
default:
|
||||
throw new RuntimeException("Unimplemented recalibration mode: " + mode);
|
||||
private void addCSVData(String line) {
|
||||
String[] vals = line.split(",");
|
||||
ArrayList<Comparable> key = new ArrayList<Comparable>();
|
||||
Covariate cov;
|
||||
int iii;
|
||||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
||||
cov = requestedCovariates.get( iii );
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
if( iii == 1 ) { // order is different in the old recalibrator
|
||||
key.add( cov.getValue( vals[2] ) );
|
||||
} else if ( iii == 2 ) {
|
||||
key.add( cov.getValue( vals[1] ) );
|
||||
} else {
|
||||
key.add( cov.getValue( vals[iii] ) );
|
||||
}
|
||||
} else {
|
||||
key.add( cov.getValue( vals[iii] ) );
|
||||
}
|
||||
}
|
||||
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ) );
|
||||
dataManager.addToAllTables( key, datum );
|
||||
|
||||
}
|
||||
|
||||
public SAMRecord map(char[] ref, SAMRecord read) {
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* For each base in the read calculate a new recalibrated quality score and replace the quality scores in the read
|
||||
* @param refBases References bases over the length of the read
|
||||
* @param read The read to be recalibrated
|
||||
* @return The read with quality scores replaced
|
||||
*/
|
||||
public SAMRecord map( char[] refBases, SAMRecord read ) {
|
||||
|
||||
// WARNING: refBases is always null because this walker doesn't have @Requires({DataSource.REFERENCE_BASES})
|
||||
// This is done in order to speed up the code
|
||||
|
||||
byte[] originalQuals = read.getBaseQualities();
|
||||
// Check if we need to use the original quality scores instead
|
||||
if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
|
||||
if ( obj instanceof String )
|
||||
originalQuals = QualityUtils.fastqToPhred((String)obj);
|
||||
else {
|
||||
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
}
|
||||
byte[] recalQuals = originalQuals.clone();
|
||||
|
||||
// These calls are all expensive so only do them once for each read
|
||||
final SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
final String readGroupId = readGroup.getReadGroupId();
|
||||
String platform = readGroup.getPlatform();
|
||||
final boolean isNegStrand = read.getReadNegativeStrandFlag();
|
||||
if( USE_SLX_PLATFORM ) {
|
||||
platform = "ILLUMINA";
|
||||
}
|
||||
byte[] bases = read.getReadBases();
|
||||
int startPos = 1;
|
||||
int stopPos = bases.length;
|
||||
|
||||
|
||||
byte[] quals = RecalDataManager.getQualsForRecalibration(read, useOriginalQuals);
|
||||
|
||||
// Since we want machine direction reads not corrected positive strand reads, rev comp any negative strand reads
|
||||
if (read.getReadNegativeStrandFlag()) {
|
||||
bases = BaseUtils.simpleReverseComplement(bases);
|
||||
quals = BaseUtils.reverse(quals);
|
||||
if( isNegStrand ) { // DinucCovariate is responsible for getting the complement base if needed
|
||||
startPos = 0;
|
||||
stopPos = bases.length - 1;
|
||||
}
|
||||
|
||||
try {
|
||||
byte[] recalQuals = recalibrateBasesAndQuals(read.getAttribute("RG").toString(), bases, quals);
|
||||
ReadHashDatum readDatum = new ReadHashDatum( readGroupId, platform, originalQuals, bases, isNegStrand, read.getMappingQuality(), bases.length );
|
||||
|
||||
// Don't change Q scores below some threshold
|
||||
preserveQScores(quals, recalQuals, read);
|
||||
// For each base in the read
|
||||
for( int iii = startPos; iii < stopPos; iii++ ) { // skip first or last base because there is no dinuc depending on the direction of the read
|
||||
|
||||
if (read.getReadNegativeStrandFlag()) { // reverse the quals for the neg strand read
|
||||
recalQuals = BaseUtils.reverse(recalQuals);
|
||||
quals = BaseUtils.reverse(quals);
|
||||
}
|
||||
|
||||
read.setBaseQualities(recalQuals);
|
||||
if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) {
|
||||
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(quals));
|
||||
// Get the covariate values which make up the key
|
||||
for( Covariate covariate : requestedCovariates ) {
|
||||
fullCovariateKey.add( covariate.getValue( readDatum, iii ) ); // offset is zero based so passing iii is correct here
|
||||
}
|
||||
|
||||
return read;
|
||||
} catch ( StingException e ) {
|
||||
throw new RuntimeException(String.format("Bug found while processing read %s: %s", read.format(), e.getMessage()));
|
||||
recalQuals[iii] = performSequentialQualityCalculation( fullCovariateKey );
|
||||
fullCovariateKey.clear();
|
||||
}
|
||||
}
|
||||
|
||||
private void preserveQScores( byte[] originalQuals, byte[] recalQuals, SAMRecord read ) {
|
||||
for ( int i = 0; i < recalQuals.length; i++ ) {
|
||||
if ( originalQuals[i] < preserveQScoresLessThan ) {
|
||||
//System.out.printf("Preserving Q%d base at %d in read %s%n", originalQuals[i], i, read.getReadName());
|
||||
recalQuals[i] = originalQuals[i];
|
||||
}
|
||||
preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low
|
||||
|
||||
// SOLID bams insert the reference base into the read if the color space quality is zero, so don't change their base quality scores
|
||||
if( platform.equalsIgnoreCase("SOLID") ) {
|
||||
byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
|
||||
if(colorSpaceQuals != null) { preserveBadColorSpaceQualities_SOLID( originalQuals, recalQuals, colorSpaceQuals ); }
|
||||
}
|
||||
|
||||
read.setBaseQualities(recalQuals); // overwrite old qualities with new recalibrated qualities
|
||||
if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // save the old qualities if the tag isn't already taken in the read
|
||||
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals));
|
||||
}
|
||||
|
||||
return read;
|
||||
}
|
||||
|
||||
/**
|
||||
* Workhorse routine. Given a read group and an array of bases and quals, returns a new set of recalibrated
|
||||
* qualities for each base/qual in bases and quals. Uses the RecalMapping object associated with readGroup.
|
||||
* Implements a serial recalibration of the reads using the combinational table.
|
||||
* First, we perform a positional recalibration, and then a subsequent dinuc correction.
|
||||
*
|
||||
* @param readGroup
|
||||
* @param bases
|
||||
* @param quals
|
||||
* @return
|
||||
* Given the full recalibration table, we perform the following preprocessing steps:
|
||||
*
|
||||
* - calculate the global quality score shift across all data [DeltaQ]
|
||||
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
|
||||
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
|
||||
* - The final shift equation is:
|
||||
*
|
||||
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
|
||||
* @param key The list of Comparables that were calculated from the covariates
|
||||
* @return A recalibrated quality score as a byte
|
||||
*/
|
||||
public byte[] recalibrateBasesAndQuals(final String readGroup, byte[] bases, byte[] quals) throws StingException {
|
||||
if ( readGroup == null ) { throw new StingException("BUG: read group is null"); }
|
||||
if ( bases == null ) { throw new StingException("BUG: bases array is null"); }
|
||||
if ( bases.length == 0) { throw new StingException("BUG: bases array is size 0"); }
|
||||
if ( quals == null ) { throw new StingException("BUG: quals array is null"); }
|
||||
private byte performSequentialQualityCalculation( List<? extends Comparable> key ) {
|
||||
|
||||
byte[] recalQuals = new byte[quals.length];
|
||||
RecalMapping mapper = cache.get(readGroup);
|
||||
if ( mapper == null ) {
|
||||
// throw new StingException(String.format("BUG: couldn't find RecalMapping for readgroup %s", readGroup));
|
||||
// Edited by ebanks on 8/9/09
|
||||
// Actually, there is a valid case when the mapper == null: when all reads in a RG are
|
||||
// unmapped or have mapping quality == 0. In this case, we don't want to die - nor do we
|
||||
// want to lose these reads - so we just don't alter the quals
|
||||
for ( int cycle = 0; cycle < bases.length; cycle++ )
|
||||
recalQuals[cycle] = quals[cycle];
|
||||
return recalQuals;
|
||||
String readGroupKeyElement = key.get(0).toString();
|
||||
int qualityScoreKeyElement = Integer.parseInt(key.get(1).toString());
|
||||
byte qualFromRead = (byte)qualityScoreKeyElement;
|
||||
|
||||
// The global quality shift (over the read group only)
|
||||
collapsedTableKey.add( readGroupKeyElement );
|
||||
RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get(collapsedTableKey);
|
||||
Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get(collapsedTableKey);
|
||||
double globalDeltaQ = 0.0;
|
||||
double aggregrateQreported = 0.0;
|
||||
if( globalDeltaQDatum != null ) {
|
||||
aggregrateQreported = QualityUtils.phredScaleErrorRate( dataManager.dataSumExpectedErrors.get(collapsedTableKey) / ((double) globalDeltaQDatum.getNumObservations()) );
|
||||
globalDeltaQ = globalDeltaQEmpirical - aggregrateQreported;
|
||||
}
|
||||
|
||||
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
|
||||
for ( int cycle = 1; cycle < bases.length; cycle++ ) { // skip first and last base, qual already set because no dinuc
|
||||
byte qual = quals[cycle];
|
||||
byte newQual = mapper.getNewQual(readGroup, bases[cycle - 1], bases[cycle], cycle, qual);
|
||||
|
||||
if ( newQual <= 0 || newQual > QualityUtils.MAX_REASONABLE_Q_SCORE )
|
||||
throw new StingException(String.format("Bug found -- assigning bad quality score %d x %d => %d", cycle, qual, newQual));
|
||||
|
||||
recalQuals[cycle] = newQual;
|
||||
//System.out.printf("Mapping %d => %d%n", qual, newQual);
|
||||
// The shift in quality between reported and empirical
|
||||
collapsedTableKey.add( qualityScoreKeyElement );
|
||||
Double deltaQReportedEmpirical = dataManager.getCollapsedDoubleTable(1).get(collapsedTableKey);
|
||||
double deltaQReported = 0.0;
|
||||
if( deltaQReportedEmpirical != null ) {
|
||||
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
||||
}
|
||||
|
||||
// The shift in quality due to each covariate by itself in turn
|
||||
double deltaQCovariates = 0.0;
|
||||
Double deltaQCovariateEmpirical;
|
||||
double deltaQPos = 0.0;
|
||||
double deltaQDinuc = 0.0;
|
||||
for( int iii = 2; iii < key.size(); iii++ ) {
|
||||
collapsedTableKey.add( key.get(iii) ); // the given covariate
|
||||
deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get(collapsedTableKey);
|
||||
if( deltaQCovariateEmpirical != null ) {
|
||||
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
if(iii==2) { deltaQPos = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); } // BUGBUG: only here to validate against the old recalibrator
|
||||
if(iii==3) { deltaQDinuc = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); }
|
||||
}
|
||||
}
|
||||
collapsedTableKey.remove( 2 ); // this new covariate is always added in at position 2 in the collapsedTableKey list
|
||||
}
|
||||
|
||||
return recalQuals;
|
||||
}
|
||||
double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
|
||||
byte newQualityByte = QualityUtils.boundQual( (int)Math.round(newQuality), QualityUtils.MAX_REASONABLE_Q_SCORE );
|
||||
|
||||
public SAMFileWriter reduceInit() {
|
||||
return outputBam;
|
||||
|
||||
// Verbose printouts used to validate with old recalibrator
|
||||
//if(key.contains(null)) {
|
||||
// System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d",
|
||||
// qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte));
|
||||
//}
|
||||
//else {
|
||||
// System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d",
|
||||
// key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) );
|
||||
//}
|
||||
|
||||
collapsedTableKey.clear();
|
||||
return newQualityByte;
|
||||
}
|
||||
|
||||
/**
|
||||
* Write out the read
|
||||
* Loop of the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
|
||||
* @param originalQuals The list of original base quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
*/
|
||||
public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
|
||||
private void preserveQScores( final byte[] originalQuals, byte[] recalQuals ) {
|
||||
for( int iii = 0; iii < recalQuals.length; iii++ ) {
|
||||
if ( originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN ) {
|
||||
recalQuals[iii] = originalQuals[iii];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loop of the list of qualities and overwrite the newly recalibrated score to be the original score if the color space quality is zero
|
||||
* @param originalQuals The list of original base quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
* @param colorSpaceQuals The list of color space quality scores for this read
|
||||
*/
|
||||
private void preserveBadColorSpaceQualities_SOLID( final byte[] originalQuals, byte[] recalQuals, final byte[] colorSpaceQuals ) {
|
||||
for( int iii = 0; iii < recalQuals.length; iii++ ) {
|
||||
if ( colorSpaceQuals[iii] <= 0 ) { //BUGBUG: This isn't exactly correct yet
|
||||
recalQuals[iii] = originalQuals[iii];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Nothing start the reduce with a new handle to the output bam file
|
||||
* @return A FileWriter pointing to a new bam file
|
||||
*/
|
||||
public SAMFileWriter reduceInit() {
|
||||
return OUTPUT_BAM;
|
||||
}
|
||||
|
||||
/**
|
||||
* Output each read to disk
|
||||
* @param read The read to output
|
||||
* @param output The FileWriter to write the read to
|
||||
* @return The FileWriter
|
||||
*/
|
||||
public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) {
|
||||
if ( output != null ) {
|
||||
output.addAlignment(read);
|
||||
} else {
|
||||
|
|
@ -337,191 +453,3 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
return output;
|
||||
}
|
||||
}
|
||||
|
||||
interface RecalMapping {
|
||||
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual);
|
||||
}
|
||||
|
||||
class CombinatorialRecalMapping implements RecalMapping {
|
||||
ArrayList<byte[][]> cache;
|
||||
RecalDataManager manager;
|
||||
|
||||
public CombinatorialRecalMapping(RecalDataManager manager, final boolean useRawQempirical,
|
||||
Set<String> dinucs, int maxPos, int maxQReported ) {
|
||||
this.manager = manager;
|
||||
|
||||
// initialize the data structure
|
||||
cache = new ArrayList<byte[][]>(RecalData.NDINUCS);
|
||||
for ( String dinuc : dinucs ) {
|
||||
cache.add(new byte[maxPos+1][maxQReported+1]);
|
||||
}
|
||||
|
||||
for ( RecalData datum : manager.getAll() ) {
|
||||
//System.out.printf("Adding datum %s%n", datum);
|
||||
byte [][] table = cache.get(this.manager.getDinucIndex(datum.dinuc));
|
||||
int pos = manager.canonicalPos(datum.pos);
|
||||
if ( table[pos][datum.qual] != 0 )
|
||||
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
|
||||
table[pos][datum.qual] = datum.empiricalQualByte(useRawQempirical);
|
||||
//System.out.printf("Binding %d %d => %d%n", pos, datum.qual, datum.empiricalQualByte(useRawQempirical));
|
||||
}
|
||||
}
|
||||
|
||||
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
|
||||
int pos = manager.canonicalPos(cycle);
|
||||
int index = this.manager.getDinucIndex(prevBase, base);
|
||||
byte[][] dataTable = index == -1 ? null : cache.get(index);
|
||||
|
||||
if ( dataTable == null && prevBase != 'N' && base != 'N' )
|
||||
throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, (char)prevBase, (char)base));
|
||||
|
||||
return dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Implements a serial recalibration of the reads using the combinational table. First,
|
||||
* we perform a positional recalibration, and then a subsequent dinuc correction.
|
||||
*
|
||||
* Given the full recalibration table, we perform the following preprocessing steps:
|
||||
*
|
||||
* - calculate the global quality score shift across all data [DeltaQ]
|
||||
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
|
||||
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
|
||||
* - The final shift equation is:
|
||||
*
|
||||
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc)
|
||||
*/
|
||||
class SerialRecalMapping implements RecalMapping {
|
||||
private double globalDeltaQ = 0.0;
|
||||
private double[][] deltaQPosMap, deltaQDinucMap;
|
||||
double [] deltaQualMap;
|
||||
RecalData [][] qPosSupports = null, qDinucSupports = null;
|
||||
|
||||
CombinatorialRecalMapping combiMap;
|
||||
RecalDataManager manager;
|
||||
|
||||
String dinucToLookAt = null; // "CC";
|
||||
int posToLookAt = -1;
|
||||
int qualToLookAt = 25;
|
||||
|
||||
public SerialRecalMapping(RecalDataManager manager, final boolean useRawQempirical,
|
||||
Set<String> dinucs, int maxPos, int maxQReported ) {
|
||||
//combiMap = new CombinatorialRecalMapping(manager, dinucs, maxPos, maxQReported );
|
||||
this.manager = manager;
|
||||
|
||||
// initialize the data structure
|
||||
{
|
||||
RecalData datum = new RecalData(0, 0, manager.readGroup, "**").inc(manager.getAll());
|
||||
double aggregrateQreported = RecalData.combinedQreported(manager.getAll());
|
||||
globalDeltaQ = datum.empiricalQualDouble(useRawQempirical) - aggregrateQreported;
|
||||
//System.out.printf("Global quality score shift is %.2f - %.2f = %.2f%n", datum.empiricalQualDouble(useRawQempirical), aggregrateQreported, globalDeltaQ);
|
||||
}
|
||||
|
||||
for ( RecalData datum : manager.getAll() ) {
|
||||
if ( printStateP(datum) )
|
||||
System.out.printf("PrintValue: %s%n", datum);
|
||||
}
|
||||
|
||||
// Jaffe-style
|
||||
deltaQualMap = new double[maxQReported+1];
|
||||
for ( RecalData datum : RecalData.sort(manager.combine(true, false, true)) ) {
|
||||
deltaQualMap[datum.qual] = datum.empiricalQualDouble(useRawQempirical) - datum.qual - globalDeltaQ;
|
||||
//System.out.printf("%s => %s%n", datum, deltaQualMap[datum.qual]);
|
||||
}
|
||||
|
||||
// calculate the delta Q pos array
|
||||
deltaQPosMap = new double[maxPos+1][maxQReported+1];
|
||||
//qPosSupports = new RecalData[maxPos+1][maxQReported+1];
|
||||
for ( RecalData datumAtPosQual : manager.combineDinucs() ) {
|
||||
double offset = globalDeltaQ + deltaQualMap[datumAtPosQual.qual];
|
||||
updateCache(qPosSupports, datumAtPosQual, useRawQempirical, deltaQPosMap, datumAtPosQual.pos, datumAtPosQual.qual, offset);
|
||||
}
|
||||
|
||||
// calculate the delta Q dinuc array
|
||||
deltaQDinucMap = new double[dinucs.size()+1][maxQReported+1];
|
||||
//qDinucSupports = new RecalData[dinucs.size()+1][maxQReported+1];
|
||||
for ( RecalData datumAtDinucQual : manager.combineCycles() ) {
|
||||
double offset = globalDeltaQ + deltaQualMap[datumAtDinucQual.qual];
|
||||
updateCache(qDinucSupports, datumAtDinucQual, useRawQempirical, deltaQDinucMap, datumAtDinucQual.getDinucIndex(), datumAtDinucQual.qual, offset);
|
||||
}
|
||||
|
||||
validateTables(maxPos, maxQReported, dinucs.size(), deltaQPosMap, deltaQDinucMap, useRawQempirical, manager.getAll());
|
||||
}
|
||||
|
||||
|
||||
private void validateTables(int maxPos, int maxQReported, int nDinucs,
|
||||
double[][] deltaQPosMap, double[][] deltaQDinucMap,
|
||||
final boolean useRawQempirical, List<RecalData> data ) {
|
||||
for ( int i = 0; i < maxPos; i++ ) {
|
||||
for ( int j = 0; j < maxQReported; j++ ) {
|
||||
if ( printStateP(i, null, j) )
|
||||
System.out.printf("Mapping: pos=%d qual=%2d delta=%.2f based on %s%n",
|
||||
i, j, deltaQPosMap[i][j], qPosSupports != null ? qPosSupports[i][j] : null);
|
||||
}
|
||||
}
|
||||
|
||||
for ( int i = 0; i < nDinucs; i++ ) {
|
||||
for ( int j = 0; j < maxQReported; j++ ) {
|
||||
String dinuc = RecalData.dinucIndex2bases(i);
|
||||
if ( printStateP(0, dinuc, j ) )
|
||||
System.out.printf("Mapping: dinuc=%s qual=%2d delta=%.2f based on %s%n",
|
||||
dinuc, j, deltaQDinucMap[i][j], qDinucSupports != null ? qDinucSupports[i][j] : null);
|
||||
}
|
||||
}
|
||||
|
||||
for ( RecalData datum : RecalData.sort(data) ) {
|
||||
byte newQual = getNewQual(datum.readGroup, (byte)datum.dinuc.charAt(0), (byte)datum.dinuc.charAt(1), datum.pos, (byte)datum.qual);
|
||||
if ( printStateP( datum ) ) {
|
||||
System.out.printf("Serial mapping %s => %d => %d vs. %d => delta = %d%n",
|
||||
datum, newQual, datum.empiricalQualByte(useRawQempirical), datum.empiricalQualByte(), newQual - datum.empiricalQualByte(useRawQempirical));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void updateCache( RecalData[][] supports, RecalData datum, final boolean useRawQempirical,
|
||||
double[][] table, int i, int j, double meanQ ) {
|
||||
if ( table[i][j] != 0 )
|
||||
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
|
||||
double deltaQ = datum.empiricalQualDouble(useRawQempirical) - datum.qual - meanQ;
|
||||
table[i][j] = deltaQ;
|
||||
if ( supports != null )
|
||||
supports[i][j] = datum;
|
||||
}
|
||||
|
||||
private boolean printStateP( int cycle, String dinuc, int qual ) {
|
||||
return posToLookAt != -1 &&
|
||||
( cycle == 0 || posToLookAt == 0 || cycle == posToLookAt ) &&
|
||||
( dinucToLookAt == null || dinuc == null || manager.getDinucIndex(dinuc) == -1 || dinucToLookAt.equals(dinuc)) &&
|
||||
( qualToLookAt == 0 || qual == qualToLookAt );
|
||||
}
|
||||
|
||||
private boolean printStateP( RecalData datum ) {
|
||||
return printStateP(datum.pos, datum.dinuc, datum.qual);
|
||||
}
|
||||
|
||||
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
|
||||
int pos = manager.canonicalPos(cycle);
|
||||
int index = this.manager.getDinucIndex(prevBase, base);
|
||||
|
||||
byte newQualByte = qual;
|
||||
if ( qual < deltaQualMap.length ) {
|
||||
// it's possible to see a qual not in the recal table -- i.e., we only see Q28 at a dbSNP site
|
||||
double deltaQual = deltaQualMap[qual];
|
||||
double deltaQPos = pos >= deltaQPosMap.length ? 0.0 : deltaQPosMap[pos][qual]; // == length indices no data for last base
|
||||
double deltaQDinuc = index == -1 ? 0.0 : deltaQDinucMap[index][qual]; // -1 indices N in prev or current base
|
||||
double newQual = qual + globalDeltaQ + deltaQual + deltaQPos + deltaQDinuc;
|
||||
newQualByte = QualityUtils.boundQual((int)Math.round(newQual), QualityUtils.MAX_REASONABLE_Q_SCORE);
|
||||
|
||||
//System.out.println(String.format("%s %c%c %d %d => %d + %.2f + %.2f + %.2f + %.2f = %d",
|
||||
// readGroup, prevBase, base, cycle, qual, qual, globalDeltaQ, deltaQual, deltaQPos, deltaQDinuc, newQualByte));
|
||||
|
||||
if ( newQualByte <= 0 && newQualByte >= QualityUtils.MAX_REASONABLE_Q_SCORE )
|
||||
throw new RuntimeException(String.format("Illegal base quality score calculated: %s %c%c %d %d => %d + %.2f + %.2f + %.2f = %d",
|
||||
readGroup, prevBase, base, cycle, qual, qual, globalDeltaQ, deltaQPos, deltaQDinuc, newQualByte));
|
||||
}
|
||||
|
||||
return newQualByte;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,484 +0,0 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RODRecordList;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 3, 2009
|
||||
*
|
||||
* This walker is designed to work as the first pass in a two-pass processing step.
|
||||
* It does a by-locus traversal operating only at sites that are not in dbSNP.
|
||||
* We assume that all mismatches we see are therefore errors.
|
||||
* This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinuc)
|
||||
* Since there is a large amount of data one can then calculate an empirical probability of error
|
||||
* given the particular covariates seen at this site, where p(error) = num mismatches / num observations
|
||||
* The output file is a CSV list of (several covariate values, num observations, num mismatches, empirical quality score)
|
||||
* The first lines of the output file give the name of the covariate classes that were used for this calculation.
|
||||
*
|
||||
* Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and must be at the start of the list.
|
||||
* Note: This walker is designed to be used in conjunction with TableRecalibrationWalker.
|
||||
*/
|
||||
|
||||
@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file
|
||||
@WalkerName( "CountCovariatesRefactored" )
|
||||
//@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality // BUGBUG taken out to match old integration tests
|
||||
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
|
||||
public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||
|
||||
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
|
||||
private Boolean LIST_ONLY = false;
|
||||
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false)
|
||||
private String[] COVARIATES = null;
|
||||
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
|
||||
private boolean USE_ORIGINAL_QUALS = false;
|
||||
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
|
||||
private int WINDOW_SIZE = 3;
|
||||
@Argument(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the outputted covariates table recalibration file")
|
||||
private String RECAL_FILE = "output.recal_data.csv";
|
||||
@Argument(fullName="no_print_header", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file. For debugging purposes only.")
|
||||
private boolean NO_PRINT_HEADER = false;
|
||||
@Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. For debugging purposes only.")
|
||||
private boolean VALIDATE_OLD_RECALIBRATOR = false;
|
||||
@Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.")
|
||||
private boolean USE_SLX_PLATFORM = false;
|
||||
|
||||
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
private ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
|
||||
private IdentityHashMap<SAMRecord, ReadHashDatum> readDatumHashMap; // A hash map that hashes the read object itself into properties commonly pulled out of the read. Done for optimization purposes.
|
||||
private int sizeOfReadDatumHashMap = 0;
|
||||
|
||||
private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
|
||||
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
|
||||
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
|
||||
|
||||
//private final String versionNumber = "2.0.0"; // major version, minor version, and build number
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
 * Parse the -cov arguments and build the ordered list of covariates used for recalibration.
 * ReadGroupCovariate and QualityScoreCovariate are mandatory and must come first; the
 * remaining covariates are discovered reflectively from the classpath.
 * Also estimates the initial capacity for the data hashmap from each covariate's
 * estimatedNumberOfBins() so the map doesn't rehash constantly during traversal.
 */
public void initialize() {

    //logger.info( "CovariateCounterWalker version: " + versionNumber );
    // Reflectively find every class on the classpath that implements the Covariate interface
    final List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);

    // --list mode: print the available covariate names and exit without doing any work
    if ( LIST_ONLY ) {
        out.println( "Available covariates:" );
        for( Class<?> covClass : classes ) {
            out.println( covClass.getSimpleName() );
        }
        out.println();
        System.exit( 0 ); // early exit here because user requested it
    }

    // Warn the user if no dbSNP rod was specified: without it every real variant site
    // would be counted as a sequencing mismatch and poison the recalibration tables
    boolean foundDBSNP = false;
    for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
        if( rod.getName().equalsIgnoreCase( "dbsnp" ) ) {
            foundDBSNP = true;
        }
    }
    if( !foundDBSNP ) {
        Utils.warnUser("This calculation is critically dependent on being able to skip over known variant sites. Are you sure you want to be running without a dbSNP rod specified?");
    }

    // Initialize the requested covariates by parsing the -cov argument
    requestedCovariates = new ArrayList<Covariate>();
    int estimatedCapacity = 1; // capacity is multiplicative across covariates so this starts at one
    if( VALIDATE_OLD_RECALIBRATOR ) {
        // Fixed covariate list, in this exact order, to reproduce the old recalibrator's output byte-for-byte
        requestedCovariates.add( new ReadGroupCovariate() );
        requestedCovariates.add( new CycleCovariate() ); // unfortunately order is different here in order to match the old recalibrator exactly
        requestedCovariates.add( new QualityScoreCovariate() );
        requestedCovariates.add( new DinucCovariate() );
    } else if( COVARIATES != null ) {
        if(COVARIATES[0].equalsIgnoreCase("ALL")) { // the user wants ALL covariates to be used
            // First add the two required covariates, then loop over every discovered implementation
            requestedCovariates.add( new ReadGroupCovariate() );
            requestedCovariates.add( new QualityScoreCovariate() );
            for( Class<?> covClass : classes ) {
                try {
                    Covariate covariate = (Covariate)covClass.newInstance();
                    estimatedCapacity *= covariate.estimatedNumberOfBins();
                    // Some covariates need parameters (user supplied command line arguments) passed to them
                    if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
                    if( !( covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate ) ) { // these were already added so don't add them again
                        requestedCovariates.add( covariate );
                    }
                } catch ( InstantiationException e ) {
                    throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
                } catch ( IllegalAccessException e ) {
                    throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
                }
            }
        } else { // The user has specified a list of several covariates
            int covNumber = 1; // 1-based position in the user's -cov list, used to enforce the required ordering below
            for( String requestedCovariateString : COVARIATES ) {
                boolean foundClass = false;
                for( Class<?> covClass : classes ) {
                    if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class
                        foundClass = true;
                        // ReadGroupCovariate and QualityScoreCovariate are required and must be the first two entries
                        if( (covNumber == 1 && !requestedCovariateString.equalsIgnoreCase( "ReadGroupCovariate" )) ||
                            (covNumber == 2 && !requestedCovariateString.equalsIgnoreCase( "QualityScoreCovariate" )) ) {
                            throw new StingException("ReadGroupCovariate and QualityScoreCovariate are required covariates for the recalibration calculation and must begin the list" );
                        }
                        covNumber++;
                        try {
                            // Now that we've found a matching class, try to instantiate it
                            Covariate covariate = (Covariate)covClass.newInstance();
                            estimatedCapacity *= covariate.estimatedNumberOfBins();
                            // Some covariates need parameters (user supplied command line arguments) passed to them
                            if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
                            requestedCovariates.add( covariate );
                        } catch ( InstantiationException e ) {
                            throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
                        } catch ( IllegalAccessException e ) {
                            throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
                        }
                    }
                }

                if( !foundClass ) {
                    throw new StingException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." );
                }
            }
        }
    } else { // No covariates were specified by the user so fall back to the required pair only
        Utils.warnUser( "Using default set of covariates because none were specified. Using ReadGroupCovariate and QualityScoreCovariate only." );
        requestedCovariates.add( new ReadGroupCovariate() );
        requestedCovariates.add( new QualityScoreCovariate() );
        estimatedCapacity = 300 * 40; // presumably ~300 read groups x 40 quality scores -- TODO confirm
    }

    logger.info( "The covariates being used here: " );
    logger.info( requestedCovariates );

    // Cap the estimate so a pathological -cov combination can't exhaust the heap on allocation
    if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; }
    dataManager = new RecalDataManager( estimatedCapacity );
    // IdentityHashMap: reads are cached by object identity, see map() for the per-read cache logic
    readDatumHashMap = new IdentityHashMap<SAMRecord, ReadHashDatum>();
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
 * For each read covering this locus, extract the covariate values and increment the matching
 * bin in the data map, recording whether the base matches the reference.
 * Loci that any "dbsnp" rod marks as a SNP are skipped entirely, since mismatches there are
 * likely true variation rather than sequencing error.
 * Per-read properties are cached in an IdentityHashMap (ReadHashDatum) so the expensive
 * SAMRecord accessors run only once per read, not once per covered locus.
 * @param tracker The reference metadata tracker
 * @param ref The reference context
 * @param context The alignment context
 * @return Returns 1, but this value isn't used in the reduce step
 */
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {

    // Pull out anything passed by -B name,type,file that has the name "dbsnp"
    final RODRecordList<ReferenceOrderedDatum> dbsnpRODs = tracker.getTrackData( "dbsnp", null );
    boolean isSNP = false;
    if (dbsnpRODs != null) {
        for( ReferenceOrderedDatum rod : dbsnpRODs ) {
            if( ((Variation)rod).isSNP() ) {
                isSNP = true; // at least one of the rods says this is a snp site
                break;
            }
        }
    }

    // Only use data from non-dbsnp sites:
    // assume every mismatch at a non-dbsnp site is indicative of poor quality
    if( !isSNP ) {
        final List<SAMRecord> reads = context.getReads();
        final List<Integer> offsets = context.getOffsets();
        // Locals hoisted out of the loop; this method runs for every locus in the traversal
        SAMRecord read;
        int offset;
        String readGroupId;
        byte[] quals;
        byte[] bases;
        byte refBase;
        byte prevBase;
        String platform;
        byte[] colorSpaceQuals;
        ReadHashDatum readDatum;
        boolean isNegStrand;
        int mappingQuality;
        int length;

        final int numReads = reads.size();
        // For each read at this locus
        for( int iii = 0; iii < numReads; iii++ ) {
            read = reads.get(iii);
            offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct

            readDatum = readDatumHashMap.get( read );
            if( readDatum == null ) {

                // If the HashMap of read objects has grown too large then throw out the (mostly stale) reads
                if( sizeOfReadDatumHashMap > 100000 ) { //BUGBUG: Can I make this number larger?
                    readDatumHashMap.clear();
                    sizeOfReadDatumHashMap = 0;
                }

                // This read isn't in the hashMap yet so fill out the datum and add it to the map
                // so that we never have to do the expensive SAMRecord accessor work again
                quals = read.getBaseQualities();
                // Check if we need to use the original quality scores (OQ tag) instead
                if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
                    Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
                    if ( obj instanceof String )
                        quals = QualityUtils.fastqToPhred((String)obj);
                    else {
                        throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
                    }
                }
                bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'. Is this true?
                isNegStrand = read.getReadNegativeStrandFlag();
                final SAMReadGroupRecord readGroup = read.getReadGroup();
                readGroupId = readGroup.getReadGroupId();
                platform = readGroup.getPlatform();
                mappingQuality = read.getMappingQuality();
                length = bases.length;
                if( USE_SLX_PLATFORM ) {
                    platform = "ILLUMINA"; // debugging aid: force the platform regardless of the header
                }

                readDatum = new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length );
                readDatumHashMap.put( read, readDatum );
                sizeOfReadDatumHashMap++;
            }

            if( readDatum.mappingQuality > 0 ) { // BUGBUG: turn this into a read filter after passing the old integration tests

                // skip first and last base because there is no dinuc
                // BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests.
                if( offset > 0 ) {
                    if( offset < readDatum.length - 1 ) {
                        // skip if base quality is zero
                        if( readDatum.quals[offset] > 0 ) {

                            refBase = (byte)ref.getBase();
                            prevBase = readDatum.bases[offset-1];

                            // On the negative strand the "previous" base in machine cycle order is the next base in reference order
                            if( readDatum.isNegStrand ) {
                                prevBase = readDatum.bases[offset+1];
                            }

                            // skip if this base or the previous one was an 'N' or other non-regular base
                            if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(readDatum.bases[offset]) ) ) {

                                // SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them
                                colorSpaceQuals = null;
                                if( readDatum.platform.equalsIgnoreCase("SOLID") ) {
                                    colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
                                }
                                if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet
                                {
                                    // This base finally passed all the checks, so add it to the big hashmap
                                    updateDataFromRead( readDatum, offset, refBase );
                                }
                            } else {
                                if( VALIDATE_OLD_RECALIBRATOR ) {
                                    countedBases++; // replicating a small bug in the old recalibrator
                                }
                            }
                        }
                    } else { // at the last base in the read so we can remove it from our IdentityHashMap
                        readDatumHashMap.remove( read );
                        sizeOfReadDatumHashMap--;
                    }
                }
            }
        }
        countedSites++;

    } else { // We skipped over the dbSNP site
        skippedSites++;
    }

    return 1; // This value isn't actually used anywhere
}
|
||||
|
||||
/**
|
||||
* Major workhorse routine for this walker.
|
||||
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||
* Using the list of covariate values as a key, pick out the RecalDatum and increment,
|
||||
* adding one to the number of observations and potentially one to the number of mismatches
|
||||
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
|
||||
* because pulling things out of the SAMRecord is an expensive operation.
|
||||
* @param readDatum The ReadHashDatum holding all the important properties of this read
|
||||
* @param offset The offset in the read for this locus
|
||||
* @param refBase The reference base at this locus
|
||||
*/
|
||||
private void updateDataFromRead(final ReadHashDatum readDatum, final int offset, final byte refBase) {
|
||||
|
||||
List<Comparable> key = new ArrayList<Comparable>();
|
||||
|
||||
// Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||
for( Covariate covariate : requestedCovariates ) {
|
||||
key.add( covariate.getValue( readDatum, offset ) );
|
||||
}
|
||||
|
||||
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
||||
RecalDatum datum = dataManager.data.get( key );
|
||||
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
|
||||
datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
dataManager.data.myPut( key, datum );
|
||||
} else {
|
||||
dataManager.data.put( key, datum );
|
||||
}
|
||||
}
|
||||
|
||||
// Need the bases to determine whether or not we have a mismatch
|
||||
byte base = readDatum.bases[offset];
|
||||
|
||||
// Add one to the number of observations and potentially one to the number of mismatches
|
||||
datum.increment( (char)base, (char)refBase ); // dangerous: if you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad
|
||||
countedBases++;
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Initialize the reudce step by creating a PrintStream from the filename specified as an argument to the walker.
|
||||
* @return returns A PrintStream created from the -rf filename
|
||||
*/
|
||||
public PrintStream reduceInit() {
|
||||
try {
|
||||
return new PrintStream( RECAL_FILE );
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new RuntimeException( "Couldn't open output file: ", e );
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The Reduce method doesn't do anything for this walker.
|
||||
* @param value Result of the map. This value is immediately ignored.
|
||||
* @param recalTableStream The PrintStream used to output the CSV data
|
||||
* @return returns The PrintStream used to output the CSV data
|
||||
*/
|
||||
public PrintStream reduce( Integer value, PrintStream recalTableStream ) {
|
||||
return recalTableStream; // nothing to do here
|
||||
}
|
||||
|
||||
/**
|
||||
* Write out the full data hashmap to disk in CSV format
|
||||
* @param recalTableStream The PrintStream to write out to
|
||||
*/
|
||||
public void onTraversalDone( PrintStream recalTableStream ) {
|
||||
logger.info( "Writing raw recalibration data..." );
|
||||
outputToCSV( recalTableStream );
|
||||
logger.info( "...done!" );
|
||||
|
||||
recalTableStream.close();
|
||||
}
|
||||
|
||||
/**
 * For each entry (key-value pair) in the data hashmap output the Covariate's values as well as
 * the RecalDatum's data in CSV format.
 * In VALIDATE_OLD_RECALIBRATOR mode the header text, column order, and (sorted) row order are
 * chosen to byte-match the old recalibrator's output; do not change that branch's formatting.
 * @param recalTableStream The PrintStream to write out to
 */
private void outputToCSV( final PrintStream recalTableStream ) {

    if( VALIDATE_OLD_RECALIBRATOR ) {
        // Legacy header, reproduced exactly for diffing against the old recalibrator's output
        recalTableStream.printf("# collapsed_pos false%n");
        recalTableStream.printf("# collapsed_dinuc false%n");
        recalTableStream.printf("# counted_sites %d%n", countedSites);
        recalTableStream.printf("# counted_bases %d%n", countedBases);
        recalTableStream.printf("# skipped_sites %d%n", skippedSites);
        // NOTE(review): if skippedSites is 0 this divides by zero and prints "Infinity" -- confirm acceptable
        recalTableStream.printf("# fraction_skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
        recalTableStream.printf("rg,pos,Qrep,dn,nBases,nMismatches,Qemp%n");
        // For each entry in the data hashmap, in the old recalibrator's sort order
        for( Pair<List<? extends Comparable>,RecalDatum> entry : dataManager.data.entrySetSorted4() ) {
            // For each Covariate in the key
            for( Comparable comp : entry.first ) {
                // Output the Covariate's value
                recalTableStream.print( comp + "," );
            }
            // Output the RecalDatum entry
            recalTableStream.println( entry.second.outputToCSV() );
        }
    } else {
        if( !NO_PRINT_HEADER ) {
            recalTableStream.printf("# Counted Sites %d%n", countedSites);
            recalTableStream.printf("# Counted Bases %d%n", countedBases);
            recalTableStream.printf("# Skipped Sites %d%n", skippedSites);
            // NOTE(review): same divide-by-zero as above when skippedSites is 0 -- confirm acceptable
            recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
            for( Covariate cov : requestedCovariates ) {
                // The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name
                recalTableStream.println( "@!" + cov.getClass().getSimpleName() );
            }
        }
        // For each entry in the data hashmap (unsorted iteration order)
        for( Map.Entry<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySet() ) {
            // For each Covariate in the key
            for( Comparable comp : entry.getKey() ) {
                // Output the Covariate's value; String-valued covariates are suppressed in headerless mode
                if( NO_PRINT_HEADER && comp instanceof String ) { continue; } // BUGBUG
                recalTableStream.print( comp + "," );
            }
            // Output the RecalDatum entry
            recalTableStream.println( entry.getValue().outputToCSV() );
        }
    }
}
|
||||
}
|
||||
|
||||
|
|
@ -1,204 +0,0 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 6, 2009
|
||||
*
|
||||
* This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions.
|
||||
*/
|
||||
|
||||
public class RecalDataManager {
    public NHashMap<RecalDatum> data; // the full dataset; may be null when only collapsed tables are built (see 3-arg constructor)
    private NHashMap<RecalDatum> dataCollapsedReadGroup; // table where everything except read group has been collapsed
    private NHashMap<RecalDatum> dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed
    private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed
    public NHashMap<Double> dataSumExpectedErrors; // table used to calculate the overall aggregate quality score in which everything except read group is collapsed

    private NHashMap<Double> dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed
    private NHashMap<Double> dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed
    private ArrayList<NHashMap<Double>> dataCollapsedByCovariateDouble; // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed

    public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores
    public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams

    // Minimal constructor: just the full data table, no collapsed tables
    RecalDataManager() {
        data = new NHashMap<RecalDatum>();
    }

    // Either build the collapsed tables (second pass) OR the presized full data table (first pass), never both.
    // NOTE(review): when createCollapsedTables is true, 'data' is left null here; callers must not touch
    // 'data' in that mode (generateEmpiricalQualities guards with data != null) -- confirm intentional.
    RecalDataManager( final int estimatedCapacity, final boolean createCollapsedTables, final int numCovariates ) {
        if( createCollapsedTables ) { // initialize all the collapsed tables
            dataCollapsedReadGroup = new NHashMap<RecalDatum>();
            dataCollapsedQualityScore = new NHashMap<RecalDatum>();
            dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>();
            for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
                dataCollapsedByCovariate.add( new NHashMap<RecalDatum>() );
            }
            dataSumExpectedErrors = new NHashMap<Double>();
        } else {
            data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f);
        }
    }

    // Presized full data table only, used by CovariateCounterWalker during the first pass
    RecalDataManager( final int estimatedCapacity ) {
        data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f ); // second arg is the 'loading factor',
        // a number to monkey around with when optimizing performance of the HashMap
    }

    /**
     * Add the given mapping to all of the collapsed hash tables.
     * Key layout assumed throughout: key.get(0) = read group, key.get(1) = reported quality score,
     * key.get(i+2) = the i-th additional covariate -- this mirrors the required ordering enforced
     * by the walker's initialize() -- TODO confirm against caller.
     * @param key The list of comparables that is the key for this mapping
     * @param fullDatum The RecalDatum which is the data for this mapping
     */
    public final void addToAllTables( final List<? extends Comparable> key, final RecalDatum fullDatum ) {

        // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
        //data.put(key, thisDatum); // add the mapping to the main table

        // create dataCollapsedReadGroup, the table where everything except read group has been collapsed
        ArrayList<Comparable> newKey = new ArrayList<Comparable>();
        newKey.add( key.get(0) ); // make a new key with just the read group
        RecalDatum collapsedDatum = dataCollapsedReadGroup.get( newKey );
        if( collapsedDatum == null ) {
            dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) );
        } else {
            collapsedDatum.increment(fullDatum);
        }

        // create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed
        newKey = new ArrayList<Comparable>();
        newKey.add( key.get(0) ); // make a new key with just the read group
        Double sumExpectedErrors = dataSumExpectedErrors.get( newKey );
        if( sumExpectedErrors == null ) {
            // NOTE(review): this stores 0.0 rather than this datum's expected-error contribution, so the
            // first datum seen for each read group appears to be dropped from the sum -- confirm whether
            // this is intentional (e.g. matching the old recalibrator) or a bug.
            dataSumExpectedErrors.put( newKey, 0.0 );
        } else {
            // Double is immutable so remove, accumulate, and re-insert
            dataSumExpectedErrors.remove( newKey );
            sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * fullDatum.getNumObservations();
            dataSumExpectedErrors.put( newKey, sumExpectedErrors );
        }

        newKey = new ArrayList<Comparable>();
        // create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
        newKey.add( key.get(0) ); // make a new key with the read group ...
        newKey.add( key.get(1) ); // and quality score
        collapsedDatum = dataCollapsedQualityScore.get( newKey );
        if( collapsedDatum == null ) {
            dataCollapsedQualityScore.put( newKey, new RecalDatum(fullDatum) );
        } else {
            collapsedDatum.increment(fullDatum);
        }

        // create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
        for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { // readGroup and QualityScore aren't counted
            newKey = new ArrayList<Comparable>();
            newKey.add( key.get(0) ); // make a new key with the read group ...
            newKey.add( key.get(1) ); // and quality score ...
            newKey.add( key.get(iii + 2) ); // and the given covariate
            collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey );
            if( collapsedDatum == null ) {
                dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) );
            } else {
                collapsedDatum.increment(fullDatum);
            }
        }
    }

    /**
     * Loop over all the collapsed tables and turn the RecalDatums found there into an empirical quality score
     * that will be used in the sequential calculation in TableRecalibrationWalker.
     * After the Double tables are built, the RecalDatum tables (except dataCollapsedReadGroup) and the full
     * data table are cleared and nulled to free memory -- this object is effectively single-use.
     * @param numCovariates The number of covariates you have determines the number of tables to create
     * @param smoothing The smoothing parameter that goes into empirical quality score calculation
     */
    public final void generateEmpiricalQualities( final int numCovariates, final int smoothing ) {

        dataCollapsedReadGroupDouble = new NHashMap<Double>();
        dataCollapsedQualityScoreDouble = new NHashMap<Double>();
        dataCollapsedByCovariateDouble = new ArrayList<NHashMap<Double>>();
        for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
            dataCollapsedByCovariateDouble.add( new NHashMap<Double>() );
        }

        // Hash the empirical quality scores so we don't have to call Math.log at every base for every read
        // Looping over the entrySet is really expensive but worth it
        for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) {
            dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
        }
        for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) {
            dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
        }
        for( int iii = 0; iii < numCovariates - 2; iii++ ) {
            for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedByCovariate.get(iii).entrySet() ) {
                dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
            }
        }

        // Free the raw count tables; only the Double (empirical quality) tables are needed from here on
        dataCollapsedQualityScore.clear();
        dataCollapsedByCovariate.clear();
        dataCollapsedQualityScore = null; // will never need this again
        dataCollapsedByCovariate = null; // will never need this again
        if( data!=null ) { // data is null when this manager was built with createCollapsedTables == true
            data.clear();
            data = null; // will never need this again
        }
    }

    /**
     * Get the appropriate collapsed table out of the set of all the tables held by this Object.
     * Index convention: 0 = read group table, 1 = quality score table, i >= 2 = (i-2)-th extra covariate.
     * @param covariate Which covariate indexes the desired collapsed HashMap
     * @return The desired collapsed HashMap
     */
    public final NHashMap<RecalDatum> getCollapsedTable( final int covariate ) {
        if( covariate == 0) {
            return dataCollapsedReadGroup; // table where everything except read group has been collapsed
        } else if( covariate == 1 ) {
            return dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed
        } else {
            return dataCollapsedByCovariate.get( covariate - 2 ); // table where everything except read group, quality score, and given covariate has been collapsed
        }
    }

    /**
     * Get the appropriate collapsed table of empirical quality out of the set of all the tables held by this Object.
     * Same index convention as getCollapsedTable; valid only after generateEmpiricalQualities has run.
     * @param covariate Which covariate indexes the desired collapsed NHashMap<Double>
     * @return The desired collapsed NHashMap<Double>
     */
    public final NHashMap<Double> getCollapsedDoubleTable( final int covariate ) {
        if( covariate == 0) {
            return dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed
        } else if( covariate == 1 ) {
            return dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed
        } else {
            return dataCollapsedByCovariateDouble.get( covariate - 2 ); // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
        }
    }
}
|
||||
|
|
@ -1,454 +0,0 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 3, 2009
|
||||
*
|
||||
* This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal.
|
||||
|
||||
* For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc)
|
||||
* Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read.
|
||||
* This walker then outputs a new bam file with these updated (recalibrated) reads.
|
||||
*
|
||||
* Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker.
|
||||
* Note: This walker is designed to be used in conjunction with CovariateCounterWalker.
|
||||
*/
|
||||
|
||||
@WalkerName("TableRecalibrationRefactored")
|
||||
@Requires({ DataSource.READS, DataSource.REFERENCE }) // This walker requires -I input.bam, it also requires -R reference.fasta, but by not saying @requires REFERENCE_BASES I'm telling the
|
||||
// GATK to not actually spend time giving me the refBase array since I don't use it
|
||||
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||
|
||||
@Argument(fullName="recal_file", shortName="recalFile", doc="Input recalibration table file generated by CountCovariates", required=true)
|
||||
private String RECAL_FILE = null;
|
||||
@Argument(fullName="output_bam", shortName="outputBam", doc="output BAM file", required=false)
|
||||
private SAMFileWriter OUTPUT_BAM = null;
|
||||
@Argument(fullName="preserve_qscores_less_than", shortName="pQ",
|
||||
doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
|
||||
private int PRESERVE_QSCORES_LESS_THAN = 5;
|
||||
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
|
||||
private boolean USE_ORIGINAL_QUALS = false;
|
||||
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
|
||||
private int WINDOW_SIZE = 3;
|
||||
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
|
||||
private int SMOOTHING = 1;
|
||||
@Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. For debugging purposes only.")
|
||||
private boolean VALIDATE_OLD_RECALIBRATOR = false;
|
||||
@Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.")
|
||||
private boolean USE_SLX_PLATFORM = false;
|
||||
|
||||
//public enum RecalibrationMode {
|
||||
// COMBINATORIAL,
|
||||
// SEQUENTIAL,
|
||||
// ERROR
|
||||
//}
|
||||
|
||||
//@Argument(fullName="recalibrationMode", shortName="mode", doc="Which calculation to use when recalibrating, default is SEQUENTIAL", required=false)
|
||||
private String MODE_STRING = "SEQUENTIAL";
|
||||
//public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes?
|
||||
|
||||
private RecalDataManager dataManager;
|
||||
private ArrayList<Covariate> requestedCovariates;
|
||||
private ArrayList<Comparable> fullCovariateKey; // the list that will be used over and over again to hold the full set of covariate values
|
||||
private ArrayList<Comparable> collapsedTableKey; // the key that will be used over and over again to query the collapsed tables
|
||||
|
||||
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*");
|
||||
|
||||
//private final String versionNumber = "2.0.0"; // major version, minor version, and build number
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
    /**
     * Read in the recalibration table input file.
     * Parse the list of covariate classes used during CovariateCounterWalker.
     * Parse the CSV data and populate the hashmap.
     *
     * Recognized line types, in order of precedence:
     *   - comment lines ('#'...) and the old recalibrator's header ("rg,...") are skipped
     *   - covariate declaration lines ("@!ClassName") instantiate the matching Covariate
     *   - everything else is CSV data, handed to addCSVData()
     */
    public void initialize() {

        //logger.info( "TableRecalibrationWalker version: " + versionNumber );
        // Get a list of all available covariates; declaration lines are matched against these by simple name
        List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);

        int lineNumber = 0;               // tracked only for the NumberFormatException message below
        boolean foundAllCovariates = false; // flips to true on the first data line; declarations after that are malformed
        int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one

        // Warn the user if a dbSNP file was specified since it isn't being used here
        boolean foundDBSNP = false;
        for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
            if( rod.getName().equalsIgnoreCase( "dbsnp" ) ) {
                foundDBSNP = true;
            }
        }
        if( foundDBSNP ) {
            Utils.warnUser("A dbSNP rod file was specified but TableRecalibrationWalker doesn't make use of it.");
        }

        // Reusable key buffers; cleared and refilled per base / per lookup in map()
        fullCovariateKey = new ArrayList<Comparable>();
        collapsedTableKey = new ArrayList<Comparable>();

        // Read in the covariates that were used from the input file
        requestedCovariates = new ArrayList<Covariate>();

        // Read in the data from the csv file and populate the map
        logger.info( "Reading in the data from input file..." );

        try {
            for ( String line : new xReadLines(new File( RECAL_FILE )) ) {
                lineNumber++;
                if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
                    ; // skip over the comment lines, (which start with '#')
                }
                else if( COVARIATE_PATTERN.matcher(line).matches() ) { // the line string is either specifying a covariate or is giving csv data
                    if( foundAllCovariates ) {
                        throw new StingException( "Malformed input recalibration file. Found covariate names intermingled with data. " + RECAL_FILE );
                    } else { // found another covariate in input file
                        boolean foundClass = false;
                        for( Class<?> covClass : classes ) {
                            if( line.equalsIgnoreCase( "@!" + covClass.getSimpleName() ) ) { // the "@!" was added by CovariateCounterWalker as a code to recognize covariate class names
                                foundClass = true;
                                try {
                                    Covariate covariate = (Covariate)covClass.newInstance();
                                    // some covariates need parameters (user supplied command line arguments) passed to them
                                    if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
                                    requestedCovariates.add( covariate );
                                    estimatedCapacity *= covariate.estimatedNumberOfBins();
                                } catch ( InstantiationException e ) {
                                    throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
                                } catch ( IllegalAccessException e ) {
                                    throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
                                }
                            }
                        }
                        if( !foundClass ) {
                            throw new StingException( "Malformed input recalibration file. The requested covariate type (" + line + ") isn't a valid covariate option." );
                        }
                    }
                } else { // found some data
                    if( !foundAllCovariates ) {
                        // First data line: finalize the covariate list and allocate the tables
                        if( VALIDATE_OLD_RECALIBRATOR ) {
                            // Old-recalibrator tables carry no "@!" declarations, so hard-code its fixed covariate set
                            requestedCovariates.add( new ReadGroupCovariate() );
                            requestedCovariates.add( new QualityScoreCovariate() );
                            requestedCovariates.add( new CycleCovariate() );
                            requestedCovariates.add( new DinucCovariate() );
                        }
                        foundAllCovariates = true;
                        if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception
                        final boolean createCollapsedTables = true;
                        // Initialize the data hashMaps
                        dataManager = new RecalDataManager( estimatedCapacity, createCollapsedTables, requestedCovariates.size() );
                    }
                    addCSVData(line); // parse the line and add the data to the HashMap
                }
            }
        } catch ( FileNotFoundException e ) {
            Utils.scareUser("Can not find input file: " + RECAL_FILE);
        } catch ( NumberFormatException e ) {
            throw new StingException("Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
        }
        logger.info( "...done!" );

        logger.info( "The covariates being used here: " );
        logger.info( requestedCovariates );

        // Create the collapsed tables that are used in the sequential calculation
        if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
            logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
            dataManager.generateEmpiricalQualities( requestedCovariates.size(), SMOOTHING );
            logger.info( "...done!" );
        }
    }
|
||||
|
||||
/**
|
||||
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
|
||||
* @param line A line of CSV data read from the recalibration table data file
|
||||
*/
|
||||
private void addCSVData(String line) {
|
||||
String[] vals = line.split(",");
|
||||
ArrayList<Comparable> key = new ArrayList<Comparable>();
|
||||
Covariate cov;
|
||||
int iii;
|
||||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
||||
cov = requestedCovariates.get( iii );
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
if( iii == 1 ) { // order is different in the old recalibrator
|
||||
key.add( cov.getValue( vals[2] ) );
|
||||
} else if ( iii == 2 ) {
|
||||
key.add( cov.getValue( vals[1] ) );
|
||||
} else {
|
||||
key.add( cov.getValue( vals[iii] ) );
|
||||
}
|
||||
} else {
|
||||
key.add( cov.getValue( vals[iii] ) );
|
||||
}
|
||||
}
|
||||
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ) );
|
||||
dataManager.addToAllTables( key, datum );
|
||||
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* For each base in the read calculate a new recalibrated quality score and replace the quality scores in the read
|
||||
* @param refBases References bases over the length of the read
|
||||
* @param read The read to be recalibrated
|
||||
* @return The read with quality scores replaced
|
||||
*/
|
||||
public SAMRecord map( char[] refBases, SAMRecord read ) {
|
||||
|
||||
// WARNING: refBases is always null because this walker doesn't have @Requires({DataSource.REFERENCE_BASES})
|
||||
// This is done in order to speed up the code
|
||||
|
||||
byte[] originalQuals = read.getBaseQualities();
|
||||
// Check if we need to use the original quality scores instead
|
||||
if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
|
||||
if ( obj instanceof String )
|
||||
originalQuals = QualityUtils.fastqToPhred((String)obj);
|
||||
else {
|
||||
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
}
|
||||
byte[] recalQuals = originalQuals.clone();
|
||||
|
||||
// These calls are all expensive so only do them once for each read
|
||||
final SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
final String readGroupId = readGroup.getReadGroupId();
|
||||
String platform = readGroup.getPlatform();
|
||||
final boolean isNegStrand = read.getReadNegativeStrandFlag();
|
||||
if( USE_SLX_PLATFORM ) {
|
||||
platform = "ILLUMINA";
|
||||
}
|
||||
byte[] bases = read.getReadBases();
|
||||
int startPos = 1;
|
||||
int stopPos = bases.length;
|
||||
|
||||
if( isNegStrand ) { // DinucCovariate is responsible for getting the complement base if needed
|
||||
startPos = 0;
|
||||
stopPos = bases.length - 1;
|
||||
}
|
||||
|
||||
ReadHashDatum readDatum = new ReadHashDatum( readGroupId, platform, originalQuals, bases, isNegStrand, read.getMappingQuality(), bases.length );
|
||||
|
||||
// For each base in the read
|
||||
for( int iii = startPos; iii < stopPos; iii++ ) { // skip first or last base because there is no dinuc depending on the direction of the read
|
||||
|
||||
// Get the covariate values which make up the key
|
||||
for( Covariate covariate : requestedCovariates ) {
|
||||
fullCovariateKey.add( covariate.getValue( readDatum, iii ) ); // offset is zero based so passing iii is correct here
|
||||
}
|
||||
|
||||
recalQuals[iii] = performSequentialQualityCalculation( fullCovariateKey );
|
||||
fullCovariateKey.clear();
|
||||
}
|
||||
|
||||
preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low
|
||||
|
||||
// SOLID bams insert the reference base into the read if the color space quality is zero, so don't change their base quality scores
|
||||
if( platform.equalsIgnoreCase("SOLID") ) {
|
||||
byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
|
||||
if(colorSpaceQuals != null) { preserveBadColorSpaceQualities_SOLID( originalQuals, recalQuals, colorSpaceQuals ); }
|
||||
}
|
||||
|
||||
read.setBaseQualities(recalQuals); // overwrite old qualities with new recalibrated qualities
|
||||
if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // save the old qualities if the tag isn't already taken in the read
|
||||
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals));
|
||||
}
|
||||
|
||||
return read;
|
||||
}
|
||||
|
||||
    /**
     * Implements a serial recalibration of the reads using the combinational table.
     * First, we perform a positional recalibration, and then a subsequent dinuc correction.
     *
     * Given the full recalibration table, we perform the following preprocessing steps:
     *
     *   - calculate the global quality score shift across all data [DeltaQ]
     *   - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
     *      -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
     *   - The final shift equation is:
     *
     *      Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
     *
     * Key layout: element 0 is the read group, element 1 the reported quality score,
     * elements 2+ the remaining covariates. The shared collapsedTableKey buffer is grown
     * to [readGroup], then [readGroup, qual], then [readGroup, qual, covariate] per covariate
     * (slot 2 is swapped each iteration), and cleared before returning.
     *
     * @param key The list of Comparables that were calculated from the covariates
     * @return A recalibrated quality score as a byte
     */
    private byte performSequentialQualityCalculation( List<? extends Comparable> key ) {

        String readGroupKeyElement = key.get(0).toString();
        int qualityScoreKeyElement = Integer.parseInt(key.get(1).toString());
        byte qualFromRead = (byte)qualityScoreKeyElement;

        // The global quality shift (over the read group only)
        collapsedTableKey.add( readGroupKeyElement );
        RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get(collapsedTableKey);
        Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get(collapsedTableKey);
        double globalDeltaQ = 0.0; // missing table entries contribute a zero shift
        double aggregrateQreported = 0.0;
        if( globalDeltaQDatum != null ) {
            // Mean reported quality for the read group, phred-scaled from the summed expected error rate
            aggregrateQreported = QualityUtils.phredScaleErrorRate( dataManager.dataSumExpectedErrors.get(collapsedTableKey) / ((double) globalDeltaQDatum.getNumObservations()) );
            globalDeltaQ = globalDeltaQEmpirical - aggregrateQreported;
        }

        // The shift in quality between reported and empirical
        collapsedTableKey.add( qualityScoreKeyElement );
        Double deltaQReportedEmpirical = dataManager.getCollapsedDoubleTable(1).get(collapsedTableKey);
        double deltaQReported = 0.0;
        if( deltaQReportedEmpirical != null ) {
            deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
        }

        // The shift in quality due to each covariate by itself in turn
        double deltaQCovariates = 0.0;
        Double deltaQCovariateEmpirical;
        double deltaQPos = 0.0;   // only populated under VALIDATE_OLD_RECALIBRATOR; read by the commented-out printouts below
        double deltaQDinuc = 0.0;
        for( int iii = 2; iii < key.size(); iii++ ) {
            collapsedTableKey.add( key.get(iii) ); // the given covariate
            deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get(collapsedTableKey);
            if( deltaQCovariateEmpirical != null ) {
                deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
                if( VALIDATE_OLD_RECALIBRATOR ) {
                    if(iii==2) { deltaQPos = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); } // BUGBUG: only here to validate against the old recalibrator
                    if(iii==3) { deltaQDinuc = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); }
                }
            }
            collapsedTableKey.remove( 2 ); // this new covariate is always added in at position 2 in the collapsedTableKey list
        }

        double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
        byte newQualityByte = QualityUtils.boundQual( (int)Math.round(newQuality), QualityUtils.MAX_REASONABLE_Q_SCORE );


        // Verbose printouts used to validate with old recalibrator
        //if(key.contains(null)) {
        //    System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d",
        //            qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte));
        //}
        //else {
        //    System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d",
        //            key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) );
        //}

        collapsedTableKey.clear();
        return newQualityByte;
    }
|
||||
|
||||
/**
|
||||
* Loop of the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
|
||||
* @param originalQuals The list of original base quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
*/
|
||||
private void preserveQScores( final byte[] originalQuals, byte[] recalQuals ) {
|
||||
for( int iii = 0; iii < recalQuals.length; iii++ ) {
|
||||
if ( originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN ) {
|
||||
recalQuals[iii] = originalQuals[iii];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loop of the list of qualities and overwrite the newly recalibrated score to be the original score if the color space quality is zero
|
||||
* @param originalQuals The list of original base quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
* @param colorSpaceQuals The list of color space quality scores for this read
|
||||
*/
|
||||
private void preserveBadColorSpaceQualities_SOLID( final byte[] originalQuals, byte[] recalQuals, final byte[] colorSpaceQuals ) {
|
||||
for( int iii = 0; iii < recalQuals.length; iii++ ) {
|
||||
if ( colorSpaceQuals[iii] <= 0 ) { //BUGBUG: This isn't exactly correct yet
|
||||
recalQuals[iii] = originalQuals[iii];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Nothing start the reduce with a new handle to the output bam file
|
||||
* @return A FileWriter pointing to a new bam file
|
||||
*/
|
||||
public SAMFileWriter reduceInit() {
|
||||
return OUTPUT_BAM;
|
||||
}
|
||||
|
||||
/**
|
||||
* Output each read to disk
|
||||
* @param read The read to output
|
||||
* @param output The FileWriter to write the read to
|
||||
* @return The FileWriter
|
||||
*/
|
||||
public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) {
|
||||
if ( output != null ) {
|
||||
output.addAlignment(read);
|
||||
} else {
|
||||
out.println(read.format());
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,184 +0,0 @@
|
|||
// our package
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import java.util.*;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
|
||||
// the imports for unit testing.
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileReader;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.File;
|
||||
|
||||
/**
 * Basic unit test for CovariateCounter: verifies setup flags and that per-base
 * updates land in the expected (readGroup, pos, qual, dinuc) recalibration bins.
 */
public class CovariateCounterTest extends BaseTest {
    // Two distinct read groups so tests can check per-read-group binning
    String readGroup1 = "rg1";
    String readGroup2 = "rg2";
    Set<String> readGroups = new HashSet<String>();

    SAMFileHeader header;

    SAMRecord read1, read2, read3;

    // read1 and read3 share bases1; quals3 differs from quals1 only at offsets 2..4
    byte bases1[] = {'a', 't', 'c', 'g', 'a'};
    byte quals1[] = {1, 2, 3, 4, 5};
    byte quals3[] = {1, 2, 5, 5, 5};
    byte bases2[] = {'t', 'c', 'g', 'a', 't'};
    byte quals2[] = {2, 2, 4, 5, 2};

    /*
    public CovariateCounter( Set<String> readGroups, boolean collapsePos, boolean collapseDinuc ) {
    public Set<String> getReadGroups() {
    public boolean isCollapseDinuc() {
    public boolean isCollapsePos() {
    public int getNReadGroups() {
    private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
    public List<RecalData> getRecalData(String readGroup) {
    public int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) {
    */

    /**
     * The fasta, for comparison.
     */
    protected static IndexedFastaSequenceFile sequenceFile = null;

    CovariateCounter c; // fresh, uncollapsed counter built in initializeBefore() for every test

    /**
     * Initialize the fasta.
     */
    @BeforeClass
    public static void initialize() throws FileNotFoundException {
        sequenceFile = new IndexedFastaSequenceFile( new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") );
        GenomeLocParser.setupRefContigOrdering(sequenceFile);

    }

    @Before
    public void initializeBefore() {
        header = ArtificialSAMUtils.createArtificialSamHeader(2,0,247249719);
        readGroups.addAll(Arrays.asList(readGroup1, readGroup2));
        ArtificialSAMUtils.createDefaultReadGroup( header, readGroup1, "sample1" );
        ArtificialSAMUtils.createDefaultReadGroup( header, readGroup2, "sample2" );
        // collapsePos = false, collapseDinuc = false, assumeFaultyHeader = false
        c = new CovariateCounter( readGroups, false, false, false );

        read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1, bases1, quals1);
        read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",1,1, bases2, quals2);
        read3 = ArtificialSAMUtils.createArtificialRead(header,"read3",1,1, bases1, quals3);
    }


    @Test
    public void testCovariateCounterSetup() {
        Assert.assertEquals("Number of read groups is wrong", c.getNReadGroups(), 2);
        Assert.assertEquals("Read group identities are wrong", c.getReadGroups(), readGroups);
        Assert.assertEquals("Incorrectly collapsed counter", c.isCollapseDinuc(), false);
        Assert.assertEquals("Incorrectly collapsed counter", c.isCollapsePos(), false);
    }

    @Test
    public void testOneRead() {
        // updates start at offset 1: offset 0 has no previous base, so no dinuc
        for ( int i = 1; i < read1.getReadBases().length; i++ )
            c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
        c.printState();

        // the offset-0 bin must stay empty since it was never updated
        Assert.assertEquals("Incorrect mapping to recal bin", c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N, 0);
        for ( int i = 1; i < bases1.length; i++ ) {
            RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]);
            System.out.printf("%s%n", datum);
            Assert.assertNotNull("Incorrect mapping to recal bin", datum);
            Assert.assertEquals("Bad mismatch count", datum.B, 0); // ref base == read base above, so no mismatches
            Assert.assertEquals("Bad base count", datum.N, 1);
            Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]);
            Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]);
            Assert.assertEquals("Qual is bad", datum.qual, quals1[i]);
        }
    }

    @Test
    public void testTwoReads() {
        // each read goes to a different read group, so readGroup1's bins see only read1
        for ( int i = 1; i < read1.getReadBases().length; i++ )
            c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
        for ( int i = 1; i < read2.getReadBases().length; i++ )
            c.updateDataFromRead(readGroup2, read2, i, (char)read2.getReadBases()[i], false);
        c.printState();

        Assert.assertEquals("Incorrect mapping to recal bin", c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N, 0);
        for ( int i = 1; i < bases1.length; i++ ) {
            RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]);
            System.out.printf("%s%n", datum);
            Assert.assertNotNull("Incorrect mapping to recal bin", datum);
            Assert.assertEquals("Bad mismatch count", datum.B, 0);
            Assert.assertEquals("Bad base count", datum.N, 1); // still 1: read2 went to readGroup2
            Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]);
            Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]);
            Assert.assertEquals("Qual is bad", datum.qual, quals1[i]);
        }
    }

    @Test
    public void testTwoReadsSameGroup() {
        for ( int i = 1; i < read1.getReadBases().length; i++ )
            c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
        // NOTE(review): this loop is bounded by read2's length but updates read1 again;
        // the lengths happen to match (both 5) so behavior is the same -- confirm intent
        for ( int i = 1; i < read2.getReadBases().length; i++ )
            c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
        c.printState();

        for ( int i = 1; i < bases1.length; i++ ) {
            RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]);
            System.out.printf("%s%n", datum);
            Assert.assertNotNull("Incorrect mapping to recal bin", datum);
            Assert.assertEquals("Bad mismatch count", datum.B, 0);
            Assert.assertEquals("Bad base count", datum.N, 2); // same read counted twice in the same group
            Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]);
            Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]);
            Assert.assertEquals("Qual is bad", datum.qual, quals1[i]);
        }
    }

    @Test
    public void testTwoReadsSameGroupNotIdentical() {
        for ( int i = 1; i < read1.getReadBases().length; i++ )
            c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
        for ( int i = 1; i < read3.getReadBases().length; i++ )
            c.updateDataFromRead(readGroup1, read3, i, (char)read3.getReadBases()[i], false);
        c.printState();

        for ( int i = 1; i < bases1.length; i++ ) {
            RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]);
            System.out.printf("%s%n", datum);
            Assert.assertNotNull("Incorrect mapping to recal bin", datum);
            Assert.assertEquals("Bad mismatch count", datum.B, 0);
            // reads share a bin only where their quals agree; otherwise read3 landed in a different bin
            Assert.assertEquals("Bad base count", datum.N, quals1[i] == quals3[i] ? 2 : 1);
            Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]);
            Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]);
            Assert.assertEquals("Qual is bad", datum.qual, quals1[i]);
        }
    }

    @Test (expected = RuntimeException.class)
    public void testBadReadOffset() {
        byte bases[] = {'a', 't', 'c', 'g', 'a'};
        byte quals[] = {1, 2, 3, 4, 5};

        SAMRecord read = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1, bases, quals);

        // offset 0 has no previous base for the dinuc, so the counter is expected to throw
        c.updateDataFromRead(readGroup1, read, 0, (char)bases[0], false);
    }

}
|
||||
|
|
@ -1,183 +0,0 @@
|
|||
// our package
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
import org.junit.Before;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import java.util.HashSet;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Basic unit test for RecalData
|
||||
*/
|
||||
public class RecalDataTest extends BaseTest {
|
||||
RecalData datum, starDatum;
|
||||
|
||||
    /**
     * Build two fresh RecalData bins before each test: one with a concrete dinuc ("AA")
     * and one with the collapsed-dinuc marker ("**"), both at pos 2, qual 3, read group "0".
     */
    @Before
    public void before() {
        datum = new RecalData(2, 3, "0", "AA");
        datum.B = datum.N = 1; // one observed base, one mismatch
        starDatum = new RecalData(2, 3, "0", "**");
        starDatum.B = starDatum.N = 1;
    }
|
||||
|
||||
|
||||
@Test
|
||||
public void testBasic() {
|
||||
logger.warn("Executing testIsBetween");
|
||||
|
||||
Assert.assertTrue(datum.N == 1);
|
||||
Assert.assertTrue(datum.B == 1);
|
||||
Assert.assertTrue(datum.pos == 2);
|
||||
Assert.assertTrue(datum.qual == 3);
|
||||
Assert.assertTrue(datum.readGroup.equals("0"));
|
||||
Assert.assertTrue(datum.dinuc.equals("AA"));
|
||||
|
||||
Assert.assertTrue(starDatum.N == 1);
|
||||
Assert.assertTrue(starDatum.B == 1);
|
||||
Assert.assertTrue(starDatum.pos == 2);
|
||||
Assert.assertTrue(starDatum.qual == 3);
|
||||
Assert.assertTrue(starDatum.readGroup.equals("0"));
|
||||
Assert.assertTrue(starDatum.dinuc.equals("**"));
|
||||
}
|
||||
|
||||
    /**
     * Verifies both inc() overloads: the (long, long) form adds raw observation/mismatch
     * counts, while the (char, char) form counts one observation and a mismatch only
     * when the two bases differ.
     */
    @Test
    public void testInc() {
        logger.warn("Executing testInc");

        datum.inc(1L, 0L); // +1 observation, +0 mismatches
        Assert.assertTrue(datum.N == 2);
        Assert.assertTrue(datum.B == 1);

        datum.inc('A', 'A'); // matching bases: observation only
        Assert.assertTrue(datum.N == 3);
        Assert.assertTrue(datum.B == 1);

        datum.inc('A', 'C'); // mismatching bases: observation and mismatch
        Assert.assertTrue(datum.N == 4);
        Assert.assertTrue(datum.B == 2);
    }
|
||||
|
||||
|
||||
@Test
|
||||
public void testEmpQual() {
|
||||
logger.warn("Executing testEmpQual");
|
||||
|
||||
datum.B = 1;
|
||||
datum.N = 1;
|
||||
Assert.assertEquals(datum.empiricalQualDouble(), 0.0, 1e-5);
|
||||
Assert.assertEquals(datum.empiricalQualByte(), 1);
|
||||
|
||||
datum.B = 1;
|
||||
datum.N = 2;
|
||||
Assert.assertEquals(datum.empiricalQualDouble(), 3.0103, 1e-5);
|
||||
Assert.assertEquals(datum.empiricalQualByte(), 3);
|
||||
|
||||
datum.B = 1;
|
||||
datum.N = 3;
|
||||
Assert.assertEquals(datum.empiricalQualDouble(), 4.771213, 1e-5);
|
||||
Assert.assertEquals(datum.empiricalQualByte(), 5);
|
||||
|
||||
datum.B = 1;
|
||||
datum.B = 2;
|
||||
Assert.assertEquals(datum.empiricalQualDouble(), 1.760913, 1e-5);
|
||||
Assert.assertEquals(datum.empiricalQualByte(), 2);
|
||||
|
||||
datum.B = 1;
|
||||
datum.N = 10;
|
||||
Assert.assertEquals(datum.empiricalQualDouble(), 10.0, 1e-5);
|
||||
Assert.assertEquals(datum.empiricalQualByte(), 10);
|
||||
|
||||
datum.B = 1;
|
||||
datum.N = 100;
|
||||
Assert.assertEquals(datum.empiricalQualDouble(), 20.0, 1e-5);
|
||||
Assert.assertEquals(datum.empiricalQualByte(), 20);
|
||||
|
||||
datum.B = 1;
|
||||
datum.N = 1000;
|
||||
Assert.assertEquals(datum.empiricalQualDouble(), 30.0, 1e-5);
|
||||
Assert.assertEquals(datum.empiricalQualByte(), 30);
|
||||
|
||||
datum.B = 0;
|
||||
datum.N = 1000;
|
||||
Assert.assertEquals(datum.empiricalQualDouble(), QualityUtils.MAX_REASONABLE_Q_SCORE, 1e-5);
|
||||
Assert.assertEquals(datum.empiricalQualByte(), QualityUtils.MAX_REASONABLE_Q_SCORE);
|
||||
}
|
||||
|
||||
|
||||
public void testtoCSVString() {
|
||||
logger.warn("Executing testtoCSVString");
|
||||
|
||||
Assert.assertEquals(datum.toCSVString(false), "0,AA,3,2,1,1,0");
|
||||
Assert.assertEquals(datum.toCSVString(true), "0,AA,3,*,1,1,0");
|
||||
}
|
||||
|
||||
|
||||
public void testFromCSVString() {
|
||||
logger.warn("Executing testFromCSVString");
|
||||
|
||||
Assert.assertEquals(RecalData.fromCSVString("0,AA,3,2,1,1,0").toCSVString(false), datum.toCSVString(false));
|
||||
Assert.assertEquals(RecalData.fromCSVString("0,AA,3,*,1,1,0").toCSVString(false), datum.toCSVString(true));
|
||||
Assert.assertEquals(RecalData.fromCSVString("0,**,3,2,1,1,0").toCSVString(false), starDatum.toCSVString(false));
|
||||
Assert.assertEquals(RecalData.fromCSVString("0,**,3,*,1,1,0").toCSVString(false), starDatum.toCSVString(true));
|
||||
}
|
||||
|
||||
public void testDinucIndex() {
|
||||
logger.warn("Executing testDinucIndex");
|
||||
|
||||
HashSet<Integer> indices = new HashSet<Integer>();
|
||||
HashSet<Byte> unknownBytes = new HashSet<Byte>();
|
||||
byte bases[] = {'A', 'C', 'G', 'T', '*', 'N'};
|
||||
unknownBytes.add((byte)'*');
|
||||
unknownBytes.add((byte)'N');
|
||||
|
||||
for ( int i = 0; i < bases.length; i++ ) {
|
||||
for ( int j = 0; j < bases.length; j++ ) {
|
||||
byte[] bp = {bases[i], bases[j]};
|
||||
String s = new String(bp);
|
||||
int index = RecalData.dinucIndex(s);
|
||||
indices.add(index);
|
||||
Assert.assertEquals(index, RecalData.dinucIndex(bases[i], bases[j]));
|
||||
if ( index != -1 ) {
|
||||
Assert.assertEquals(RecalData.dinucIndex2bases(index), s);
|
||||
} else {
|
||||
Assert.assertTrue(unknownBytes.contains(bp[0]) || unknownBytes.contains(bp[1]) );
|
||||
}
|
||||
}
|
||||
}
|
||||
Assert.assertEquals(indices.size(), 17);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -29,7 +29,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
" -T CountCovariates" +
|
||||
" -I " + bam +
|
||||
" -L 1:10,000,000-11,000,000" +
|
||||
" --params %s",
|
||||
" --validate_old_recalibrator" +
|
||||
" --use_slx_platform" +
|
||||
" -recalFile %s",
|
||||
1, // just one output file
|
||||
Arrays.asList(md5));
|
||||
List<File> result = executeTest("testCountCovariates1", spec).getFirst();
|
||||
|
|
@ -56,8 +58,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
" -T TableRecalibration" +
|
||||
" -I " + bam +
|
||||
" -L 1:10,000,000-20,000,000" +
|
||||
" --outputBam %s" +
|
||||
" --params " + paramsFile,
|
||||
" --validate_old_recalibrator" +
|
||||
" --use_slx_platform" +
|
||||
" -outputBam %s" +
|
||||
" -recalFile " + paramsFile,
|
||||
1, // just one output file
|
||||
Arrays.asList(md5));
|
||||
executeTest("testTableRecalibrator1", spec);
|
||||
|
|
|
|||
|
|
@ -1,71 +0,0 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.io.File;
|
||||
|
||||
public class RefactoredRecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||
static HashMap<String, String> paramsFiles = new HashMap<String, String>();
|
||||
|
||||
@Test
|
||||
public void testCountCovariatesR() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.SLX.SRP000031.2009_06.chr1.10_20mb.bam", "7be0b7a624d22187e712131f12aae546" );
|
||||
//e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "dc1a1b6e99f4a47525cc1dce7b6eb1dc" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
String md5 = entry.getValue();
|
||||
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R /broad/1KG/reference/human_b36_both.fasta" +
|
||||
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" -T CountCovariatesRefactored" +
|
||||
" -I " + bam +
|
||||
" -L 1:10,000,000-11,000,000" +
|
||||
" --validate_old_recalibrator" +
|
||||
" --use_slx_platform" +
|
||||
" -recalFile %s",
|
||||
1, // just one output file
|
||||
Arrays.asList(md5));
|
||||
List<File> result = executeTest("testCountCovariatesR", spec).getFirst();
|
||||
paramsFiles.put(bam, result.get(0).getAbsolutePath());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTableRecalibratorR() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.SLX.SRP000031.2009_06.chr1.10_20mb.bam", "3ace4b9b8495429cc32e7008145f4996" );
|
||||
//e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "" );
|
||||
//e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
String md5 = entry.getValue();
|
||||
String paramsFile = paramsFiles.get(bam);
|
||||
System.out.printf("PARAMS FOR %s is %s%n", bam, paramsFile);
|
||||
if ( paramsFile != null ) {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-R /broad/1KG/reference/human_b36_both.fasta" +
|
||||
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" -T TableRecalibrationRefactored" +
|
||||
" -I " + bam +
|
||||
" -L 1:10,000,000-20,000,000" +
|
||||
" --validate_old_recalibrator" +
|
||||
" --use_slx_platform" +
|
||||
" -outputBam %s" +
|
||||
" -recalFile " + paramsFile,
|
||||
1, // just one output file
|
||||
Arrays.asList(md5));
|
||||
executeTest("testTableRecalibratorR", spec);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -8,7 +8,6 @@ import org.junit.Assert;
|
|||
import org.junit.Test;
|
||||
import org.junit.Before;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalData;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import java.util.HashSet;
|
||||
import java.util.Arrays;
|
||||
|
|
|
|||
|
|
@ -8,7 +8,6 @@ import org.junit.Assert;
|
|||
import org.junit.Test;
|
||||
import org.junit.Before;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalData;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
|
|
|||
Loading…
Reference in New Issue