The old recalibrator is replaced with the refactored recalibrator. Added a version message to the logger output. These walkers start at version 2.0.0

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2117 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-11-23 14:58:33 +00:00
parent dfe7d69471
commit 1d46de6d34
27 changed files with 981 additions and 2807 deletions

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;

View File

@ -1,148 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.BaseUtils;
import org.apache.log4j.Logger;
import javax.management.RuntimeErrorException;
import java.util.*;
/**
 * Accumulates base-quality recalibration observations, bucketed per read group
 * by machine cycle, reported quality score, and dinucleotide context.
 * Each read group is backed by its own RecalDataManager.
 */
public class CovariateCounter {
// If true, the machine-cycle (position) covariate is collapsed out of the bins.
private boolean collapsePos = false;
// If true, the dinucleotide-context covariate is collapsed out of the bins.
private boolean collapseDinuc = false;
// If true, reads from read groups absent from the header extend the table lazily instead of failing.
private boolean assumeFaultyHeader = false;
// Recalibration data per read group, keyed by read group id.
private HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();
protected static Logger logger = Logger.getLogger(CovariateCounter.class);
/**
 * Sets up one RecalDataManager per read group named in the header.
 *
 * @param readGroups         read group ids to track
 * @param collapsePos        collapse the machine-cycle covariate out of the bins
 * @param collapseDinuc      collapse the dinucleotide covariate out of the bins
 * @param assumeFaultyHeader lazily add unexpected read groups instead of throwing
 */
public CovariateCounter( Set<String> readGroups, boolean collapsePos, boolean collapseDinuc, boolean assumeFaultyHeader ) {
this.collapsePos = collapsePos;
this.collapseDinuc = collapseDinuc;
this.assumeFaultyHeader = assumeFaultyHeader;
for (String readGroup : readGroups ) {
RecalDataManager manager = makeManager(readGroup);
data.put(readGroup, manager);
}
}
// Creates a manager that tracks exactly the covariates that are NOT collapsed.
private RecalDataManager makeManager(final String readGroup) {
return new RecalDataManager(readGroup, ! collapsePos, ! collapseDinuc );
}
/**
 * Returns the set of readGroup names we are counting covariates for
 * @return the read group ids currently tracked
 */
public Set<String> getReadGroups() {
return data.keySet();
}
public boolean isCollapseDinuc() {
return collapseDinuc;
}
public boolean isCollapsePos() {
return collapsePos;
}
/**
 * Returns the number of read groups being managed
 * @return count of tracked read groups
 */
public int getNReadGroups() {
return data.size();
}
/**
 * Get the particular RecalData datum associated with readGroup, at machine pos, with reported
 * quality qual, and with the dinuc context of prevBase, base. If an example of such a
 * base has been seen before, returns the associated RecalData. If not, it creates one, places it in the
 * system so that subsequent requests will return that object, and returns it.
 *
 * @param readGroup read group id of the read this base came from
 * @param pos       machine cycle (0-based)
 * @param qual      reported quality score
 * @param prevBase  base preceding this one in machine order (dinuc context)
 * @param base      the observed base itself
 * @return the (possibly freshly created) RecalData bin for these covariate values
 */
public RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
if ( ! data.containsKey(readGroup) ) {
// Read group not declared in the header: either extend the table or fail hard.
if ( assumeFaultyHeader ) {
logger.info(String.format("Found unexpected read group, but assuming the header was bad, so extending covariates with read group %s", readGroup));
RecalDataManager manager = makeManager(readGroup);
data.put(readGroup, manager);
} else {
throw new RuntimeException(String.format("Unexpected read group %s found, there's something wrong with your BAM file's header", readGroup));
}
}
// Encode the dinucleotide context as a two-character string key.
byte[] cs = {(byte)prevBase, (byte)base};
String s = new String(cs);
return data.get(readGroup).expandingGetRecalData(pos, qual, s, true);
}
/**
 * Get a list of all of the RecalData associated with readGroup
 *
 * @param readGroup read group id
 * @return all recalibration bins collected so far for that read group
 */
public List<RecalData> getRecalData(String readGroup) {
return data.get(readGroup).getAll();
}
/**
 * Updates the recalibration data for the base at offset in the read, associated with readGroup rg.
 * Correctly handles machine orientation of the read. I.e., it adds data not by offset in the read
 * but by implied machine cycle associated with the offset.
 *
 * TODO: this whole system is 0-based and therefore inconsistent with the rest of the GATK, where pos is 1-based
 * TODO: and offset is 0-based. How very annoying.
 *
 * @param rg               read group id
 * @param read             the read containing the base
 * @param offset           0-based offset into the read; must be &gt; 0 so a previous base exists
 * @param ref              reference base at this locus
 * @param useOriginalQuals use the OQ-attribute qualities rather than the QUAL field
 * @return 1 if the base was counted, 0 if it was skipped (reported quality 0)
 */
public int updateDataFromRead( String rg, SAMRecord read, int offset, char ref, boolean useOriginalQuals ) {
if ( offset == 0 )
throw new RuntimeException("Illegal read offset " + offset + " in read " + read.getReadName());
int cycle = offset;
byte[] bases = read.getReadBases();
byte[] quals = RecalDataManager.getQualsForRecalibration(read, useOriginalQuals);
char base = (char)bases[offset];
char prevBase = (char)bases[offset - 1];
if (read.getReadNegativeStrandFlag()) {
// The machine read the reverse complement, so complement the bases; the "previous"
// base in machine order is the FOLLOWING base in reference order, and the machine
// cycle counts from the other end of the read.
// NOTE(review): bases[offset+1] is read unguarded — assumes callers never pass
// offset == readLength-1 for negative-strand reads; verify at call sites.
ref = BaseUtils.simpleComplement(ref);
base = BaseUtils.simpleComplement(base);
prevBase = BaseUtils.simpleComplement((char)bases[offset+1]);
cycle = read.getReadLength() - (offset + 1);
}
int qual = quals[offset];
if ( qual > 0 ) {
RecalData datum = getRecalData(rg, cycle, qual, prevBase, base);
if (datum != null) datum.inc(base,ref);
return 1;
} else {
return 0;
}
}
// Debugging aid: dump every non-empty recalibration bin to stdout.
public void printState() {
for ( String readGroup : getReadGroups() ) {
for ( RecalData datum : getRecalData(readGroup) ) {
if ( datum.N > 0 )
System.out.println(datum);
}
}
}
}

View File

@ -1,292 +1,485 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RODRecordList;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.io.PrintStream;
import java.io.FileNotFoundException;
import java.util.*;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*;
import java.util.*;
import java.io.PrintStream;
import java.io.FileNotFoundException;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
@WalkerName("CountCovariates")
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 3, 2009
*
* This walker is designed to work as the first pass in a two-pass processing step.
* It does a by-locus traversal operating only at sites that are not in dbSNP.
* We assume that all mismatches we see are therefore errors.
* This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinuc)
* Since there is a large amount of data one can then calculate an empirical probability of error
* given the particular covariates seen at this site, where p(error) = num mismatches / num observations
* The output file is a CSV list of (several covariate values, num observations, num mismatches, empirical quality score)
* The first lines of the output file give the name of the covariate classes that were used for this calculation.
*
* Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and must be at the start of the list.
* Note: This walker is designed to be used in conjunction with TableRecalibrationWalker.
*/
@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file
@WalkerName( "CountCovariates" )
//@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality // BUGBUG taken out to match old integration tests
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
@Argument(fullName="buggyMaxReadLen", doc="If we see a read longer than this, we assume there's a bug and abort", required=false)
public int buggyMaxReadLen = 100000;
@Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, doc="Depreciated output file root -- now use --params to directly specify the file output name")
public String OUTPUT_FILEROOT = null; // going to blow up if specified
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
private Boolean LIST_ONLY = false;
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false)
private String[] COVARIATES = null;
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
private boolean USE_ORIGINAL_QUALS = false;
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
private int WINDOW_SIZE = 3;
@Argument(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the outputted covariates table recalibration file")
private String RECAL_FILE = "output.recal_data.csv";
@Argument(fullName="no_print_header", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file. For debugging purposes only.")
private boolean NO_PRINT_HEADER = false;
@Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. For debugging purposes only.")
private boolean VALIDATE_OLD_RECALIBRATOR = false;
@Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.")
private boolean USE_SLX_PLATFORM = false;
@Argument(fullName="params", shortName="params", required=false, doc="Filename root for the outputted logistic regression training files")
public String params = "output.recal_data.csv";
@Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score")
public int MIN_MAPPING_QUALITY = 1;
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
private ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
private IdentityHashMap<SAMRecord, ReadHashDatum> readDatumHashMap; // A hash map that hashes the read object itself into properties commonly pulled out of the read. Done for optimization purposes.
private int sizeOfReadDatumHashMap = 0;
@Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = * for all platforms)")
public List<String> platforms = Collections.singletonList("*");
//public List<String> platforms = Collections.singletonList("ILLUMINA");
private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
@Argument(fullName="assumeFaultyHeader", required=false, doc="")
public boolean assumeFaultyHeader = false;
private final String versionNumber = "2.0.0"; // major version, minor version, and build number
//@Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="")
public boolean collapsePos = false;
//@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
public boolean collapseDinuc = false;
@Argument(fullName = "useOriginalQuals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
public boolean useOriginalQuals = false;
private CovariateCounter covariateCounter = null;
private long counted_sites = 0; // number of sites used to count covariates
private long counted_bases = 0; // number of bases used to count covariates
private long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
// THIS IS A HACK required in order to reproduce the behavior of old (and imperfect) RODIterator and
// hence to pass the integration test. The new iterator this code is now using does see ALL the SNPs,
// whether masked by overlapping indels/other events or not.
//TODO process correctly all the returned dbSNP rods at each location
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0l, 0l); // mismatch/base counts for dbSNP loci
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0l, 0l); // mismatch/base counts for non-dbSNP loci
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
private static final int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen)
private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation
//---------------------------------------------------------------------------------------------------------------
//
// initialize
//
//---------------------------------------------------------------------------------------------------------------
/**
* Initialize the system. Setup the data CovariateCountry for the read groups in our header
* Parse the -cov arguments and create a list of covariates to be used here
* Based on the covariates' estimates for initial capacity allocate the data hashmap
*/
public void initialize() {
Set<String> readGroups = new HashSet<String>();
for (SAMReadGroupRecord readGroup : this.getToolkit().getSAMFileHeader().getReadGroups()) {
if( readGroup.getAttribute("PL") == null )
Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are supported",readGroup.getReadGroupId()));
if( !isSupportedReadGroup(readGroup) )
continue;
readGroups.add(readGroup.getReadGroupId());
logger.info( "CovariateCounterWalker version: " + versionNumber );
// Get a list of all available covariates
final List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);
// Print and exit if that's what was requested
if ( LIST_ONLY ) {
out.println( "Available covariates:" );
for( Class<?> covClass : classes ) {
out.println( covClass.getSimpleName() );
}
out.println();
System.exit( 0 ); // early exit here because user requested it
}
// Warn the user if no dbSNP file was specified
if( this.getToolkit().getArguments().DBSNPFile == null ) {
boolean foundDBSNP = false;
for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
if( rod.getName().equalsIgnoreCase( "dbsnp" ) ) {
foundDBSNP = true;
}
}
if( !foundDBSNP ) {
Utils.warnUser("This calculation is critically dependent on being able to skip over known variant sites. Are you sure you want to be running without a dbSNP rod specified?");
}
covariateCounter = new CovariateCounter(readGroups, collapsePos, collapseDinuc, assumeFaultyHeader);
logger.info(String.format("Created recalibration data collectors for %d read group(s)", covariateCounter.getNReadGroups()));
}
// --------------------------------------------------------------------------------------------------------------
//
// map
//
// --------------------------------------------------------------------------------------------------------------
// Initialize the requested covariates by parsing the -cov argument
requestedCovariates = new ArrayList<Covariate>();
int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one
if( VALIDATE_OLD_RECALIBRATOR ) {
requestedCovariates.add( new ReadGroupCovariate() );
requestedCovariates.add( new CycleCovariate() ); // unfortunately order is different here in order to match the old recalibrator exactly
requestedCovariates.add( new QualityScoreCovariate() );
requestedCovariates.add( new DinucCovariate() );
} else if( COVARIATES != null ) {
if(COVARIATES[0].equalsIgnoreCase("ALL")) { // the user wants ALL covariates to be used
requestedCovariates.add( new ReadGroupCovariate() ); // first add the required covariates then add the rest by looping over all implementing classes that were found
requestedCovariates.add( new QualityScoreCovariate() );
for( Class<?> covClass : classes ) {
try {
Covariate covariate = (Covariate)covClass.newInstance();
estimatedCapacity *= covariate.estimatedNumberOfBins();
// Some covariates need parameters (user supplied command line arguments) passed to them
if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
if( !( covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate ) ) { // these were already added so don't add them again
requestedCovariates.add( covariate );
}
} catch ( InstantiationException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
} catch ( IllegalAccessException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
}
}
} else { // The user has specified a list of several covariates
int covNumber = 1;
for( String requestedCovariateString : COVARIATES ) {
boolean foundClass = false;
for( Class<?> covClass : classes ) {
if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class
foundClass = true;
// Read Group Covariate and Quality Score Covariate are required covariates for the recalibration calculation and must begin the list
if( (covNumber == 1 && !requestedCovariateString.equalsIgnoreCase( "ReadGroupCovariate" )) ||
(covNumber == 2 && !requestedCovariateString.equalsIgnoreCase( "QualityScoreCovariate" )) ) {
throw new StingException("ReadGroupCovariate and QualityScoreCovariate are required covariates for the recalibration calculation and must begin the list" );
}
covNumber++;
try {
// Now that we've found a matching class, try to instantiate it
Covariate covariate = (Covariate)covClass.newInstance();
estimatedCapacity *= covariate.estimatedNumberOfBins();
// Some covariates need parameters (user supplied command line arguments) passed to them
if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
requestedCovariates.add( covariate );
} catch ( InstantiationException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
} catch ( IllegalAccessException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
}
}
}
/**
* Walk over each read in the locus pileup and update the covariate counts based on these bases and their
* matching (or not) with ref. dbSNP aware, so avoids sites that are known as SNPs in DBSNP.
*
* @param tracker
* @param ref
* @param context
* @return
*/
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
// long testpos = 10410913 ;
// if ( ref.getLocus().getStart() == testpos ) {
// System.out.println(rods.size()+" rods:");
// for ( ReferenceOrderedDatum d : rods.getRecords() ) System.out.println((((rodDbSNP)d).isSNP()?"SNP":"non-SNP")+" at " + d.getLocation());
// System.exit(1);
// }
if ( dbsnp == null ) {
// We aren't at a dbSNP position that's a SNP, so update the read
// if ( ref.getLocus().getStart() == testpos) System.out.println("NOT A SNP INDEED");
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
for (int i =0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
if ( read.getReadLength() > buggyMaxReadLen ) {
throw new RuntimeException("Expectedly long read, please increase maxium read len with maxReadLen parameter: " + read.format());
}
SAMReadGroupRecord readGroup = read.getReadGroup();
if ( readGroup == null ) {
throw new RuntimeException("No read group annotation found for read " + read.format());
}
if ((read.getMappingQuality() >= MIN_MAPPING_QUALITY && isSupportedReadGroup(readGroup) )) {
int offset = offsets.get(i);
if ( offset > 0 && offset < (read.getReadLength() - 1) ) { // skip first and last bases because they suck and they don't have a dinuc count
counted_bases += covariateCounter.updateDataFromRead(readGroup.getReadGroupId(), read, offset, ref.getBase(), useOriginalQuals);
if( !foundClass ) {
throw new StingException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." );
}
}
}
counted_sites += 1;
updateMismatchCounts(novel_counts, context, ref.getBase());
} else {
// if ( ref.getLocus().getStart() == testpos) System.out.println("TREATED AS A SNP");
// out.println(ref.getLocus()+" SNP at "+dbsnp.getLocation() );
updateMismatchCounts(dbSNP_counts, context, ref.getBase());
skipped_sites += 1;
} else { // No covariates were specified by the user so add the default, required ones
Utils.warnUser( "Using default set of covariates because none were specified. Using ReadGroupCovariate and QualityScoreCovariate only." );
requestedCovariates.add( new ReadGroupCovariate() );
requestedCovariates.add( new QualityScoreCovariate() );
estimatedCapacity = 300 * 40;
}
if ( ++lociSinceLastDbsnpCheck == DBSNP_VALIDATION_CHECK_FREQUENCY ) {
lociSinceLastDbsnpCheck = 0;
validateDbsnpMismatchRate();
}
logger.info( "The covariates being used here: " );
logger.info( requestedCovariates );
return 1;
if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception
dataManager = new RecalDataManager( estimatedCapacity );
readDatumHashMap = new IdentityHashMap<SAMRecord, ReadHashDatum>();
}
/**
 * Tallies mismatch / total base counts over the pileup at one locus.
 *
 * @param counts  accumulator pair: first = mismatching bases, second = total regular bases
 * @param context pileup of reads and their offsets at this locus
 * @param ref     reference base at this locus
 */
private static void updateMismatchCounts(Pair<Long, Long> counts, AlignmentContext context, char ref) {
    final List<SAMRecord> pileupReads = context.getReads();
    final List<Integer> pileupOffsets = context.getOffsets();
    final int numReads = pileupReads.size();
    for( int readIndex = 0; readIndex < numReads; readIndex++ ) {
        final char observedBase = pileupReads.get(readIndex).getReadString().charAt(pileupOffsets.get(readIndex));
        final int observedIndex = BaseUtils.simpleBaseToBaseIndex(observedBase);
        final int referenceIndex = BaseUtils.simpleBaseToBaseIndex(ref);
        if( observedIndex == -1 || referenceIndex == -1 ) {
            continue; // skip ambiguous bases such as 'N' on either the read or the reference
        }
        if( observedIndex != referenceIndex ) {
            counts.first++;
        }
        counts.second++;
    }
}
/**
 * Sanity-checks that dbSNP loci mismatch the reference sufficiently more often than
 * novel loci; warns the user if the ratio looks suspicious.
 */
private void validateDbsnpMismatchRate() {
    final long novelTotal = novel_counts.second;
    final long dbsnpTotal = dbSNP_counts.second;
    // Nothing to compare until both classes of loci have accumulated bases.
    if( novelTotal == 0 || dbsnpTotal == 0 ) {
        return;
    }
    final double novelRate = (double) novel_counts.first / (double) novelTotal;
    final double dbsnpRate = (double) dbSNP_counts.first / (double) dbsnpTotal;
    if( dbsnpRate < DBSNP_VS_NOVEL_MISMATCH_RATE * novelRate ) {
        Utils.warnUser("The variation rate of dbSNP sites seems suspicious! Please double-check that the correct DBSNP ROD is being used.");
    }
}
/**
 * Check to see whether this read group should be processed. A read group is supported when
 * "*" was requested, when platform information is missing entirely (null read group or unset
 * PL attribute), or when its PL attribute matches one of the requested platforms.
 *
 * @param readGroup the read group record to test (may be null)
 * @return true if reads from this group should be calibrated
 */
private boolean isSupportedReadGroup( SAMReadGroupRecord readGroup ) {
    for( String requested : platforms ) {
        final String platform = requested.trim();
        if( platform.equals("*") ) {
            return true; // wildcard: calibrate everything
        }
        if( readGroup == null || readGroup.getAttribute("PL") == null ) {
            return true; // no platform information available — give it the benefit of the doubt
        }
        if( readGroup.getAttribute("PL").toString().equalsIgnoreCase(platform) ) {
            return true;
        }
    }
    return false;
}
// --------------------------------------------------------------------------------------------------------------
//---------------------------------------------------------------------------------------------------------------
//
// Reduce
// map
//
// --------------------------------------------------------------------------------------------------------------
//---------------------------------------------------------------------------------------------------------------
/**
* Provide an initial value for reduce computations.
* @return Initial value of reduce.
* For each read at this locus get the various covariate values and increment that location in the map based on
* whether or not the base matches the reference at this particular location
* @param tracker The reference metadata tracker
* @param ref The reference context
* @param context The alignment context
* @return Returns 1, but this value isn't used in the reduce step
*/
// Opens the recalibration parameter output stream that reduce() threads through the traversal.
// The removed OUTPUT_FILEROOT argument fails fast before any file is touched.
public PrintStream reduceInit() {
    if ( OUTPUT_FILEROOT != null ) {
        throw new RuntimeException("OUTPUT_FILEROOT argument has been removed, please use --params from now on to directly specify the output parameter filename");
    }
    try {
        return new PrintStream( params );
    } catch ( FileNotFoundException e ) {
        throw new RuntimeException("Couldn't open output file", e);
    }
}
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
// Called once after the traversal: dump the accumulated recalibration table and close the stream.
public void onTraversalDone( PrintStream recalTableStream ) {
    out.print("Writing raw recalibration data...");
    writeRecalTable( recalTableStream );
    out.println("...done");
    recalTableStream.close();
}
/**
 * Prints some basic information about the CountCovariates run as comment lines
 * at the head of the recalibration table.
 *
 * @param out stream receiving the header comment lines
 */
private void printInfo(PrintStream out) {
    final double sitesPerSkip = (double) counted_sites / skipped_sites;
    out.printf("# collapsed_pos %b%n", collapsePos);
    out.printf("# collapsed_dinuc %b%n", collapseDinuc);
    out.printf("# counted_sites %d%n", counted_sites);
    out.printf("# counted_bases %d%n", counted_bases);
    out.printf("# skipped_sites %d%n", skipped_sites);
    out.printf("# fraction_skipped 1 / %.0f bp%n", sitesPerSkip);
}
/**
* Writes out the key recalibration data collected from the reads. Dumps this recalibration data
* as a CVS string to the recalTableOut PrintStream. Emits the data for all read groups into this file.
*/
private void writeRecalTable(PrintStream recalTableStream) {
printInfo(recalTableStream);
recalTableStream.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp");
for (String readGroup : new TreeSet<String>(covariateCounter.getReadGroups()) ) {
for ( RecalData datum: RecalData.sort(covariateCounter.getRecalData(readGroup)) ) {
if ( datum.N > 0 ) {
recalTableStream.println(datum.toCSVString(collapsePos));
// pull out anything passed by -B name,type,file that has the name "dbsnp"
final RODRecordList<ReferenceOrderedDatum> dbsnpRODs = tracker.getTrackData( "dbsnp", null );
boolean isSNP = false;
if (dbsnpRODs != null) {
for( ReferenceOrderedDatum rod : dbsnpRODs ) {
if( ((Variation)rod).isSNP() ) {
isSNP = true; // at least one of the rods says this is a snp site
break;
}
}
}
// Only use data from non-dbsnp sites
// Assume every mismatch at a non-dbsnp site is indicitive of poor quality
if( !isSNP ) {
final List<SAMRecord> reads = context.getReads();
final List<Integer> offsets = context.getOffsets();
SAMRecord read;
int offset;
String readGroupId;
byte[] quals;
byte[] bases;
byte refBase;
byte prevBase;
String platform;
byte[] colorSpaceQuals;
ReadHashDatum readDatum;
boolean isNegStrand;
int mappingQuality;
int length;
final int numReads = reads.size();
// For each read at this locus
for( int iii = 0; iii < numReads; iii++ ) {
read = reads.get(iii);
offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct
readDatum = readDatumHashMap.get( read );
if( readDatum == null ) {
// If the HashMap of read objects has grown too large then throw out the (mostly stale) reads
if( sizeOfReadDatumHashMap > 100000 ) { //BUGBUG: Can I make this number larger?
readDatumHashMap.clear();
sizeOfReadDatumHashMap = 0;
}
// This read isn't in the hashMap yet so fill out the datum and add it to the map so that we never have to do the work again
quals = read.getBaseQualities();
// Check if we need to use the original quality scores instead
if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
if ( obj instanceof String )
quals = QualityUtils.fastqToPhred((String)obj);
else {
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
}
}
bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'. Is this true?
isNegStrand = read.getReadNegativeStrandFlag();
final SAMReadGroupRecord readGroup = read.getReadGroup();
readGroupId = readGroup.getReadGroupId();
platform = readGroup.getPlatform();
mappingQuality = read.getMappingQuality();
length = bases.length;
if( USE_SLX_PLATFORM ) {
platform = "ILLUMINA";
}
readDatum = new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length );
readDatumHashMap.put( read, readDatum );
sizeOfReadDatumHashMap++;
}
if( readDatum.mappingQuality > 0 ) { // BUGBUG: turn this into a read filter after passing the old integration tests
// skip first and last base because there is no dinuc
// BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests.
if( offset > 0 ) {
if( offset < readDatum.length - 1 ) {
// skip if base quality is zero
if( readDatum.quals[offset] > 0 ) {
refBase = (byte)ref.getBase();
prevBase = readDatum.bases[offset-1];
// Get the complement base strand if we are a negative strand read
if( readDatum.isNegStrand ) {
prevBase = readDatum.bases[offset+1];
}
// skip if this base or the previous one was an 'N' or etc.
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(readDatum.bases[offset]) ) ) {
// SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them
colorSpaceQuals = null;
if( readDatum.platform.equalsIgnoreCase("SOLID") ) {
colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
}
if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet
{
// This base finally passed all the checks, so add it to the big hashmap
updateDataFromRead( readDatum, offset, refBase );
}
} else {
if( VALIDATE_OLD_RECALIBRATOR ) {
countedBases++; // replicating a small bug in the old recalibrator
}
}
}
} else { // at the last base in the read so we can remove it from our IdentityHashMap
readDatumHashMap.remove( read );
sizeOfReadDatumHashMap--;
}
}
}
}
countedSites++;
} else { // We skipped over the dbSNP site
skippedSites++;
}
return 1; // This value isn't actually used anywhere
}
/**
* Doesn't do anything
*
* @param empty
* @param recalTableStream
* @return
* Major workhorse routine for this walker.
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
* Using the list of covariate values as a key, pick out the RecalDatum and increment,
* adding one to the number of observations and potentially one to the number of mismatches
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
* because pulling things out of the SAMRecord is an expensive operation.
* @param readDatum The ReadHashDatum holding all the important properties of this read
* @param offset The offset in the read for this locus
* @param refBase The reference base at this locus
*/
public PrintStream reduce(Integer empty, PrintStream recalTableStream) {
return recalTableStream;
private void updateDataFromRead(final ReadHashDatum readDatum, final int offset, final byte refBase) {
List<Comparable> key = new ArrayList<Comparable>();
// Loop through the list of requested covariates and pick out the value from the read, offset, and reference
for( Covariate covariate : requestedCovariates ) {
key.add( covariate.getValue( readDatum, offset ) );
}
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
RecalDatum datum = dataManager.data.get( key );
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
if( VALIDATE_OLD_RECALIBRATOR ) {
dataManager.data.myPut( key, datum );
} else {
dataManager.data.put( key, datum );
}
}
// Need the bases to determine whether or not we have a mismatch
byte base = readDatum.bases[offset];
// Add one to the number of observations and potentially one to the number of mismatches
datum.increment( (char)base, (char)refBase ); // dangerous: if you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad
countedBases++;
}
//---------------------------------------------------------------------------------------------------------------
//
// reduce
//
//---------------------------------------------------------------------------------------------------------------
/**
 * Initialize the reduce step by creating a PrintStream from the filename specified as an argument to the walker.
 * @return returns A PrintStream created from the -rf filename
 * @throws RuntimeException (wrapping FileNotFoundException) if the output file can't be opened
 */
public PrintStream reduceInit() {
    try {
        return new PrintStream( RECAL_FILE );
    } catch ( FileNotFoundException e ) {
        // include the offending filename so the user can see which path failed
        throw new RuntimeException( "Couldn't open output file: " + RECAL_FILE, e );
    }
}
/**
 * Reduce is a deliberate no-op for this walker; the running PrintStream is simply threaded through
 * the traversal so that onTraversalDone() can write the final table to it.
 * @param value Result of the map. This value is immediately ignored.
 * @param recalTableStream The PrintStream used to output the CSV data
 * @return returns The PrintStream used to output the CSV data
 */
public PrintStream reduce( Integer value, PrintStream recalTableStream ) {
    return recalTableStream; // carry the stream forward unchanged
}
/**
 * Invoked once the traversal has finished: dumps the accumulated recalibration data to disk in CSV form
 * and closes the output stream.
 * @param recalTableStream The PrintStream to write out to
 */
public void onTraversalDone( PrintStream recalTableStream ) {
    logger.info( "Writing raw recalibration data..." );
    outputToCSV( recalTableStream );
    logger.info( "...done!" );
    recalTableStream.close(); // flush and release the output file handle
}
/**
 * For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format.
 * Two formats are produced: a legacy layout (when VALIDATE_OLD_RECALIBRATOR is set) and the new layout
 * whose "@!" header lines name the Covariate classes for TableRecalibrationWalker to re-instantiate.
 * @param recalTableStream The PrintStream to write out to
 */
private void outputToCSV( final PrintStream recalTableStream ) {
    if( VALIDATE_OLD_RECALIBRATOR ) {
        // Legacy header: fixed lines replicating the old recalibrator's output exactly
        recalTableStream.printf("# collapsed_pos false%n");
        recalTableStream.printf("# collapsed_dinuc false%n");
        recalTableStream.printf("# counted_sites %d%n", countedSites);
        recalTableStream.printf("# counted_bases %d%n", countedBases);
        recalTableStream.printf("# skipped_sites %d%n", skippedSites);
        // NOTE(review): prints "Infinity" when skippedSites == 0 -- confirm downstream parsers tolerate this
        recalTableStream.printf("# fraction_skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
        recalTableStream.printf("rg,pos,Qrep,dn,nBases,nMismatches,Qemp%n");
        // For each entry in the data hashmap, in the sorted order the old recalibrator used
        for( Pair<List<? extends Comparable>,RecalDatum> entry : dataManager.data.entrySetSorted4() ) {
            // For each Covariate in the key
            for( Comparable comp : entry.first ) {
                // Output the Covariate's value
                recalTableStream.print( comp + "," );
            }
            // Output the RecalDatum entry
            recalTableStream.println( entry.second.outputToCSV() );
        }
    } else {
        if( !NO_PRINT_HEADER ) {
            recalTableStream.printf("# Counted Sites %d%n", countedSites);
            recalTableStream.printf("# Counted Bases %d%n", countedBases);
            recalTableStream.printf("# Skipped Sites %d%n", skippedSites);
            // NOTE(review): same potential division by zero as above when skippedSites == 0
            recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
            for( Covariate cov : requestedCovariates ) {
                // The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name
                recalTableStream.println( "@!" + cov.getClass().getSimpleName() );
            }
        }
        // For each entry in the data hashmap (unsorted iteration order)
        for( Map.Entry<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySet() ) {
            // For each Covariate in the key
            for( Comparable comp : entry.getKey() ) {
                // Output the Covariate's value
                if( NO_PRINT_HEADER && comp instanceof String ) { continue; } // BUGBUG: suppresses String-valued covariates (e.g. read group) when headers are off
                recalTableStream.print( comp + "," );
            }
            // Output the RecalDatum entry
            recalTableStream.println( entry.getValue().outputToCSV() );
        }
    }
}
}

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
package org.broadinstitute.sting.gatk.walkers.recalibration;
/**
* Created by IntelliJ IDEA.

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.StingException;

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.Utils;

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.StingException;

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
package org.broadinstitute.sting.gatk.walkers.recalibration;
/**
* Created by IntelliJ IDEA.

View File

@ -1,230 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import java.util.List;
import java.util.Collections;
/**
 * A single recalibration observation bucket: the context (read group, cycle position, reported
 * quality, dinucleotide) plus the empirical counts of observed bases (N) and mismatches (B).
 * Sorts by read group, then position, then reported quality, then dinuc.
 */
public class RecalData implements Comparable<RecalData> {
    //
    // context of this data -- read group, position in read, reported quality, and dinuc
    //
    String readGroup; // the read group where the datum was observed
    int pos;          // the position of this recalibration datum
    int qual;         // the reported quality score of this datum
    String dinuc;     // the dinucleotide context of this datum

    //
    // empirical quality score data
    //
    long N;           // total number of observed bases
    long B;           // number of observed mismatches to the reference base

    public RecalData(int pos, int qual, String readGroup, String dinuc ) {
        this.pos = pos;
        this.qual = qual;
        this.readGroup = readGroup;
        this.dinuc = dinuc;
    }

    /** Copy constructor: duplicates both the context fields and the observation counts. */
    public RecalData( RecalData copy ) {
        this.pos = copy.pos;
        this.qual = copy.qual;
        this.readGroup = copy.readGroup;
        this.dinuc = copy.dinuc;
        this.N = copy.N;
        this.B = copy.B;
    }

    /**
     * Orders by read group, then position, then reported quality, then dinuc.
     * Uses primitive comparisons instead of allocating Integer wrappers for each call.
     */
    public int compareTo(RecalData to) {
        int cmpReadGroup = readGroup.compareTo(to.readGroup);
        if (cmpReadGroup != 0) return cmpReadGroup;
        if (pos != to.pos) return pos < to.pos ? -1 : 1;
        if (qual != to.qual) return qual < to.qual ? -1 : 1;
        return dinuc.compareTo(to.dinuc);
    }

    /**
     * Returns the expected number of mismatching bases for this datum, using the number of bases N and
     * the reported qual score. Effectively qual implied error rate * number of bases.
     *
     * @return expected error count under the reported quality
     */
    public double getExpectedErrors() {
        return QualityUtils.qualToErrorProb((byte)qual) * N;
    }

    //
    // Increment routines
    //

    /** Add incN observations and incB mismatches to self; returns self for chaining. */
    public RecalData inc(long incN, long incB) {
        N += incN;
        B += incB;
        return this;
    }

    /**
     * Add N and B from other to self
     *
     * @param other the datum whose counts are folded into this one
     * @return this, for chaining
     */
    public RecalData inc(final RecalData other) {
        return inc(other.N, other.B);
    }

    /**
     * Add all of the Ns and Bs for data to self
     *
     * @param data the data whose counts are folded into this one
     * @return this, for chaining
     */
    public RecalData inc(List<RecalData> data) {
        for ( RecalData other : data ) {
            this.inc(other);
        }
        return this;
    }

    /**
     * Count an empirical observation of our current base against the reference base.
     * Increments N and, if curBase != ref, also increments B
     *
     * @param curBase the observed base
     * @param ref     the reference base
     * @return this, for chaining
     */
    public RecalData inc(char curBase, char ref) {
        // compare via base indices so case and ambiguity codes are handled uniformly
        return inc(1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1);
    }

    /**
     * Get the integer dinuc index associated with this datum. Can return -1 if dinuc
     * contains unexpected bases. Returns 0 if dinuc contains the * operator (not tracking dinucs)
     * @return dinuc index in [0, NDINUCS), or -1 for unexpected bases
     */
    public int getDinucIndex() {
        return dinucIndex(this.dinuc);
    }

    /**
     * Calculates the empirical quality score for this datum from B and N. Bounds the result
     * by QualityUtils.MAX_REASONABLE_Q_SCORE.
     *
     * @param useRawQempirical if false, applies add-one smoothing to both B and N
     * @return the (possibly smoothed) empirical quality, capped at MAX_REASONABLE_Q_SCORE
     */
    public double empiricalQualDouble(final boolean useRawQempirical) {
        double doubleB = useRawQempirical ? B : B + 1;
        double doubleN = useRawQempirical ? N : N + 1;
        double empiricalQual = -10 * Math.log10(doubleB / doubleN);
        if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE;
        return empiricalQual;
    }
    public double empiricalQualDouble() { return empiricalQualDouble(true); }

    /**
     * As as empiricalQualDouble, but rounded and encoded as a byte for placement in a read quality array of bytes
     * @param useRawQempirical if false, applies add-one smoothing to both B and N
     * @return the empirical quality as a phred-encoded byte
     */
    public byte empiricalQualByte(final boolean useRawQempirical) {
        double doubleB = useRawQempirical ? B : B + 1;
        double doubleN = useRawQempirical ? N : N + 1;
        return QualityUtils.probToQual(1.0 - doubleB / doubleN);
    }
    public byte empiricalQualByte() { return empiricalQualByte(true); }

    // NOTE(review): this column order (pos, rg, ...) does not match toCSVString()'s (rg, pos, ...)
    // -- confirm against the consumers of this header before relying on it
    public static String headerString() {
        return ("pos, rg, dinuc, qual, emp_qual, qual_diff, n, b");
    }

    /** Human-readable debugging representation; not the CSV serialization. */
    public String toString() {
        return String.format("[rg=%s, pos=%3d, Qrep=%3d, dinuc=%s, Qemp=%2.2f / %2.2f, B=%d, N=%d]", readGroup, pos, qual, dinuc, empiricalQualDouble(), empiricalQualDouble(false), B, N);
    }

    //
    // To and from CSV strings
    //

    /**
     * Serializes this datum in the order: rg,pos,Qrep,dn,nBases,nMismatches,Qemp
     * @param collapsedPos if true, the position column is emitted as "*"
     * @return the CSV line for this datum
     */
    public String toCSVString(boolean collapsedPos) {
        // rg,pos,Qrep,dn,nBases,nMismatches,Qemp
        return String.format("%s,%s,%d,%s,%d,%d,%d", readGroup, collapsedPos ? "*" : pos, qual, dinuc, N, B, empiricalQualByte());
    }

    /**
     * Parses the string s and returns the encoded RecalData in it. Assumes data has been emitted by
     * toCSVString() routine. Order is readGroup, pos, qual, dinuc, N, B, Qemp as byte
     * @param s the CSV line to parse
     * @return the decoded datum
     * @throws NumberFormatException on malformed numbers or out-of-range values
     */
    public static RecalData fromCSVString(String s) throws NumberFormatException {
        String[] vals = s.split(",");
        String rg = vals[0];
        int pos = vals[1].equals("*") ? 0 : Integer.parseInt(vals[1]); // "*" means collapsed position
        int qual = Integer.parseInt(vals[2]);
        String dinuc = vals[3];
        int N = Integer.parseInt(vals[4]);
        int B = Integer.parseInt(vals[5]);
        RecalData datum = new RecalData(pos, qual, rg, dinuc);
        datum.B = B;
        datum.N = N;

        // Checking for badness
        if ( pos < 0 ) throw new NumberFormatException("Illegal position detected: " + pos);
        if ( B < 0 ) throw new NumberFormatException("Illegal mismatch count detected: " + B);
        if ( N < 0 ) throw new NumberFormatException("Illegal base count detected: " + N);
        if ( qual < 0 || qual > QualityUtils.MAX_QUAL_SCORE ) throw new NumberFormatException("Illegal qual detected: " + qual);

        return datum;
    }

    /** Sorts l in place (by compareTo order) and returns it for convenience. */
    public static List<RecalData> sort(List<RecalData> l) {
        Collections.sort(l);
        return l;
    }

    /** Maps a (prevBase, base) pair to a dinuc index in [0, NDINUCS), or -1 for non-regular bases. */
    public static int bases2dinucIndex(char prevBase, char base) {
        int pbI = BaseUtils.simpleBaseToBaseIndex(prevBase);
        int bI = BaseUtils.simpleBaseToBaseIndex(base);
        return (pbI == -1 || bI == -1) ? -1 : pbI * 4 + bI;
    }

    public final static int NDINUCS = 16; // 4 bases x 4 bases

    /** Inverse of bases2dinucIndex: reconstructs the two-character dinuc string. */
    public static String dinucIndex2bases(int index) {
        char[] data = {BaseUtils.baseIndexToSimpleBase(index / 4), BaseUtils.baseIndexToSimpleBase(index % 4)};
        return new String( data );
    }

    /** Dinuc index for a two-character string; assumes s.length() >= 2. */
    public static int dinucIndex(String s) {
        return bases2dinucIndex(s.charAt(0), s.charAt(1));
    }

    public static int dinucIndex(byte prevBase, byte base) {
        return bases2dinucIndex((char)prevBase, (char)base);
    }

    /**
     * Phred-scaled aggregate reported quality over a list of data: total expected errors / total bases.
     * NOTE(review): yields NaN for an empty list or zero total bases -- confirm callers never pass one.
     * @param l the data to aggregate
     * @return the combined reported quality
     */
    public static double combinedQreported(List<RecalData> l) {
        double sumExpectedErrors = 0;
        long nBases = 0;
        for ( RecalData datum : l ) {
            sumExpectedErrors += datum.getExpectedErrors();
            nBases += datum.N;
        }
        return QualityUtils.phredScaleErrorRate(sumExpectedErrors / nBases);
    }
}

View File

@ -1,195 +1,204 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.ExpandingArrayList;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.*;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: mdepristo
* Date: Jun 16, 2009
* Time: 9:55:10 PM
* To change this template use File | Settings | File Templates.
* User: rpoplin
* Date: Nov 6, 2009
*
* This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions.
*/
public class RecalDataManager {
/** The original quality scores are stored in this attribute */
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ";
public NHashMap<RecalDatum> data; // the full dataset
private NHashMap<RecalDatum> dataCollapsedReadGroup; // table where everything except read group has been collapsed
private NHashMap<RecalDatum> dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed
private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed
public NHashMap<Double> dataSumExpectedErrors; // table used to calculate the overall aggregate quality score in which everything except read group is collapsed
ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
boolean trackPos, trackDinuc;
String readGroup;
int nDinucs, maxReadLen;
private NHashMap<Double> dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed
private NHashMap<Double> dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed
private ArrayList<NHashMap<Double>> dataCollapsedByCovariateDouble; // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
//RecalData[][][] data = null;
ExpandingArrayList<ExpandingArrayList<ExpandingArrayList<RecalData>>> data = null;
public RecalDataManager(String readGroup,
//int maxReadLen, int maxQual, int nDinucs,
boolean trackPos, boolean trackDinuc) {
data = new ExpandingArrayList<ExpandingArrayList<ExpandingArrayList<RecalData>>>();
//data = new RecalData[maxReadLen+1][maxQual+1][nDinucs];
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams
this.readGroup = readGroup;
this.trackPos = trackPos;
this.trackDinuc = trackDinuc;
//this.maxReadLen = maxReadLen;
//this.nDinucs = nDinucs;
RecalDataManager() {
data = new NHashMap<RecalDatum>();
}
public int getMaxReadLen() {
return data.size();
}
public int getPosIndex(int pos) {
return trackPos ? pos : 0;
}
public int canonicalPos(int cycle) {
return getPosIndex(cycle);
}
public int getDinucIndex(String dinuc) {
return trackDinuc ? RecalData.dinucIndex(dinuc) : 0;
}
public int getDinucIndex(byte prevBase, byte base) {
return trackDinuc ? RecalData.dinucIndex(prevBase, base) : 0;
}
public String canonicalDinuc(String dinuc) {
return trackDinuc ? dinuc : "**";
}
public void addDatum(RecalData datum) {
if ( ! datum.readGroup.equals(this.readGroup) ) {
throw new RuntimeException(String.format("BUG: adding incorrect read group datum %s to RecalDataManager for %s", datum.readGroup, this.readGroup));
}
RecalData prev = getRecalData(datum.pos, datum.qual, datum.dinuc);
if ( prev != null ) {
if ( trackDinuc && trackPos )
throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.dinuc), datum));
prev.inc(datum);
RecalDataManager( final int estimatedCapacity, final boolean createCollapsedTables, final int numCovariates ) {
if( createCollapsedTables ) { // initialize all the collapsed tables
dataCollapsedReadGroup = new NHashMap<RecalDatum>();
dataCollapsedQualityScore = new NHashMap<RecalDatum>();
dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>();
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
dataCollapsedByCovariate.add( new NHashMap<RecalDatum>() );
}
dataSumExpectedErrors = new NHashMap<Double>();
} else {
int posIndex = getPosIndex(datum.pos);
int internalDinucIndex = getDinucIndex(datum.dinuc);
if ( internalDinucIndex == -1 ) return;
set(posIndex, datum.qual, internalDinucIndex, datum);
flattenData.add(datum);
}
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f);
}
}
public RecalData getRecalData(int pos, int qual, String dinuc) {
return expandingGetRecalData(pos, qual, dinuc, false);
}
public RecalData expandingGetRecalData(int pos, int qual, String dinuc, boolean expandP) {
int posIndex = getPosIndex(pos);
int internalDinucIndex = getDinucIndex(dinuc);
if ( internalDinucIndex == -1 ) return null;
RecalData datum = get(posIndex, qual, internalDinucIndex);
if ( datum == null && expandP ) {
//System.out.printf("Allocating %s %d %d %d%n", readGroup, pos, qual, dinuc_index);
datum = new RecalData(posIndex, qual, readGroup, trackDinuc ? dinuc : "**");
set(posIndex, qual, internalDinucIndex, datum);
flattenData.add(datum);
}
return datum;
RecalDataManager( final int estimatedCapacity ) {
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f ); // second arg is the 'loading factor',
// a number to monkey around with when optimizing performace of the HashMap
}
/**
* Returns the RecalData associated with pos, qual, dinuc, or null if not is bound.
* Does not expand the system to corporate a new data point
* @param posIndex
* @param qual
* @param dinucIndex
* @return
* Add the given mapping to all of the collapsed hash tables
* @param key The list of comparables that is the key for this mapping
* @param fullDatum The RecalDatum which is the data for this mapping
*/
private RecalData get(int posIndex, int qual, int dinucIndex) {
ExpandingArrayList<ExpandingArrayList<RecalData>> d2 = data.get(posIndex);
if (d2 == null) return null;
ExpandingArrayList<RecalData> d3 = d2.get(qual);
return d3 == null ? null : d3.get(dinucIndex);
}
public final void addToAllTables( final List<? extends Comparable> key, final RecalDatum fullDatum ) {
private void set(int posIndex, int qual, int dinucIndex, RecalData datum) {
// grow data if necessary
ExpandingArrayList<ExpandingArrayList<RecalData>> d2 = data.get(posIndex);
if (d2 == null) {
d2 = new ExpandingArrayList<ExpandingArrayList<RecalData>>();
data.set(posIndex, d2);
// The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
//data.put(key, thisDatum); // add the mapping to the main table
// create dataCollapsedReadGroup, the table where everything except read group has been collapsed
ArrayList<Comparable> newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // make a new key with just the read group
RecalDatum collapsedDatum = dataCollapsedReadGroup.get( newKey );
if( collapsedDatum == null ) {
dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) );
} else {
collapsedDatum.increment(fullDatum);
}
// Expand d2 if necessary
ExpandingArrayList<RecalData> d3 = d2.get(qual);
if ( d3 == null ) {
d3 = new ExpandingArrayList<RecalData>();
d2.set(qual, d3);
// create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // make a new key with just the read group
Double sumExpectedErrors = dataSumExpectedErrors.get( newKey );
if( sumExpectedErrors == null ) {
dataSumExpectedErrors.put( newKey, 0.0 );
} else {
dataSumExpectedErrors.remove( newKey );
sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * fullDatum.getNumObservations();
dataSumExpectedErrors.put( newKey, sumExpectedErrors );
}
// set d3 to datum
d3.set(dinucIndex, datum);
}
newKey = new ArrayList<Comparable>();
// create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
newKey.add( key.get(0) ); // make a new key with the read group ...
newKey.add( key.get(1) ); // and quality score
collapsedDatum = dataCollapsedQualityScore.get( newKey );
if( collapsedDatum == null ) {
dataCollapsedQualityScore.put( newKey, new RecalDatum(fullDatum) );
} else {
collapsedDatum.increment(fullDatum);
}
private RecalData getMatchingDatum(List<RecalData> l, RecalData datum,
boolean combinePos, boolean combineQual, boolean combineDinuc) {
for ( RecalData find : l ) {
if ( (combineQual || find.qual == datum.qual) &&
(combinePos || find.pos == datum.pos ) &&
(combineDinuc || find.dinuc.equals(datum.dinuc)) ) {
//System.out.printf("Found %s for %s%n", find, datum);
return find;
// create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { // readGroup and QualityScore aren't counted
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // make a new key with the read group ...
newKey.add( key.get(1) ); // and quality score ...
newKey.add( key.get(iii + 2) ); // and the given covariate
collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey );
if( collapsedDatum == null ) {
dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) );
} else {
collapsedDatum.increment(fullDatum);
}
}
RecalData d = new RecalData(combinePos ? 0 : datum.pos, combineQual ? 0 : datum.qual, datum.readGroup, combineDinuc ? "**" : datum.dinuc );
//System.out.printf("Making new match %s%n", d);
l.add(d);
return d;
}
public List<RecalData> combineDinucs() {
return combine(false, false, true);
}
/**
* Loop over all the collapsed tables and turn the recalDatums found there into an empricial quality score
* that will be used in the sequential calculation in TableRecalibrationWalker
* @param numCovariates The number of covariates you have determines the number of tables to create
* @param smoothing The smoothing paramter that goes into empirical quality score calculation
*/
public final void generateEmpiricalQualities( final int numCovariates, final int smoothing ) {
public List<RecalData> combineCycles() {
return combine(true, false, false);
}
public List<RecalData> combine(boolean ignorePos, boolean ignoreQual, boolean ignoreDinuc) {
List<RecalData> l = new ArrayList<RecalData>();
for ( RecalData datum : flattenData ) {
RecalData reduced = getMatchingDatum(l, datum, ignorePos, ignoreQual, ignoreDinuc);
//System.out.printf("Combining %s with %s%n", datum, reduced);
reduced.inc(datum);
dataCollapsedReadGroupDouble = new NHashMap<Double>();
dataCollapsedQualityScoreDouble = new NHashMap<Double>();
dataCollapsedByCovariateDouble = new ArrayList<NHashMap<Double>>();
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
dataCollapsedByCovariateDouble.add( new NHashMap<Double>() );
}
return l;
}
public List<RecalData> getAll() {
return flattenData;
}
// we can process the OQ field if requested
public static byte[] getQualsForRecalibration( SAMRecord read, boolean useOriginalQuals ) {
byte[] quals = read.getBaseQualities();
if ( useOriginalQuals && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG);
if ( obj instanceof String )
quals = QualityUtils.fastqToPhred((String)obj);
else {
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
// Hash the empirical quality scores so we don't have to call Math.log at every base for every read
// Looping over the entrySet is really expensive but worth it
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) {
dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
}
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) {
dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
}
for( int iii = 0; iii < numCovariates - 2; iii++ ) {
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedByCovariate.get(iii).entrySet() ) {
dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
}
}
return quals;
dataCollapsedQualityScore.clear();
dataCollapsedByCovariate.clear();
dataCollapsedQualityScore = null; // will never need this again
dataCollapsedByCovariate = null; // will never need this again
if( data!=null ) {
data.clear();
data = null; // will never need this again
}
}
/**
* Get the appropriate collapsed table out of the set of all the tables held by this Object
* @param covariate Which covariate indexes the desired collapsed HashMap
* @return The desired collapsed HashMap
*/
public final NHashMap<RecalDatum> getCollapsedTable( final int covariate ) {
if( covariate == 0) {
return dataCollapsedReadGroup; // table where everything except read group has been collapsed
} else if( covariate == 1 ) {
return dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed
} else {
return dataCollapsedByCovariate.get( covariate - 2 ); // table where everything except read group, quality score, and given covariate has been collapsed
}
}
/**
* Get the appropriate collapsed table of emprical quality out of the set of all the tables held by this Object
* @param covariate Which covariate indexes the desired collapsed NHashMap<Double>
* @return The desired collapsed NHashMap<Double>
*/
public final NHashMap<Double> getCollapsedDoubleTable( final int covariate ) {
if( covariate == 0) {
return dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed
} else if( covariate == 1 ) {
return dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed
} else {
return dataCollapsedByCovariateDouble.get( covariate - 2 ); // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
}
}
}

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.QualityUtils;

View File

@ -1,3 +1,22 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.io.File;
import java.io.FileNotFoundException;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -23,311 +42,408 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.recalibration;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 3, 2009
*
* This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal.
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
* For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc)
* Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read.
* This walker then outputs a new bam file with these updated (recalibrated) reads.
*
* Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker.
* Note: This walker is designed to be used in conjunction with CovariateCounterWalker.
*/
@WalkerName("TableRecalibration")
@Requires({DataSource.READS}) // , DataSource.REFERENCE})
@Requires({ DataSource.READS, DataSource.REFERENCE }) // This walker requires -I input.bam, it also requires -R reference.fasta, but by not saying @requires REFERENCE_BASES I'm telling the
// GATK to not actually spend time giving me the refBase array since I don't use it
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
@Argument(fullName="params", shortName="params", doc="CountCovariates params file", required=true)
public String paramsFile;
@Argument(fullName="outputBam", shortName="outputBam", doc="output BAM file", required=true)
public SAMFileWriter outputBam = null;
@Argument(fullName="recal_file", shortName="recalFile", doc="Input recalibration table file generated by CountCovariates", required=true)
private String RECAL_FILE = null;
@Argument(fullName="output_bam", shortName="outputBam", doc="output BAM file", required=false)
private SAMFileWriter OUTPUT_BAM = null;
@Argument(fullName="preserve_qscores_less_than", shortName="pQ",
doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
private int PRESERVE_QSCORES_LESS_THAN = 5;
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
private boolean USE_ORIGINAL_QUALS = false;
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
private int WINDOW_SIZE = 3;
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
private int SMOOTHING = 1;
@Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. For debugging purposes only.")
private boolean VALIDATE_OLD_RECALIBRATOR = false;
@Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.")
private boolean USE_SLX_PLATFORM = false;
@Argument(shortName="rawQempirical", doc="If provided, we will use raw Qempirical scores calculated from the # mismatches and # bases, rather than the more conservative estimate of # mismatches + 1 / # bases + 1", required=false)
public boolean rawQempirical = false;
//public enum RecalibrationMode {
// COMBINATORIAL,
// SEQUENTIAL,
// ERROR
//}
@Argument(fullName="preserveQScoresLessThan", shortName="pQ", doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
public int preserveQScoresLessThan = 5;
//@Argument(fullName="recalibrationMode", shortName="mode", doc="Which calculation to use when recalibrating, default is SEQUENTIAL", required=false)
private String MODE_STRING = "SEQUENTIAL";
//public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes?
@Argument(fullName = "useOriginalQuals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
public boolean useOriginalQuals = false;
//
// Basic static information
//
private static Logger logger = Logger.getLogger(TableRecalibrationWalker.class);
private static String VERSION = "0.2.5";
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
HashMap<String, RecalMapping> cache = new HashMap<String, RecalMapping>();
public enum RecalibrationMode {
COMBINATORIAL,
JAFFE,
BY_POS_ONLY,
BY_DINUC_ONLY,
SEQUENTIAL,
ERROR
}
@Argument(shortName="mode", doc="", required=false)
public String modeString = RecalibrationMode.SEQUENTIAL.toString();
public RecalibrationMode mode = RecalibrationMode.ERROR;
private RecalDataManager dataManager;
private ArrayList<Covariate> requestedCovariates;
private ArrayList<Comparable> fullCovariateKey; // the list that will be used over and over again to hold the full set of covariate values
private ArrayList<Comparable> collapsedTableKey; // the key that will be used over and over again to query the collapsed tables
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static Pattern COLLAPSED_POS_PATTERN = Pattern.compile("^#\\s+collapsed_pos\\s+(\\w+)");
private static Pattern COLLAPSED_DINUC_PATTERN = Pattern.compile("^#\\s+collapsed_dinuc\\s+(\\w+)");
private static Pattern HEADER_PATTERN = Pattern.compile("^rg.*");
private static Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*");
private final String versionNumber = "2.0.0"; // major version, minor version, and build number
//---------------------------------------------------------------------------------------------------------------
//
// initialize
//
//---------------------------------------------------------------------------------------------------------------
/**
* Read in the recalibration table input file.
* Parse the list of covariate classes used during CovariateCounterWalker.
* Parse the CSV data and populate the hashmap.
*/
public void initialize() {
logger.info("TableRecalibrator version: " + VERSION);
//
// crappy hack until Enum arg types are supported
//
for ( RecalibrationMode potential : RecalibrationMode.values() ) {
if ( potential.toString().equals(modeString)) {
mode = potential;
break;
}
}
if ( mode == RecalibrationMode.ERROR )
throw new RuntimeException("Unknown mode requested: " + modeString);
logger.info( "TableRecalibrationWalker version: " + versionNumber );
// Get a list of all available covariates
List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);
//
// Walk over the data file, parsing it into recalibration data
//
int lineNumber = 0;
try {
logger.info(String.format("Reading data..."));
List<RecalData> data = new ArrayList<RecalData>();
boolean collapsedPos = false;
boolean collapsedDinuc = false;
boolean foundAllCovariates = false;
int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one
//List<String> lines = new xReadLines(new File(paramsFile)).readLines();
for ( String line : new xReadLines(new File(paramsFile)) ) {
// Warn the user if a dbSNP file was specified since it isn't being used here
boolean foundDBSNP = false;
for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
if( rod.getName().equalsIgnoreCase( "dbsnp" ) ) {
foundDBSNP = true;
}
}
if( foundDBSNP ) {
Utils.warnUser("A dbSNP rod file was specified but TableRecalibrationWalker doesn't make use of it.");
}
fullCovariateKey = new ArrayList<Comparable>();
collapsedTableKey = new ArrayList<Comparable>();
// Read in the covariates that were used from the input file
requestedCovariates = new ArrayList<Covariate>();
// Read in the data from the csv file and populate the map
logger.info( "Reading in the data from input file..." );
try {
for ( String line : new xReadLines(new File( RECAL_FILE )) ) {
lineNumber++;
if ( HEADER_PATTERN.matcher(line).matches() )
continue;
if ( COMMENT_PATTERN.matcher(line).matches() ) {
collapsedPos = parseCommentLine(COLLAPSED_POS_PATTERN, line, collapsedPos);
collapsedDinuc = parseCommentLine(COLLAPSED_DINUC_PATTERN, line, collapsedDinuc);
if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
; // skip over the comment lines, (which start with '#')
}
else {
data.add(RecalData.fromCSVString(line));
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // the line string is either specifying a covariate or is giving csv data
if( foundAllCovariates ) {
throw new StingException( "Malformed input recalibration file. Found covariate names intermingled with data. " + RECAL_FILE );
} else { // found another covariate in input file
boolean foundClass = false;
for( Class<?> covClass : classes ) {
if( line.equalsIgnoreCase( "@!" + covClass.getSimpleName() ) ) { // the "@!" was added by CovariateCounterWalker as a code to recognize covariate class names
foundClass = true;
try {
Covariate covariate = (Covariate)covClass.newInstance();
// some covariates need parameters (user supplied command line arguments) passed to them
if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
requestedCovariates.add( covariate );
estimatedCapacity *= covariate.estimatedNumberOfBins();
} catch ( InstantiationException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
} catch ( IllegalAccessException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
}
}
}
if( !foundClass ) {
throw new StingException( "Malformed input recalibration file. The requested covariate type (" + line + ") isn't a valid covariate option." );
}
}
} else { // found some data
if( !foundAllCovariates ) {
if( VALIDATE_OLD_RECALIBRATOR ) {
requestedCovariates.add( new ReadGroupCovariate() );
requestedCovariates.add( new QualityScoreCovariate() );
requestedCovariates.add( new CycleCovariate() );
requestedCovariates.add( new DinucCovariate() );
}
foundAllCovariates = true;
if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception
final boolean createCollapsedTables = true;
// Initialize the data hashMaps
dataManager = new RecalDataManager( estimatedCapacity, createCollapsedTables, requestedCovariates.size() );
}
addCSVData(line); // parse the line and add the data to the HashMap
}
}
initializeCache(data, collapsedPos, collapsedDinuc);
} catch ( FileNotFoundException e ) {
Utils.scareUser("Cannot read/find parameters file " + paramsFile);
Utils.scareUser("Can not find input file: " + RECAL_FILE);
} catch ( NumberFormatException e ) {
throw new RuntimeException("Error parsing recalibration data at line " + lineNumber, e);
throw new StingException("Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
}
}
logger.info( "...done!" );
private boolean parseCommentLine(Pattern pat, String line, boolean flag) {
Matcher m = pat.matcher(line);
if ( m.matches() ) {
flag = Boolean.parseBoolean(m.group(1));
}
logger.info( "The covariates being used here: " );
logger.info( requestedCovariates );
return flag;
}
private void initializeCache(List<RecalData> data, boolean collapsedPos, boolean collapsedDinuc ) {
Set<String> readGroups = new HashSet<String>();
Set<String> dinucs = new HashSet<String>();
int maxPos = -1;
int maxQReported = -1;
logger.info(String.format("No. params : %d", data.size()));
// get the maximum data from the file
for ( RecalData datum : data ) {
readGroups.add(datum.readGroup);
dinucs.add(datum.dinuc);
maxPos = Math.max(maxPos, datum.pos);
maxQReported = Math.max(maxQReported, datum.qual);
}
logger.info(String.format("Read groups : %d %s", readGroups.size(), readGroups.toString()));
logger.info(String.format("Dinucs : %d %s", dinucs.size(), dinucs.toString()));
logger.info(String.format("Max pos : %d", maxPos));
logger.info(String.format("Max Q reported : %d", maxQReported));
// check the mode
switch ( mode ) {
case JAFFE:
collapsedPos = collapsedDinuc = true;
break;
case BY_POS_ONLY:
if ( collapsedPos )
throw new RuntimeException(String.format("Cannot perform position_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
collapsedPos = true;
throw new RuntimeException("Unsupported mode requested, sorry");
//break;
case BY_DINUC_ONLY:
if ( collapsedDinuc )
throw new RuntimeException(String.format("Cannot perform dinuc_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
collapsedDinuc = true;
throw new RuntimeException("Unsupported mode requested, sorry");
//break;
case COMBINATORIAL:
if ( collapsedPos || collapsedDinuc )
throw new RuntimeException(String.format("Cannot perform combinatorial recalibration -- data is already collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
case SEQUENTIAL:
if ( collapsedPos || collapsedDinuc )
throw new RuntimeException(String.format("Cannot perform sequential recalibration -- data is already collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
}
//
// initialize the data structures
//
HashMap<String, RecalDataManager> managers = new HashMap<String, RecalDataManager>();
for ( String readGroup : readGroups ) {
// create a manager for each read group
RecalDataManager manager = new RecalDataManager(readGroup, ! collapsedPos, ! collapsedDinuc);
managers.put(readGroup, manager);
}
// add the data to their appropriate readGroup managers
for ( RecalData datum : data ) {
managers.get(datum.readGroup).addDatum(datum);
}
// fill in the table with mapping objects
for ( String readGroup : readGroups ) {
RecalDataManager manager = managers.get(readGroup);
RecalMapping mapper = initializeMapper(manager, rawQempirical, dinucs, maxPos, maxQReported);
logger.info(String.format("Creating mapper for %s of class %s", readGroup, mapper));
cache.put(readGroup, mapper);
// Create the collapsed tables that are used in the sequential calculation
if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
dataManager.generateEmpiricalQualities( requestedCovariates.size(), SMOOTHING );
logger.info( "...done!" );
}
}
/**
* Returns a new RecalMapping object appropriate for the requested mode in this.mode for the RecalData
* in manager. Uses the dinucs set dinucs, maxPos, and maxQReported to build efficient lookup structures
* for mapping from old qual and features -> new qual
*
* @param manager
* @param dinucs
* @param maxPos
* @param maxQReported
* @return
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
* @param line A line of CSV data read from the recalibration table data file
*/
private RecalMapping initializeMapper( RecalDataManager manager, final boolean rawQempirical,
Set<String> dinucs, int maxPos, int maxQReported ) {
switch ( mode ) {
case COMBINATORIAL:
case JAFFE:
return new CombinatorialRecalMapping(manager, rawQempirical, dinucs, maxPos, maxQReported);
case SEQUENTIAL:
return new SerialRecalMapping(manager, rawQempirical, dinucs, maxPos, maxQReported);
default:
throw new RuntimeException("Unimplemented recalibration mode: " + mode);
/**
 * Parse one line of CSV data from the recalibration table file and add it to the recalibration tables.
 * One column is consumed per requested covariate to build the lookup key; the remaining two columns
 * are the counts used to construct the datum.
 * @param line A line of CSV data read from the recalibration table data file
 */
private void addCSVData(String line) {
    String[] vals = line.split(",");
    // The full covariate key under which this datum is stored in the recalibration tables
    ArrayList<Comparable> key = new ArrayList<Comparable>();
    Covariate cov;
    int iii;
    for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
        cov = requestedCovariates.get( iii );
        if( VALIDATE_OLD_RECALIBRATOR ) {
            // The old recalibrator wrote columns 1 and 2 in the opposite order, so swap them when validating
            if( iii == 1 ) { // order is different in the old recalibrator
                key.add( cov.getValue( vals[2] ) );
            } else if ( iii == 2 ) {
                key.add( cov.getValue( vals[1] ) );
            } else {
                key.add( cov.getValue( vals[iii] ) );
            }
        } else {
            key.add( cov.getValue( vals[iii] ) );
        }
    }
    // After the loop, iii points at the first count column; the two trailing columns are the
    // observation/mismatch counts -- presumably (num observations, num mismatches); confirm against RecalDatum
    RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ) );
    dataManager.addToAllTables( key, datum );
}
public SAMRecord map(char[] ref, SAMRecord read) {
//---------------------------------------------------------------------------------------------------------------
//
// map
//
//---------------------------------------------------------------------------------------------------------------
/**
* For each base in the read calculate a new recalibrated quality score and replace the quality scores in the read
* @param refBases References bases over the length of the read
* @param read The read to be recalibrated
* @return The read with quality scores replaced
*/
public SAMRecord map( char[] refBases, SAMRecord read ) {
// WARNING: refBases is always null because this walker doesn't have @Requires({DataSource.REFERENCE_BASES})
// This is done in order to speed up the code
byte[] originalQuals = read.getBaseQualities();
// Check if we need to use the original quality scores instead
if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
if ( obj instanceof String )
originalQuals = QualityUtils.fastqToPhred((String)obj);
else {
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
}
}
byte[] recalQuals = originalQuals.clone();
// These calls are all expensive so only do them once for each read
final SAMReadGroupRecord readGroup = read.getReadGroup();
final String readGroupId = readGroup.getReadGroupId();
String platform = readGroup.getPlatform();
final boolean isNegStrand = read.getReadNegativeStrandFlag();
if( USE_SLX_PLATFORM ) {
platform = "ILLUMINA";
}
byte[] bases = read.getReadBases();
int startPos = 1;
int stopPos = bases.length;
byte[] quals = RecalDataManager.getQualsForRecalibration(read, useOriginalQuals);
// Since we want machine direction reads not corrected positive strand reads, rev comp any negative strand reads
if (read.getReadNegativeStrandFlag()) {
bases = BaseUtils.simpleReverseComplement(bases);
quals = BaseUtils.reverse(quals);
if( isNegStrand ) { // DinucCovariate is responsible for getting the complement base if needed
startPos = 0;
stopPos = bases.length - 1;
}
try {
byte[] recalQuals = recalibrateBasesAndQuals(read.getAttribute("RG").toString(), bases, quals);
ReadHashDatum readDatum = new ReadHashDatum( readGroupId, platform, originalQuals, bases, isNegStrand, read.getMappingQuality(), bases.length );
// Don't change Q scores below some threshold
preserveQScores(quals, recalQuals, read);
// For each base in the read
for( int iii = startPos; iii < stopPos; iii++ ) { // skip first or last base because there is no dinuc depending on the direction of the read
if (read.getReadNegativeStrandFlag()) { // reverse the quals for the neg strand read
recalQuals = BaseUtils.reverse(recalQuals);
quals = BaseUtils.reverse(quals);
}
read.setBaseQualities(recalQuals);
if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) {
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(quals));
// Get the covariate values which make up the key
for( Covariate covariate : requestedCovariates ) {
fullCovariateKey.add( covariate.getValue( readDatum, iii ) ); // offset is zero based so passing iii is correct here
}
return read;
} catch ( StingException e ) {
throw new RuntimeException(String.format("Bug found while processing read %s: %s", read.format(), e.getMessage()));
recalQuals[iii] = performSequentialQualityCalculation( fullCovariateKey );
fullCovariateKey.clear();
}
}
private void preserveQScores( byte[] originalQuals, byte[] recalQuals, SAMRecord read ) {
for ( int i = 0; i < recalQuals.length; i++ ) {
if ( originalQuals[i] < preserveQScoresLessThan ) {
//System.out.printf("Preserving Q%d base at %d in read %s%n", originalQuals[i], i, read.getReadName());
recalQuals[i] = originalQuals[i];
}
preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low
// SOLID bams insert the reference base into the read if the color space quality is zero, so don't change their base quality scores
if( platform.equalsIgnoreCase("SOLID") ) {
byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
if(colorSpaceQuals != null) { preserveBadColorSpaceQualities_SOLID( originalQuals, recalQuals, colorSpaceQuals ); }
}
read.setBaseQualities(recalQuals); // overwrite old qualities with new recalibrated qualities
if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // save the old qualities if the tag isn't already taken in the read
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals));
}
return read;
}
/**
* Workhorse routine. Given a read group and an array of bases and quals, returns a new set of recalibrated
* qualities for each base/qual in bases and quals. Uses the RecalMapping object associated with readGroup.
* Implements a serial recalibration of the reads using the combinational table.
* First, we perform a positional recalibration, and then a subsequent dinuc correction.
*
* @param readGroup
* @param bases
* @param quals
* @return
* Given the full recalibration table, we perform the following preprocessing steps:
*
* - calculate the global quality score shift across all data [DeltaQ]
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
* - The final shift equation is:
*
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
* @param key The list of Comparables that were calculated from the covariates
* @return A recalibrated quality score as a byte
*/
public byte[] recalibrateBasesAndQuals(final String readGroup, byte[] bases, byte[] quals) throws StingException {
if ( readGroup == null ) { throw new StingException("BUG: read group is null"); }
if ( bases == null ) { throw new StingException("BUG: bases array is null"); }
if ( bases.length == 0) { throw new StingException("BUG: bases array is size 0"); }
if ( quals == null ) { throw new StingException("BUG: quals array is null"); }
private byte performSequentialQualityCalculation( List<? extends Comparable> key ) {
byte[] recalQuals = new byte[quals.length];
RecalMapping mapper = cache.get(readGroup);
if ( mapper == null ) {
// throw new StingException(String.format("BUG: couldn't find RecalMapping for readgroup %s", readGroup));
// Edited by ebanks on 8/9/09
// Actually, there is a valid case when the mapper == null: when all reads in a RG are
// unmapped or have mapping quality == 0. In this case, we don't want to die - nor do we
// want to lose these reads - so we just don't alter the quals
for ( int cycle = 0; cycle < bases.length; cycle++ )
recalQuals[cycle] = quals[cycle];
return recalQuals;
String readGroupKeyElement = key.get(0).toString();
int qualityScoreKeyElement = Integer.parseInt(key.get(1).toString());
byte qualFromRead = (byte)qualityScoreKeyElement;
// The global quality shift (over the read group only)
collapsedTableKey.add( readGroupKeyElement );
RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get(collapsedTableKey);
Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get(collapsedTableKey);
double globalDeltaQ = 0.0;
double aggregrateQreported = 0.0;
if( globalDeltaQDatum != null ) {
aggregrateQreported = QualityUtils.phredScaleErrorRate( dataManager.dataSumExpectedErrors.get(collapsedTableKey) / ((double) globalDeltaQDatum.getNumObservations()) );
globalDeltaQ = globalDeltaQEmpirical - aggregrateQreported;
}
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
for ( int cycle = 1; cycle < bases.length; cycle++ ) { // skip first and last base, qual already set because no dinuc
byte qual = quals[cycle];
byte newQual = mapper.getNewQual(readGroup, bases[cycle - 1], bases[cycle], cycle, qual);
if ( newQual <= 0 || newQual > QualityUtils.MAX_REASONABLE_Q_SCORE )
throw new StingException(String.format("Bug found -- assigning bad quality score %d x %d => %d", cycle, qual, newQual));
recalQuals[cycle] = newQual;
//System.out.printf("Mapping %d => %d%n", qual, newQual);
// The shift in quality between reported and empirical
collapsedTableKey.add( qualityScoreKeyElement );
Double deltaQReportedEmpirical = dataManager.getCollapsedDoubleTable(1).get(collapsedTableKey);
double deltaQReported = 0.0;
if( deltaQReportedEmpirical != null ) {
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
}
// The shift in quality due to each covariate by itself in turn
double deltaQCovariates = 0.0;
Double deltaQCovariateEmpirical;
double deltaQPos = 0.0;
double deltaQDinuc = 0.0;
for( int iii = 2; iii < key.size(); iii++ ) {
collapsedTableKey.add( key.get(iii) ); // the given covariate
deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get(collapsedTableKey);
if( deltaQCovariateEmpirical != null ) {
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
if( VALIDATE_OLD_RECALIBRATOR ) {
if(iii==2) { deltaQPos = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); } // BUGBUG: only here to validate against the old recalibrator
if(iii==3) { deltaQDinuc = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); }
}
}
collapsedTableKey.remove( 2 ); // this new covariate is always added in at position 2 in the collapsedTableKey list
}
return recalQuals;
}
double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
byte newQualityByte = QualityUtils.boundQual( (int)Math.round(newQuality), QualityUtils.MAX_REASONABLE_Q_SCORE );
public SAMFileWriter reduceInit() {
return outputBam;
// Verbose printouts used to validate with old recalibrator
//if(key.contains(null)) {
// System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d",
// qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte));
//}
//else {
// System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d",
// key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) );
//}
collapsedTableKey.clear();
return newQualityByte;
}
/**
* Write out the read
* Loop of the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
* @param originalQuals The list of original base quality scores
* @param recalQuals A list of the new recalibrated quality scores
*/
public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
/**
 * Undo the recalibration for any base whose original quality fell below the preservation
 * threshold, restoring its original score in place.
 * @param originalQuals The list of original base quality scores
 * @param recalQuals A list of the new recalibrated quality scores, overwritten in place where preserved
 */
private void preserveQScores( final byte[] originalQuals, byte[] recalQuals ) {
    final int numQuals = recalQuals.length;
    for( int index = 0; index < numQuals; index++ ) {
        final byte originalScore = originalQuals[index];
        if( originalScore < PRESERVE_QSCORES_LESS_THAN ) {
            recalQuals[index] = originalScore; // score too low to safely recalibrate; keep the original
        }
    }
}
/**
 * Undo the recalibration for any base whose color space quality is non-positive, restoring its
 * original score in place. SOLID bams insert the reference base into the read when the color
 * space quality is zero, so those base qualities should not be changed.
 * @param originalQuals The list of original base quality scores
 * @param recalQuals A list of the new recalibrated quality scores, overwritten in place where preserved
 * @param colorSpaceQuals The list of color space quality scores for this read
 */
private void preserveBadColorSpaceQualities_SOLID( final byte[] originalQuals, byte[] recalQuals, final byte[] colorSpaceQuals ) {
    int index = 0;
    while( index < recalQuals.length ) {
        // NOTE(review): the <= 0 test was flagged BUGBUG in the original as not exactly correct yet
        if( colorSpaceQuals[index] <= 0 ) {
            recalQuals[index] = originalQuals[index];
        }
        index++;
    }
}
//---------------------------------------------------------------------------------------------------------------
//
// reduce
//
//---------------------------------------------------------------------------------------------------------------
/**
 * Start the reduce with a handle to the output bam file.
 * @return The SAMFileWriter pointing to the output bam file (may be null if none was provided)
 */
public SAMFileWriter reduceInit() {
    return OUTPUT_BAM;
}
/**
* Output each read to disk
* @param read The read to output
* @param output The FileWriter to write the read to
* @return The FileWriter
*/
public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) {
if ( output != null ) {
output.addAlignment(read);
} else {
@ -337,191 +453,3 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
return output;
}
}
/**
 * Maps a base's reported quality score, together with its context (read group, preceding base,
 * current base, and machine cycle), to a recalibrated quality score.
 */
interface RecalMapping {
    public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual);
}
/**
 * RecalMapping backed by a full combinatorial lookup table: one [position][reported qual] table of
 * recalibrated quality bytes per dinucleotide, populated directly from the RecalData entries held
 * by the manager.
 */
class CombinatorialRecalMapping implements RecalMapping {
    // One byte[pos][qual] table per dinuc, indexed by the manager's dinuc index
    ArrayList<byte[][]> cache;
    RecalDataManager manager;

    /**
     * Build the per-dinuc lookup tables from all data held by the manager.
     * @param manager source of the RecalData entries and of the pos/dinuc index mappings
     * @param useRawQempirical whether to use the raw empirical quality rather than the smoothed estimate
     * @param dinucs the set of dinucleotides observed in the data (one table is allocated per entry)
     * @param maxPos largest position observed in the data (tables sized maxPos+1)
     * @param maxQReported largest reported quality observed in the data (tables sized maxQReported+1)
     */
    public CombinatorialRecalMapping(RecalDataManager manager, final boolean useRawQempirical,
                                     Set<String> dinucs, int maxPos, int maxQReported ) {
        this.manager = manager;

        // initialize the data structure
        cache = new ArrayList<byte[][]>(RecalData.NDINUCS);
        for ( String dinuc : dinucs ) {
            cache.add(new byte[maxPos+1][maxQReported+1]);
        }

        // bind each datum's empirical quality byte into its dinuc table at [pos][qual]
        for ( RecalData datum : manager.getAll() ) {
            //System.out.printf("Adding datum %s%n", datum);
            byte [][] table = cache.get(this.manager.getDinucIndex(datum.dinuc));
            int pos = manager.canonicalPos(datum.pos);
            // a non-zero cell means two data entries mapped to the same key, which should be impossible
            if ( table[pos][datum.qual] != 0 )
                throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
            table[pos][datum.qual] = datum.empiricalQualByte(useRawQempirical);
            //System.out.printf("Binding %d %d => %d%n", pos, datum.qual, datum.empiricalQualByte(useRawQempirical));
        }
    }

    /**
     * Look up the recalibrated quality for a base.
     * Returns the reported qual unchanged when no table exists for the dinuc (only tolerated for
     * N-containing dinucs) or when the cycle falls outside the table.
     */
    public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
        int pos = manager.canonicalPos(cycle);
        int index = this.manager.getDinucIndex(prevBase, base);
        byte[][] dataTable = index == -1 ? null : cache.get(index);
        // a missing table is only expected for dinucs containing an ambiguous base
        if ( dataTable == null && prevBase != 'N' && base != 'N' )
            throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, (char)prevBase, (char)base));
        return dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;
    }
}
/**
* Implements a serial recalibration of the reads using the combinational table. First,
* we perform a positional recalibration, and then a subsequent dinuc correction.
*
* Given the full recalibration table, we perform the following preprocessing steps:
*
* - calculate the global quality score shift across all data [DeltaQ]
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
* - The final shift equation is:
*
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc)
*/
/**
 * Serial recalibration mapping: applies the global quality shift, then the
 * per-reported-quality, per-cycle, and per-dinuc residual shifts in sequence:
 *
 *   Qrecal = Qreported + DeltaQ + DeltaQ(qual) + DeltaQ(pos) + DeltaQ(dinuc)
 *
 * All shift tables are precomputed in the constructor from the full recalibration data.
 */
class SerialRecalMapping implements RecalMapping {
    private double globalDeltaQ = 0.0;                         // global quality-score shift across all data [DeltaQ]
    private double[][] deltaQPosMap, deltaQDinucMap;           // residual shifts indexed by (pos, qual) and (dinucIndex, qual)
    double [] deltaQualMap;                                    // residual shift indexed by reported quality
    RecalData [][] qPosSupports = null, qDinucSupports = null; // optional debug tables recording the datum behind each cell (left null unless enabled)
    CombinatorialRecalMapping combiMap;
    RecalDataManager manager;
    // Debugging filters used by printStateP to limit diagnostic printing.
    String dinucToLookAt = null; // "CC";
    int posToLookAt = -1;
    int qualToLookAt = 25;

    /**
     * Precompute the global, per-quality, per-position, and per-dinuc shift tables.
     *
     * @param manager          source of the recalibration data
     * @param useRawQempirical whether to use the raw (unsmoothed) empirical quality
     * @param dinucs           the set of dinucleotide contexts present in the data
     * @param maxPos           largest machine cycle in the data
     * @param maxQReported     largest reported quality score in the data
     */
    public SerialRecalMapping(RecalDataManager manager, final boolean useRawQempirical,
                              Set<String> dinucs, int maxPos, int maxQReported ) {
        //combiMap = new CombinatorialRecalMapping(manager, dinucs, maxPos, maxQReported );
        this.manager = manager;

        // Global shift: empirical quality of all data pooled together minus the aggregate reported quality.
        {
            RecalData datum = new RecalData(0, 0, manager.readGroup, "**").inc(manager.getAll());
            double aggregrateQreported = RecalData.combinedQreported(manager.getAll());
            globalDeltaQ = datum.empiricalQualDouble(useRawQempirical) - aggregrateQreported;
            //System.out.printf("Global quality score shift is %.2f - %.2f = %.2f%n", datum.empiricalQualDouble(useRawQempirical), aggregrateQreported, globalDeltaQ);
        }

        // Optional diagnostic dump of the raw data, gated by the debug filters above.
        for ( RecalData datum : manager.getAll() ) {
            if ( printStateP(datum) )
                System.out.printf("PrintValue: %s%n", datum);
        }

        // Jaffe-style per-reported-quality shift, relative to the global shift.
        deltaQualMap = new double[maxQReported+1];
        for ( RecalData datum : RecalData.sort(manager.combine(true, false, true)) ) {
            deltaQualMap[datum.qual] = datum.empiricalQualDouble(useRawQempirical) - datum.qual - globalDeltaQ;
            //System.out.printf("%s => %s%n", datum, deltaQualMap[datum.qual]);
        }

        // Per-(cycle, qual) residual shift, with the global and per-qual shifts already removed.
        deltaQPosMap = new double[maxPos+1][maxQReported+1];
        //qPosSupports = new RecalData[maxPos+1][maxQReported+1];
        for ( RecalData datumAtPosQual : manager.combineDinucs() ) {
            double offset = globalDeltaQ + deltaQualMap[datumAtPosQual.qual];
            updateCache(qPosSupports, datumAtPosQual, useRawQempirical, deltaQPosMap, datumAtPosQual.pos, datumAtPosQual.qual, offset);
        }

        // Per-(dinuc, qual) residual shift, likewise relative to the global and per-qual shifts.
        deltaQDinucMap = new double[dinucs.size()+1][maxQReported+1];
        //qDinucSupports = new RecalData[dinucs.size()+1][maxQReported+1];
        for ( RecalData datumAtDinucQual : manager.combineCycles() ) {
            double offset = globalDeltaQ + deltaQualMap[datumAtDinucQual.qual];
            updateCache(qDinucSupports, datumAtDinucQual, useRawQempirical, deltaQDinucMap, datumAtDinucQual.getDinucIndex(), datumAtDinucQual.qual, offset);
        }

        validateTables(maxPos, maxQReported, dinucs.size(), deltaQPosMap, deltaQDinucMap, useRawQempirical, manager.getAll());
    }

    /**
     * Diagnostic pass over the computed tables: prints cells selected by the debug
     * filters and re-derives each datum's recalibrated quality for comparison.
     * Has no effect on the tables themselves.
     */
    private void validateTables(int maxPos, int maxQReported, int nDinucs,
                                double[][] deltaQPosMap, double[][] deltaQDinucMap,
                                final boolean useRawQempirical, List<RecalData> data ) {
        for ( int i = 0; i < maxPos; i++ ) {
            for ( int j = 0; j < maxQReported; j++ ) {
                if ( printStateP(i, null, j) )
                    System.out.printf("Mapping: pos=%d qual=%2d delta=%.2f based on %s%n",
                            i, j, deltaQPosMap[i][j], qPosSupports != null ? qPosSupports[i][j] : null);
            }
        }
        for ( int i = 0; i < nDinucs; i++ ) {
            for ( int j = 0; j < maxQReported; j++ ) {
                String dinuc = RecalData.dinucIndex2bases(i);
                if ( printStateP(0, dinuc, j ) )
                    System.out.printf("Mapping: dinuc=%s qual=%2d delta=%.2f based on %s%n",
                            dinuc, j, deltaQDinucMap[i][j], qDinucSupports != null ? qDinucSupports[i][j] : null);
            }
        }
        for ( RecalData datum : RecalData.sort(data) ) {
            byte newQual = getNewQual(datum.readGroup, (byte)datum.dinuc.charAt(0), (byte)datum.dinuc.charAt(1), datum.pos, (byte)datum.qual);
            if ( printStateP( datum ) ) {
                System.out.printf("Serial mapping %s => %d => %d vs. %d => delta = %d%n",
                        datum, newQual, datum.empiricalQualByte(useRawQempirical), datum.empiricalQualByte(), newQual - datum.empiricalQualByte(useRawQempirical));
            }
        }
    }

    /**
     * Store datum's residual shift (empirical - reported - meanQ) into table[i][j],
     * refusing duplicate writes, and record the supporting datum when requested.
     */
    private void updateCache( RecalData[][] supports, RecalData datum, final boolean useRawQempirical,
                              double[][] table, int i, int j, double meanQ ) {
        if ( table[i][j] != 0 )
            throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
        double deltaQ = datum.empiricalQualDouble(useRawQempirical) - datum.qual - meanQ;
        table[i][j] = deltaQ;
        if ( supports != null )
            supports[i][j] = datum;
    }

    // Debug predicate: true when the (cycle, dinuc, qual) triple matches the
    // dinucToLookAt / posToLookAt / qualToLookAt filters configured above.
    private boolean printStateP( int cycle, String dinuc, int qual ) {
        return posToLookAt != -1 &&
                ( cycle == 0 || posToLookAt == 0 || cycle == posToLookAt ) &&
                ( dinucToLookAt == null || dinuc == null || manager.getDinucIndex(dinuc) == -1 || dinucToLookAt.equals(dinuc)) &&
                ( qualToLookAt == 0 || qual == qualToLookAt );
    }

    private boolean printStateP( RecalData datum ) {
        return printStateP(datum.pos, datum.dinuc, datum.qual);
    }

    /**
     * Compute the recalibrated quality for one base by summing the precomputed shifts.
     * A reported quality outside the table (never observed during counting, e.g. seen
     * only at dbSNP sites) is returned unchanged.
     */
    public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
        int pos = manager.canonicalPos(cycle);
        int index = this.manager.getDinucIndex(prevBase, base);

        byte newQualByte = qual;
        if ( qual < deltaQualMap.length ) {
            // it's possible to see a qual not in the recal table -- i.e., we only see Q28 at a dbSNP site
            double deltaQual = deltaQualMap[qual];
            double deltaQPos = pos >= deltaQPosMap.length ? 0.0 : deltaQPosMap[pos][qual]; // == length indicates no data for last base
            double deltaQDinuc = index == -1 ? 0.0 : deltaQDinucMap[index][qual];          // -1 indicates N in prev or current base
            double newQual = qual + globalDeltaQ + deltaQual + deltaQPos + deltaQDinuc;
            newQualByte = QualityUtils.boundQual((int)Math.round(newQual), QualityUtils.MAX_REASONABLE_Q_SCORE);
            //System.out.println(String.format("%s %c%c %d %d => %d + %.2f + %.2f + %.2f + %.2f = %d",
            //        readGroup, prevBase, base, cycle, qual, qual, globalDeltaQ, deltaQual, deltaQPos, deltaQDinuc, newQualByte));

            // Sanity check that the bounded quality is inside (0, MAX_REASONABLE_Q_SCORE].
            // Bug fix: the original condition used '&&' (with '>='), which no value can
            // satisfy, so the check was dead code; the message also omitted deltaQual.
            if ( newQualByte <= 0 || newQualByte > QualityUtils.MAX_REASONABLE_Q_SCORE )
                throw new RuntimeException(String.format("Illegal base quality score calculated: %s %c%c %d %d => %d + %.2f + %.2f + %.2f + %.2f = %d",
                        readGroup, prevBase, base, cycle, qual, qual, globalDeltaQ, deltaQual, deltaQPos, deltaQDinuc, newQualByte));
        }
        return newQualByte;
    }
}

View File

@ -1,484 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RODRecordList;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.io.PrintStream;
import java.io.FileNotFoundException;
import java.util.*;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMReadGroupRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 3, 2009
*
* This walker is designed to work as the first pass in a two-pass processing step.
* It does a by-locus traversal operating only at sites that are not in dbSNP.
* We assume that all mismatches we see are therefore errors.
* This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinuc)
* Since there is a large amount of data one can then calculate an empirical probability of error
* given the particular covariates seen at this site, where p(error) = num mismatches / num observations
* The output file is a CSV list of (several covariate values, num observations, num mismatches, empirical quality score)
* The first lines of the output file give the name of the covariate classes that were used for this calculation.
*
* Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and must be at the start of the list.
* Note: This walker is designed to be used in conjunction with TableRecalibrationWalker.
*/
@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file
@WalkerName( "CountCovariatesRefactored" )
//@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality // BUGBUG taken out to match old integration tests
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {

    // ----- command-line arguments -----
    @Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
    private Boolean LIST_ONLY = false;
    @Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false)
    private String[] COVARIATES = null;
    @Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
    private boolean USE_ORIGINAL_QUALS = false;
    @Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
    private int WINDOW_SIZE = 3;
    @Argument(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the outputted covariates table recalibration file")
    private String RECAL_FILE = "output.recal_data.csv";
    @Argument(fullName="no_print_header", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file. For debugging purposes only.")
    private boolean NO_PRINT_HEADER = false;
    @Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. For debugging purposes only.")
    private boolean VALIDATE_OLD_RECALIBRATOR = false;
    @Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.")
    private boolean USE_SLX_PLATFORM = false;

    // ----- internal state -----
    private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
    private ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
    private IdentityHashMap<SAMRecord, ReadHashDatum> readDatumHashMap; // A hash map that hashes the read object itself into properties commonly pulled out of the read. Done for optimization purposes.
    private int sizeOfReadDatumHashMap = 0; // Manually tracked entry count used to decide when to flush readDatumHashMap
    private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
    private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
    private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
    //private final String versionNumber = "2.0.0"; // major version, minor version, and build number

    //---------------------------------------------------------------------------------------------------------------
    //
    // initialize
    //
    //---------------------------------------------------------------------------------------------------------------

    /**
     * Parse the -cov arguments and create a list of covariates to be used here.
     * Based on the covariates' estimates for initial capacity allocate the data hashmap.
     * ReadGroupCovariate and QualityScoreCovariate are mandatory and must lead the list;
     * the data HashMap key order depends on this covariate ordering.
     */
    public void initialize() {
        //logger.info( "CovariateCounterWalker version: " + versionNumber );

        // Get a list of all available covariates
        final List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);

        // Print and exit if that's what was requested
        if ( LIST_ONLY ) {
            out.println( "Available covariates:" );
            for( Class<?> covClass : classes ) {
                out.println( covClass.getSimpleName() );
            }
            out.println();
            System.exit( 0 ); // early exit here because user requested it
        }

        // Warn the user if no dbSNP file was specified
        boolean foundDBSNP = false;
        for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
            if( rod.getName().equalsIgnoreCase( "dbsnp" ) ) {
                foundDBSNP = true;
            }
        }
        if( !foundDBSNP ) {
            Utils.warnUser("This calculation is critically dependent on being able to skip over known variant sites. Are you sure you want to be running without a dbSNP rod specified?");
        }

        // Initialize the requested covariates by parsing the -cov argument
        requestedCovariates = new ArrayList<Covariate>();
        int estimatedCapacity = 1; // capacity is multiplicative so this starts at one
        if( VALIDATE_OLD_RECALIBRATOR ) {
            // Hard-coded covariate set whose ordering mirrors the old recalibrator's output
            requestedCovariates.add( new ReadGroupCovariate() );
            requestedCovariates.add( new CycleCovariate() ); // unfortunately order is different here in order to match the old recalibrator exactly
            requestedCovariates.add( new QualityScoreCovariate() );
            requestedCovariates.add( new DinucCovariate() );
        } else if( COVARIATES != null ) {
            if(COVARIATES[0].equalsIgnoreCase("ALL")) { // the user wants ALL covariates to be used
                requestedCovariates.add( new ReadGroupCovariate() ); // first add the required covariates then add the rest by looping over all implementing classes that were found
                requestedCovariates.add( new QualityScoreCovariate() );
                for( Class<?> covClass : classes ) {
                    try {
                        Covariate covariate = (Covariate)covClass.newInstance();
                        estimatedCapacity *= covariate.estimatedNumberOfBins();
                        // Some covariates need parameters (user supplied command line arguments) passed to them
                        if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
                        if( !( covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate ) ) { // these were already added so don't add them again
                            requestedCovariates.add( covariate );
                        }
                    } catch ( InstantiationException e ) {
                        throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
                    } catch ( IllegalAccessException e ) {
                        throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
                    }
                }
            } else { // The user has specified a list of several covariates
                int covNumber = 1;
                for( String requestedCovariateString : COVARIATES ) {
                    boolean foundClass = false;
                    for( Class<?> covClass : classes ) {
                        if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class
                            foundClass = true;
                            // Read Group Covariate and Quality Score Covariate are required covariates for the recalibration calculation and must begin the list
                            if( (covNumber == 1 && !requestedCovariateString.equalsIgnoreCase( "ReadGroupCovariate" )) ||
                                (covNumber == 2 && !requestedCovariateString.equalsIgnoreCase( "QualityScoreCovariate" )) ) {
                                throw new StingException("ReadGroupCovariate and QualityScoreCovariate are required covariates for the recalibration calculation and must begin the list" );
                            }
                            covNumber++;
                            try {
                                // Now that we've found a matching class, try to instantiate it
                                Covariate covariate = (Covariate)covClass.newInstance();
                                estimatedCapacity *= covariate.estimatedNumberOfBins();
                                // Some covariates need parameters (user supplied command line arguments) passed to them
                                if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
                                requestedCovariates.add( covariate );
                            } catch ( InstantiationException e ) {
                                throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
                            } catch ( IllegalAccessException e ) {
                                throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
                            }
                        }
                    }
                    if( !foundClass ) {
                        throw new StingException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." );
                    }
                }
            }
        } else { // No covariates were specified by the user so add the default, required ones
            Utils.warnUser( "Using default set of covariates because none were specified. Using ReadGroupCovariate and QualityScoreCovariate only." );
            requestedCovariates.add( new ReadGroupCovariate() );
            requestedCovariates.add( new QualityScoreCovariate() );
            estimatedCapacity = 300 * 40;
        }

        logger.info( "The covariates being used here: " );
        logger.info( requestedCovariates );

        if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception
        dataManager = new RecalDataManager( estimatedCapacity );
        readDatumHashMap = new IdentityHashMap<SAMRecord, ReadHashDatum>();
    }

    //---------------------------------------------------------------------------------------------------------------
    //
    // map
    //
    //---------------------------------------------------------------------------------------------------------------

    /**
     * For each read at this locus get the various covariate values and increment that location in the map based on
     * whether or not the base matches the reference at this particular location
     * @param tracker The reference metadata tracker
     * @param ref The reference context
     * @param context The alignment context
     * @return Returns 1, but this value isn't used in the reduce step
     */
    public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
        // pull out anything passed by -B name,type,file that has the name "dbsnp"
        final RODRecordList<ReferenceOrderedDatum> dbsnpRODs = tracker.getTrackData( "dbsnp", null );
        boolean isSNP = false;
        if (dbsnpRODs != null) {
            for( ReferenceOrderedDatum rod : dbsnpRODs ) {
                if( ((Variation)rod).isSNP() ) {
                    isSNP = true; // at least one of the rods says this is a snp site
                    break;
                }
            }
        }

        // Only use data from non-dbsnp sites
        // Assume every mismatch at a non-dbsnp site is indicative of poor quality
        if( !isSNP ) {
            final List<SAMRecord> reads = context.getReads();
            final List<Integer> offsets = context.getOffsets();
            // Locals hoisted out of the loop; values are filled per-read below.
            SAMRecord read;
            int offset;
            String readGroupId;
            byte[] quals;
            byte[] bases;
            byte refBase;
            byte prevBase;
            String platform;
            byte[] colorSpaceQuals;
            ReadHashDatum readDatum;
            boolean isNegStrand;
            int mappingQuality;
            int length;
            final int numReads = reads.size();

            // For each read at this locus
            for( int iii = 0; iii < numReads; iii++ ) {
                read = reads.get(iii);
                offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct
                readDatum = readDatumHashMap.get( read );
                if( readDatum == null ) {
                    // If the HashMap of read objects has grown too large then throw out the (mostly stale) reads
                    if( sizeOfReadDatumHashMap > 100000 ) { //BUGBUG: Can I make this number larger?
                        readDatumHashMap.clear();
                        sizeOfReadDatumHashMap = 0;
                    }

                    // This read isn't in the hashMap yet so fill out the datum and add it to the map so that we never have to do the work again
                    quals = read.getBaseQualities();
                    // Check if we need to use the original quality scores instead
                    if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
                        Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
                        if ( obj instanceof String )
                            quals = QualityUtils.fastqToPhred((String)obj);
                        else {
                            throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
                        }
                    }
                    bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'. Is this true?
                    isNegStrand = read.getReadNegativeStrandFlag();
                    final SAMReadGroupRecord readGroup = read.getReadGroup();
                    readGroupId = readGroup.getReadGroupId();
                    platform = readGroup.getPlatform();
                    mappingQuality = read.getMappingQuality();
                    length = bases.length;
                    if( USE_SLX_PLATFORM ) {
                        platform = "ILLUMINA";
                    }
                    readDatum = new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length );
                    readDatumHashMap.put( read, readDatum );
                    sizeOfReadDatumHashMap++;
                }

                if( readDatum.mappingQuality > 0 ) { // BUGBUG: turn this into a read filter after passing the old integration tests
                    // skip first and last base because there is no dinuc
                    // BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests.
                    if( offset > 0 ) {
                        if( offset < readDatum.length - 1 ) {
                            // skip if base quality is zero
                            if( readDatum.quals[offset] > 0 ) {
                                refBase = (byte)ref.getBase();
                                prevBase = readDatum.bases[offset-1];
                                // Get the complement base strand if we are a negative strand read
                                // NOTE(review): this takes the following base, not its complement --
                                // presumably a downstream covariate complements it; confirm.
                                if( readDatum.isNegStrand ) {
                                    prevBase = readDatum.bases[offset+1];
                                }

                                // skip if this base or the previous one was an 'N' or etc.
                                if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(readDatum.bases[offset]) ) ) {
                                    // SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them
                                    colorSpaceQuals = null;
                                    if( readDatum.platform.equalsIgnoreCase("SOLID") ) {
                                        colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
                                    }
                                    if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet
                                    {
                                        // This base finally passed all the checks, so add it to the big hashmap
                                        updateDataFromRead( readDatum, offset, refBase );
                                    }
                                } else {
                                    if( VALIDATE_OLD_RECALIBRATOR ) {
                                        countedBases++; // replicating a small bug in the old recalibrator
                                    }
                                }
                            }
                        } else { // at the last base in the read so we can remove it from our IdentityHashMap
                            readDatumHashMap.remove( read );
                            sizeOfReadDatumHashMap--;
                        }
                    }
                }
            }
            countedSites++;
        } else { // We skipped over the dbSNP site
            skippedSites++;
        }

        return 1; // This value isn't actually used anywhere
    }

    /**
     * Major workhorse routine for this walker.
     * Loop through the list of requested covariates and pick out the value from the read, offset, and reference
     * Using the list of covariate values as a key, pick out the RecalDatum and increment,
     * adding one to the number of observations and potentially one to the number of mismatches
     * Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
     * because pulling things out of the SAMRecord is an expensive operation.
     * @param readDatum The ReadHashDatum holding all the important properties of this read
     * @param offset The offset in the read for this locus
     * @param refBase The reference base at this locus
     */
    private void updateDataFromRead(final ReadHashDatum readDatum, final int offset, final byte refBase) {
        List<Comparable> key = new ArrayList<Comparable>();
        // Loop through the list of requested covariates and pick out the value from the read, offset, and reference
        for( Covariate covariate : requestedCovariates ) {
            key.add( covariate.getValue( readDatum, offset ) );
        }

        // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
        RecalDatum datum = dataManager.data.get( key );
        if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
            datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
            if( VALIDATE_OLD_RECALIBRATOR ) {
                dataManager.data.myPut( key, datum );
            } else {
                dataManager.data.put( key, datum );
            }
        }

        // Need the bases to determine whether or not we have a mismatch
        byte base = readDatum.bases[offset];

        // Add one to the number of observations and potentially one to the number of mismatches
        datum.increment( (char)base, (char)refBase ); // dangerous: if you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad
        countedBases++;
    }

    //---------------------------------------------------------------------------------------------------------------
    //
    // reduce
    //
    //---------------------------------------------------------------------------------------------------------------

    /**
     * Initialize the reduce step by creating a PrintStream from the filename specified as an argument to the walker.
     * @return returns A PrintStream created from the -recalFile filename
     */
    public PrintStream reduceInit() {
        try {
            return new PrintStream( RECAL_FILE );
        } catch ( FileNotFoundException e ) {
            // NOTE(review): message does not include RECAL_FILE; only the cause carries detail
            throw new RuntimeException( "Couldn't open output file: ", e );
        }
    }

    /**
     * The Reduce method doesn't do anything for this walker.
     * @param value Result of the map. This value is immediately ignored.
     * @param recalTableStream The PrintStream used to output the CSV data
     * @return returns The PrintStream used to output the CSV data
     */
    public PrintStream reduce( Integer value, PrintStream recalTableStream ) {
        return recalTableStream; // nothing to do here
    }

    /**
     * Write out the full data hashmap to disk in CSV format
     * @param recalTableStream The PrintStream to write out to
     */
    public void onTraversalDone( PrintStream recalTableStream ) {
        logger.info( "Writing raw recalibration data..." );
        outputToCSV( recalTableStream );
        logger.info( "...done!" );
        recalTableStream.close();
    }

    /**
     * For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format.
     * In VALIDATE_OLD_RECALIBRATOR mode the header and row ordering replicate the old recalibrator's output exactly.
     * @param recalTableStream The PrintStream to write out to
     */
    private void outputToCSV( final PrintStream recalTableStream ) {
        if( VALIDATE_OLD_RECALIBRATOR ) {
            recalTableStream.printf("# collapsed_pos    false%n");
            recalTableStream.printf("# collapsed_dinuc  false%n");
            recalTableStream.printf("# counted_sites    %d%n", countedSites);
            recalTableStream.printf("# counted_bases    %d%n", countedBases);
            recalTableStream.printf("# skipped_sites    %d%n", skippedSites);
            // NOTE(review): prints Infinity when skippedSites is zero
            recalTableStream.printf("# fraction_skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
            recalTableStream.printf("rg,pos,Qrep,dn,nBases,nMismatches,Qemp%n");
            // For each entry in the data hashmap
            for( Pair<List<? extends Comparable>,RecalDatum> entry : dataManager.data.entrySetSorted4() ) {
                // For each Covariate in the key
                for( Comparable comp : entry.first ) {
                    // Output the Covariate's value
                    recalTableStream.print( comp + "," );
                }
                // Output the RecalDatum entry
                recalTableStream.println( entry.second.outputToCSV() );
            }
        } else {
            if( !NO_PRINT_HEADER ) {
                recalTableStream.printf("# Counted Sites    %d%n", countedSites);
                recalTableStream.printf("# Counted Bases    %d%n", countedBases);
                recalTableStream.printf("# Skipped Sites    %d%n", skippedSites);
                recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
                for( Covariate cov : requestedCovariates ) {
                    // The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name
                    recalTableStream.println( "@!" + cov.getClass().getSimpleName() );
                }
            }
            // For each entry in the data hashmap
            for( Map.Entry<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySet() ) {
                // For each Covariate in the key
                for( Comparable comp : entry.getKey() ) {
                    // Output the Covariate's value
                    if( NO_PRINT_HEADER && comp instanceof String ) { continue; } // BUGBUG
                    recalTableStream.print( comp + "," );
                }
                // Output the RecalDatum entry
                recalTableStream.println( entry.getValue().outputToCSV() );
            }
        }
    }
}

View File

@ -1,204 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.*;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 6, 2009
*
* This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions.
*/
public class RecalDataManager {
public NHashMap<RecalDatum> data; // the full dataset
private NHashMap<RecalDatum> dataCollapsedReadGroup; // table where everything except read group has been collapsed
private NHashMap<RecalDatum> dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed
private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed
public NHashMap<Double> dataSumExpectedErrors; // table used to calculate the overall aggregate quality score in which everything except read group is collapsed
private NHashMap<Double> dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed
private NHashMap<Double> dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed
private ArrayList<NHashMap<Double>> dataCollapsedByCovariateDouble; // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams
RecalDataManager() {
data = new NHashMap<RecalDatum>();
}
RecalDataManager( final int estimatedCapacity, final boolean createCollapsedTables, final int numCovariates ) {
if( createCollapsedTables ) { // initialize all the collapsed tables
dataCollapsedReadGroup = new NHashMap<RecalDatum>();
dataCollapsedQualityScore = new NHashMap<RecalDatum>();
dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>();
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
dataCollapsedByCovariate.add( new NHashMap<RecalDatum>() );
}
dataSumExpectedErrors = new NHashMap<Double>();
} else {
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f);
}
}
    /**
     * Constructor that allocates only the full data table, pre-sized to the given capacity.
     *
     * @param estimatedCapacity initial capacity hint for the backing HashMap
     */
    RecalDataManager( final int estimatedCapacity ) {
        data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f ); // second arg is the 'loading factor',
                                                                    // a number to monkey around with when optimizing performace of the HashMap
    }
/**
* Add the given mapping to all of the collapsed hash tables
* @param key The list of comparables that is the key for this mapping
* @param fullDatum The RecalDatum which is the data for this mapping
*/
public final void addToAllTables( final List<? extends Comparable> key, final RecalDatum fullDatum ) {
// The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
//data.put(key, thisDatum); // add the mapping to the main table
// create dataCollapsedReadGroup, the table where everything except read group has been collapsed
ArrayList<Comparable> newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // make a new key with just the read group
RecalDatum collapsedDatum = dataCollapsedReadGroup.get( newKey );
if( collapsedDatum == null ) {
dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) );
} else {
collapsedDatum.increment(fullDatum);
}
// create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // make a new key with just the read group
Double sumExpectedErrors = dataSumExpectedErrors.get( newKey );
if( sumExpectedErrors == null ) {
dataSumExpectedErrors.put( newKey, 0.0 );
} else {
dataSumExpectedErrors.remove( newKey );
sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * fullDatum.getNumObservations();
dataSumExpectedErrors.put( newKey, sumExpectedErrors );
}
newKey = new ArrayList<Comparable>();
// create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
newKey.add( key.get(0) ); // make a new key with the read group ...
newKey.add( key.get(1) ); // and quality score
collapsedDatum = dataCollapsedQualityScore.get( newKey );
if( collapsedDatum == null ) {
dataCollapsedQualityScore.put( newKey, new RecalDatum(fullDatum) );
} else {
collapsedDatum.increment(fullDatum);
}
// create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { // readGroup and QualityScore aren't counted
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // make a new key with the read group ...
newKey.add( key.get(1) ); // and quality score ...
newKey.add( key.get(iii + 2) ); // and the given covariate
collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey );
if( collapsedDatum == null ) {
dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) );
} else {
collapsedDatum.increment(fullDatum);
}
}
}
/**
* Loop over all the collapsed tables and turn the recalDatums found there into an empricial quality score
* that will be used in the sequential calculation in TableRecalibrationWalker
* @param numCovariates The number of covariates you have determines the number of tables to create
* @param smoothing The smoothing paramter that goes into empirical quality score calculation
*/
public final void generateEmpiricalQualities( final int numCovariates, final int smoothing ) {
dataCollapsedReadGroupDouble = new NHashMap<Double>();
dataCollapsedQualityScoreDouble = new NHashMap<Double>();
dataCollapsedByCovariateDouble = new ArrayList<NHashMap<Double>>();
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
dataCollapsedByCovariateDouble.add( new NHashMap<Double>() );
}
// Hash the empirical quality scores so we don't have to call Math.log at every base for every read
// Looping over the entrySet is really expensive but worth it
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) {
dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
}
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) {
dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
}
for( int iii = 0; iii < numCovariates - 2; iii++ ) {
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedByCovariate.get(iii).entrySet() ) {
dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ));
}
}
dataCollapsedQualityScore.clear();
dataCollapsedByCovariate.clear();
dataCollapsedQualityScore = null; // will never need this again
dataCollapsedByCovariate = null; // will never need this again
if( data!=null ) {
data.clear();
data = null; // will never need this again
}
}
/**
* Get the appropriate collapsed table out of the set of all the tables held by this Object
* @param covariate Which covariate indexes the desired collapsed HashMap
* @return The desired collapsed HashMap
*/
public final NHashMap<RecalDatum> getCollapsedTable( final int covariate ) {
if( covariate == 0) {
return dataCollapsedReadGroup; // table where everything except read group has been collapsed
} else if( covariate == 1 ) {
return dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed
} else {
return dataCollapsedByCovariate.get( covariate - 2 ); // table where everything except read group, quality score, and given covariate has been collapsed
}
}
/**
* Get the appropriate collapsed table of emprical quality out of the set of all the tables held by this Object
* @param covariate Which covariate indexes the desired collapsed NHashMap<Double>
* @return The desired collapsed NHashMap<Double>
*/
public final NHashMap<Double> getCollapsedDoubleTable( final int covariate ) {
if( covariate == 0) {
return dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed
} else if( covariate == 1 ) {
return dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed
} else {
return dataCollapsedByCovariateDouble.get( covariate - 2 ); // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
}
}
}

View File

@ -1,454 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.io.File;
import java.io.FileNotFoundException;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 3, 2009
*
* This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal.
* For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc)
* Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read.
* This walker then outputs a new bam file with these updated (recalibrated) reads.
*
* Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker.
* Note: This walker is designed to be used in conjunction with CovariateCounterWalker.
*/
@WalkerName("TableRecalibrationRefactored")
@Requires({ DataSource.READS, DataSource.REFERENCE }) // This walker requires -I input.bam, it also requires -R reference.fasta, but by not saying @requires REFERENCE_BASES I'm telling the
                                                      // GATK to not actually spend time giving me the refBase array since I don't use it
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {

    // Command line arguments controlling input table, output BAM, and recalibration behavior
    @Argument(fullName="recal_file", shortName="recalFile", doc="Input recalibration table file generated by CountCovariates", required=true)
    private String RECAL_FILE = null;
    @Argument(fullName="output_bam", shortName="outputBam", doc="output BAM file", required=false)
    private SAMFileWriter OUTPUT_BAM = null;
    @Argument(fullName="preserve_qscores_less_than", shortName="pQ",
            doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
    private int PRESERVE_QSCORES_LESS_THAN = 5;
    @Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
    private boolean USE_ORIGINAL_QUALS = false;
    @Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
    private int WINDOW_SIZE = 3;
    @Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
    private int SMOOTHING = 1;
    @Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. For debugging purposes only.")
    private boolean VALIDATE_OLD_RECALIBRATOR = false;
    @Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.")
    private boolean USE_SLX_PLATFORM = false;

    //public enum RecalibrationMode {
    //    COMBINATORIAL,
    //    SEQUENTIAL,
    //    ERROR
    //}
    //@Argument(fullName="recalibrationMode", shortName="mode", doc="Which calculation to use when recalibrating, default is SEQUENTIAL", required=false)
    private String MODE_STRING = "SEQUENTIAL"; // only SEQUENTIAL is exercised below
    //public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes?

    private RecalDataManager dataManager;                // holds the collapsed recalibration tables parsed from RECAL_FILE
    private ArrayList<Covariate> requestedCovariates;    // covariate classes read back from the table header lines
    private ArrayList<Comparable> fullCovariateKey;      // the list that will be used over and over again to hold the full set of covariate values
    private ArrayList<Comparable> collapsedTableKey;     // the key that will be used over and over again to query the collapsed tables
    // Patterns recognizing the three kinds of lines in the recalibration table file
    private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
    private static Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
    private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*");
    //private final String versionNumber = "2.0.0"; // major version, minor version, and build number

    //---------------------------------------------------------------------------------------------------------------
    //
    // initialize
    //
    //---------------------------------------------------------------------------------------------------------------

    /**
     * Read in the recalibration table input file.
     * Parse the list of covariate classes used during CovariateCounterWalker.
     * Parse the CSV data and populate the hashmap.
     */
    public void initialize() {

        //logger.info( "TableRecalibrationWalker version: " + versionNumber );
        // Get a list of all available covariates
        List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);
        int lineNumber = 0;
        boolean foundAllCovariates = false;
        int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one

        // Warn the user if a dbSNP file was specified since it isn't being used here
        boolean foundDBSNP = false;
        for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
            if( rod.getName().equalsIgnoreCase( "dbsnp" ) ) {
                foundDBSNP = true;
            }
        }
        if( foundDBSNP ) {
            Utils.warnUser("A dbSNP rod file was specified but TableRecalibrationWalker doesn't make use of it.");
        }

        // Reusable key lists, cleared and refilled per base / per lookup
        fullCovariateKey = new ArrayList<Comparable>();
        collapsedTableKey = new ArrayList<Comparable>();

        // Read in the covariates that were used from the input file
        requestedCovariates = new ArrayList<Covariate>();

        // Read in the data from the csv file and populate the map
        logger.info( "Reading in the data from input file..." );

        try {
            for ( String line : new xReadLines(new File( RECAL_FILE )) ) {
                lineNumber++;
                if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
                    ; // skip over the comment lines, (which start with '#')
                }
                else if( COVARIATE_PATTERN.matcher(line).matches() ) { // the line string is either specifying a covariate or is giving csv data
                    if( foundAllCovariates ) {
                        throw new StingException( "Malformed input recalibration file. Found covariate names intermingled with data. " + RECAL_FILE );
                    } else { // found another covariate in input file
                        boolean foundClass = false;
                        for( Class<?> covClass : classes ) {
                            if( line.equalsIgnoreCase( "@!" + covClass.getSimpleName() ) ) { // the "@!" was added by CovariateCounterWalker as a code to recognize covariate class names
                                foundClass = true;
                                try {
                                    Covariate covariate = (Covariate)covClass.newInstance();
                                    // some covariates need parameters (user supplied command line arguments) passed to them
                                    if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
                                    requestedCovariates.add( covariate );
                                    estimatedCapacity *= covariate.estimatedNumberOfBins();
                                } catch ( InstantiationException e ) {
                                    throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
                                } catch ( IllegalAccessException e ) {
                                    throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) );
                                }
                            }
                        }
                        if( !foundClass ) {
                            throw new StingException( "Malformed input recalibration file. The requested covariate type (" + line + ") isn't a valid covariate option." );
                        }
                    }

                } else { // found some data
                    if( !foundAllCovariates ) {
                        if( VALIDATE_OLD_RECALIBRATOR ) {
                            // old recalibrator tables carry no covariate header; assume its fixed covariate set
                            requestedCovariates.add( new ReadGroupCovariate() );
                            requestedCovariates.add( new QualityScoreCovariate() );
                            requestedCovariates.add( new CycleCovariate() );
                            requestedCovariates.add( new DinucCovariate() );
                        }
                        foundAllCovariates = true;
                        if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception
                        final boolean createCollapsedTables = true;

                        // Initialize the data hashMaps
                        dataManager = new RecalDataManager( estimatedCapacity, createCollapsedTables, requestedCovariates.size() );
                    }
                    addCSVData(line); // parse the line and add the data to the HashMap
                }
            }

        } catch ( FileNotFoundException e ) {
            Utils.scareUser("Can not find input file: " + RECAL_FILE);
        } catch ( NumberFormatException e ) {
            throw new StingException("Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
        }
        logger.info( "...done!" );

        logger.info( "The covariates being used here: " );
        logger.info( requestedCovariates );

        // Create the collapsed tables that are used in the sequential calculation
        if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
            logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
            dataManager.generateEmpiricalQualities( requestedCovariates.size(), SMOOTHING );
            logger.info( "...done!" );
        }
    }

    /**
     * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
     * @param line A line of CSV data read from the recalibration table data file
     */
    private void addCSVData(String line) {
        String[] vals = line.split(",");
        // The first requestedCovariates.size() fields are covariate values; the final two are the counts
        ArrayList<Comparable> key = new ArrayList<Comparable>();
        Covariate cov;
        int iii;
        for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
            cov = requestedCovariates.get( iii );
            if( VALIDATE_OLD_RECALIBRATOR ) {
                if( iii == 1 ) { // order is different in the old recalibrator
                    key.add( cov.getValue( vals[2] ) );
                } else if ( iii == 2 ) {
                    key.add( cov.getValue( vals[1] ) );
                } else {
                    key.add( cov.getValue( vals[iii] ) );
                }
            } else {
                key.add( cov.getValue( vals[iii] ) );
            }
        }
        // after the loop iii points at the first count column: observations then mismatches
        RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ) );
        dataManager.addToAllTables( key, datum );
    }

    //---------------------------------------------------------------------------------------------------------------
    //
    // map
    //
    //---------------------------------------------------------------------------------------------------------------

    /**
     * For each base in the read calculate a new recalibrated quality score and replace the quality scores in the read
     * @param refBases References bases over the length of the read
     * @param read The read to be recalibrated
     * @return The read with quality scores replaced
     */
    public SAMRecord map( char[] refBases, SAMRecord read ) {

        // WARNING: refBases is always null because this walker doesn't have @Requires({DataSource.REFERENCE_BASES})
        //   This is done in order to speed up the code

        byte[] originalQuals = read.getBaseQualities();
        // Check if we need to use the original quality scores instead
        if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
            Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
            if ( obj instanceof String )
                originalQuals = QualityUtils.fastqToPhred((String)obj);
            else {
                throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
            }
        }
        byte[] recalQuals = originalQuals.clone();

        // These calls are all expensive so only do them once for each read
        final SAMReadGroupRecord readGroup = read.getReadGroup();
        final String readGroupId = readGroup.getReadGroupId();
        String platform = readGroup.getPlatform();
        final boolean isNegStrand = read.getReadNegativeStrandFlag();
        if( USE_SLX_PLATFORM ) {
            platform = "ILLUMINA";
        }
        byte[] bases = read.getReadBases();
        // skip the first base of a forward read / last base of a reverse read since the dinuc covariate has no previous base there
        int startPos = 1;
        int stopPos = bases.length;
        if( isNegStrand ) { // DinucCovariate is responsible for getting the complement base if needed
            startPos = 0;
            stopPos = bases.length - 1;
        }
        ReadHashDatum readDatum = new ReadHashDatum( readGroupId, platform, originalQuals, bases, isNegStrand, read.getMappingQuality(), bases.length );

        // For each base in the read
        for( int iii = startPos; iii < stopPos; iii++ ) { // skip first or last base because there is no dinuc depending on the direction of the read

            // Get the covariate values which make up the key
            for( Covariate covariate : requestedCovariates ) {
                fullCovariateKey.add( covariate.getValue( readDatum, iii ) ); // offset is zero based so passing iii is correct here
            }

            recalQuals[iii] = performSequentialQualityCalculation( fullCovariateKey );
            fullCovariateKey.clear();
        }

        preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low

        // SOLID bams insert the reference base into the read if the color space quality is zero, so don't change their base quality scores
        if( platform.equalsIgnoreCase("SOLID") ) {
            // NOTE(review): if the CQ tag is absent, fastqToPhred may be handed null before the null check below — confirm fastqToPhred tolerates null
            byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
            if(colorSpaceQuals != null) { preserveBadColorSpaceQualities_SOLID( originalQuals, recalQuals, colorSpaceQuals ); }
        }

        read.setBaseQualities(recalQuals); // overwrite old qualities with new recalibrated qualities
        if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // save the old qualities if the tag isn't already taken in the read
            read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals));
        }

        return read;
    }

    /**
     * Implements a serial recalibration of the reads using the combinational table.
     * First, we perform a positional recalibration, and then a subsequent dinuc correction.
     *
     * Given the full recalibration table, we perform the following preprocessing steps:
     *
     *   - calculate the global quality score shift across all data [DeltaQ]
     *   - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
     *      -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
     *   - The final shift equation is:
     *
     *      Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
     * @param key The list of Comparables that were calculated from the covariates
     * @return A recalibrated quality score as a byte
     */
    private byte performSequentialQualityCalculation( List<? extends Comparable> key ) {

        String readGroupKeyElement = key.get(0).toString();
        int qualityScoreKeyElement = Integer.parseInt(key.get(1).toString());
        byte qualFromRead = (byte)qualityScoreKeyElement;

        // The global quality shift (over the read group only)
        collapsedTableKey.add( readGroupKeyElement );
        RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get(collapsedTableKey);
        Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get(collapsedTableKey);
        double globalDeltaQ = 0.0;
        double aggregrateQreported = 0.0;
        if( globalDeltaQDatum != null ) {
            // aggregate reported quality = phred-scale of (sum of expected errors / total observations)
            aggregrateQreported = QualityUtils.phredScaleErrorRate( dataManager.dataSumExpectedErrors.get(collapsedTableKey) / ((double) globalDeltaQDatum.getNumObservations()) );
            globalDeltaQ = globalDeltaQEmpirical - aggregrateQreported;
        }

        // The shift in quality between reported and empirical
        collapsedTableKey.add( qualityScoreKeyElement );
        Double deltaQReportedEmpirical = dataManager.getCollapsedDoubleTable(1).get(collapsedTableKey);
        double deltaQReported = 0.0;
        if( deltaQReportedEmpirical != null ) {
            deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
        }

        // The shift in quality due to each covariate by itself in turn
        double deltaQCovariates = 0.0;
        Double deltaQCovariateEmpirical;
        double deltaQPos = 0.0;   // only tracked for the old-recalibrator validation printouts
        double deltaQDinuc = 0.0; // only tracked for the old-recalibrator validation printouts
        for( int iii = 2; iii < key.size(); iii++ ) {
            collapsedTableKey.add( key.get(iii) ); // the given covariate
            deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get(collapsedTableKey);
            if( deltaQCovariateEmpirical != null ) {
                deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
                if( VALIDATE_OLD_RECALIBRATOR ) {
                    if(iii==2) { deltaQPos = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); } // BUGBUG: only here to validate against the old recalibrator
                    if(iii==3) { deltaQDinuc = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); }
                }
            }
            collapsedTableKey.remove( 2 ); // this new covariate is always added in at position 2 in the collapsedTableKey list
        }

        double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
        byte newQualityByte = QualityUtils.boundQual( (int)Math.round(newQuality), QualityUtils.MAX_REASONABLE_Q_SCORE );

        // Verbose printouts used to validate with old recalibrator
        //if(key.contains(null)) {
        //    System.out.println( key  + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d",
        //                 qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte));
        //}
        //else {
        //    System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d",
        //                 key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) );
        //}

        collapsedTableKey.clear();
        return newQualityByte;
    }

    /**
     * Loop of the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
     * @param originalQuals The list of original base quality scores
     * @param recalQuals A list of the new recalibrated quality scores
     */
    private void preserveQScores( final byte[] originalQuals, byte[] recalQuals ) {
        for( int iii = 0; iii < recalQuals.length; iii++ ) {
            if ( originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN ) {
                recalQuals[iii] = originalQuals[iii];
            }
        }
    }

    /**
     * Loop of the list of qualities and overwrite the newly recalibrated score to be the original score if the color space quality is zero
     * @param originalQuals The list of original base quality scores
     * @param recalQuals A list of the new recalibrated quality scores
     * @param colorSpaceQuals The list of color space quality scores for this read
     */
    private void preserveBadColorSpaceQualities_SOLID( final byte[] originalQuals, byte[] recalQuals, final byte[] colorSpaceQuals ) {
        for( int iii = 0; iii < recalQuals.length; iii++ ) {
            if ( colorSpaceQuals[iii] <= 0 ) { //BUGBUG: This isn't exactly correct yet
                recalQuals[iii] = originalQuals[iii];
            }
        }
    }

    //---------------------------------------------------------------------------------------------------------------
    //
    // reduce
    //
    //---------------------------------------------------------------------------------------------------------------

    /**
     * Start the reduce with a handle to the output bam file (no other setup is needed)
     * @return A FileWriter pointing to a new bam file
     */
    public SAMFileWriter reduceInit() {
        return OUTPUT_BAM;
    }

    /**
     * Output each read to disk
     * @param read The read to output
     * @param output The FileWriter to write the read to
     * @return The FileWriter
     */
    public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) {
        if ( output != null ) {
            output.addAlignment(read);
        } else {
            out.println(read.format()); // no output BAM was supplied; dump the read as text instead
        }
        return output;
    }
}

View File

@ -1,184 +0,0 @@
// our package
package org.broadinstitute.sting.gatk.walkers.recalibration;
import java.util.*;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileHeader;
// the imports for unit testing.
import org.junit.Assert;
import org.junit.Test;
import org.junit.Before;
import org.junit.BeforeClass;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileReader;
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import java.util.HashSet;
import java.io.FileNotFoundException;
import java.io.File;
/**
* Basic unit test for RecalData
*/
public class CovariateCounterTest extends BaseTest {
    // Test fixture: two read groups, three artificial reads built from these bases/quals
    String readGroup1 = "rg1";
    String readGroup2 = "rg2";
    Set<String> readGroups = new HashSet<String>();
    SAMFileHeader header;
    SAMRecord read1, read2, read3;
    byte bases1[] = {'a', 't', 'c', 'g', 'a'};
    byte quals1[] = {1, 2, 3, 4, 5};
    byte quals3[] = {1, 2, 5, 5, 5}; // same bases as read1 but diverging quals from index 2 on
    byte bases2[] = {'t', 'c', 'g', 'a', 't'};
    byte quals2[] = {2, 2, 4, 5, 2};
/*
public CovariateCounter( Set<String> readGroups, boolean collapsePos, boolean collapseDinuc ) {
public Set<String> getReadGroups() {
public boolean isCollapseDinuc() {
public boolean isCollapsePos() {
public int getNReadGroups() {
private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
public List<RecalData> getRecalData(String readGroup) {
public int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) {
*/
    /**
     * The fasta, for comparison.
     */
    protected static IndexedFastaSequenceFile sequenceFile = null;

    // The counter under test, rebuilt before each test case
    CovariateCounter c;
    /**
     * Initialize the fasta once for the whole class.
     * NOTE(review): depends on a hard-coded reference path under seqLocation being present on the host.
     */
    @BeforeClass
    public static void initialize() throws FileNotFoundException {
        sequenceFile = new IndexedFastaSequenceFile( new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") );
        GenomeLocParser.setupRefContigOrdering(sequenceFile);
    }
    /**
     * Per-test setup: build a fresh header with two read groups, a fresh counter,
     * and the three artificial reads. readGroups must be populated before the
     * CovariateCounter is constructed since the counter snapshots that set.
     */
    @Before
    public void initializeBefore() {
        header = ArtificialSAMUtils.createArtificialSamHeader(2,0,247249719);
        readGroups.addAll(Arrays.asList(readGroup1, readGroup2));
        ArtificialSAMUtils.createDefaultReadGroup( header, readGroup1, "sample1" );
        ArtificialSAMUtils.createDefaultReadGroup( header, readGroup2, "sample2" );
        c = new CovariateCounter( readGroups, false, false, false );
        read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1, bases1, quals1);
        read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",1,1, bases2, quals2);
        read3 = ArtificialSAMUtils.createArtificialRead(header,"read3",1,1, bases1, quals3);
    }
@Test
public void testCovariateCounterSetup() {
Assert.assertEquals("Number of read groups is wrong", c.getNReadGroups(), 2);
Assert.assertEquals("Read group identities are wrong", c.getReadGroups(), readGroups);
Assert.assertEquals("Incorrectly collapsed counter", c.isCollapseDinuc(), false);
Assert.assertEquals("Incorrectly collapsed counter", c.isCollapsePos(), false);
}
    /**
     * Feed a single read into the counter and verify every base (past the first,
     * which has no dinuc) lands in exactly one recal bin with the expected
     * dinuc/qual identity and counts of N=1, B=0.
     */
    @Test
    public void testOneRead() {
        for ( int i = 1; i < read1.getReadBases().length; i++ )
            c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
        c.printState();

        // position 0 was never updated, so its bin must hold zero observations
        Assert.assertEquals("Incorrect mapping to recal bin", c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N, 0);
        for ( int i = 1; i < bases1.length; i++ ) {
            RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]);
            System.out.printf("%s%n", datum);
            Assert.assertNotNull("Incorrect mapping to recal bin", datum);
            Assert.assertEquals("Bad mismatch count", datum.B, 0);
            Assert.assertEquals("Bad base count", datum.N, 1);
            Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]);
            Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]);
            Assert.assertEquals("Qual is bad", datum.qual, quals1[i]);
        }
    }
    /**
     * Feed two different reads into two different read groups and verify read1's
     * bins in readGroup1 are untouched by read2's data (each bin still N=1, B=0).
     */
    @Test
    public void testTwoReads() {
        for ( int i = 1; i < read1.getReadBases().length; i++ )
            c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
        for ( int i = 1; i < read2.getReadBases().length; i++ )
            c.updateDataFromRead(readGroup2, read2, i, (char)read2.getReadBases()[i], false);
        c.printState();

        // position 0 was never updated, so its bin must hold zero observations
        Assert.assertEquals("Incorrect mapping to recal bin", c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N, 0);
        for ( int i = 1; i < bases1.length; i++ ) {
            RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]);
            System.out.printf("%s%n", datum);
            Assert.assertNotNull("Incorrect mapping to recal bin", datum);
            Assert.assertEquals("Bad mismatch count", datum.B, 0);
            Assert.assertEquals("Bad base count", datum.N, 1);
            Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]);
            Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]);
            Assert.assertEquals("Qual is bad", datum.qual, quals1[i]);
        }
    }
@Test
public void testTwoReadsSameGroup() {
    // Add read1 to the counter twice under the same read group: every
    // covariate bin it touches should then hold exactly two observations.
    for ( int i = 1; i < read1.getReadBases().length; i++ )
        c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
    // BUGFIX: this second pass was previously bounded by read2's length while
    // feeding read1's bases — an out-of-bounds access if read2 is longer than
    // read1, or a short count (failing the N==2 assertion) if it is shorter.
    // Both passes over read1 must be bounded by read1's own length.
    for ( int i = 1; i < read1.getReadBases().length; i++ )
        c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i], false);
    c.printState();
    for ( int i = 1; i < bases1.length; i++ ) {
        RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]);
        System.out.printf("%s%n", datum);
        Assert.assertNotNull("Incorrect mapping to recal bin", datum);
        Assert.assertEquals("Bad mismatch count", datum.B, 0);
        Assert.assertEquals("Bad base count", datum.N, 2);
        Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]);
        Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]);
        Assert.assertEquals("Qual is bad", datum.qual, quals1[i]);
    }
}
@Test
public void testTwoReadsSameGroupNotIdentical() {
    // read1 and read3 share a read group. Where their quals agree the two
    // reads presumably land in the same bin (N == 2); where they differ each
    // read gets its own qual bin (N == 1) — relies on read3 sharing read1's
    // base sequence (fixture defined elsewhere in this file).
    for ( int pos = 1; pos < read1.getReadBases().length; pos++ )
        c.updateDataFromRead(readGroup1, read1, pos, (char)read1.getReadBases()[pos], false);
    for ( int pos = 1; pos < read3.getReadBases().length; pos++ )
        c.updateDataFromRead(readGroup1, read3, pos, (char)read3.getReadBases()[pos], false);
    c.printState();
    for ( int pos = 1; pos < bases1.length; pos++ ) {
        final RecalData bin = c.getRecalData(readGroup1, pos, quals1[pos], (char)bases1[pos-1], (char)bases1[pos]);
        System.out.printf("%s%n", bin);
        Assert.assertNotNull("Incorrect mapping to recal bin", bin);
        Assert.assertEquals("Bad mismatch count", bin.B, 0);
        Assert.assertEquals("Bad base count", bin.N, quals1[pos] == quals3[pos] ? 2 : 1);
        Assert.assertEquals("Prevbase is bad", bin.dinuc.charAt(0), bases1[pos-1]);
        Assert.assertEquals("Base is bad", bin.dinuc.charAt(1), bases1[pos]);
        Assert.assertEquals("Qual is bad", bin.qual, quals1[pos]);
    }
}
@Test (expected = RuntimeException.class)
public void testBadReadOffset() {
    // Updating at offset 0 must throw — presumably because there is no
    // preceding base to form the dinucleotide covariate (TODO confirm in
    // CovariateCounter.updateDataFromRead).
    final byte[] seq   = {'a', 't', 'c', 'g', 'a'};
    final byte[] quals = {1, 2, 3, 4, 5};
    final SAMRecord badRead = ArtificialSAMUtils.createArtificialRead(header, "read1", 1, 1, seq, quals);
    c.updateDataFromRead(readGroup1, badRead, 0, (char)seq[0], false);
}
}

View File

@ -1,183 +0,0 @@
// our package
package org.broadinstitute.sting.gatk.walkers.recalibration;
// the imports for unit testing.
import org.junit.Assert;
import org.junit.Test;
import org.junit.Before;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.HashSet;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
 * Unit tests for RecalData: construction, mismatch counting, empirical
 * quality computation, CSV (de)serialization, and dinucleotide indexing.
 */
public class RecalDataTest extends BaseTest {
    RecalData datum, starDatum;   // ordinary bin ("AA") and collapsed wildcard bin ("**")

    /**
     * Each test starts from a fresh bin at position 2, reported qual 3,
     * read group "0", holding one observation (N=1) that was a mismatch (B=1).
     */
    @Before
    public void before() {
        datum = new RecalData(2, 3, "0", "AA");
        datum.B = datum.N = 1;
        starDatum = new RecalData(2, 3, "0", "**");
        starDatum.B = starDatum.N = 1;
    }

    @Test
    public void testBasic() {
        // FIX: this previously logged "Executing testIsBetween", a copy-paste
        // from an unrelated test.
        logger.warn("Executing testBasic");
        Assert.assertTrue(datum.N == 1);
        Assert.assertTrue(datum.B == 1);
        Assert.assertTrue(datum.pos == 2);
        Assert.assertTrue(datum.qual == 3);
        Assert.assertTrue(datum.readGroup.equals("0"));
        Assert.assertTrue(datum.dinuc.equals("AA"));
        Assert.assertTrue(starDatum.N == 1);
        Assert.assertTrue(starDatum.B == 1);
        Assert.assertTrue(starDatum.pos == 2);
        Assert.assertTrue(starDatum.qual == 3);
        Assert.assertTrue(starDatum.readGroup.equals("0"));
        Assert.assertTrue(starDatum.dinuc.equals("**"));
    }

    @Test
    public void testInc() {
        logger.warn("Executing testInc");
        // inc(long, long): add raw observation/mismatch counts directly.
        datum.inc(1L, 0L);
        Assert.assertTrue(datum.N == 2);
        Assert.assertTrue(datum.B == 1);
        // inc(char, char): equal bases count an observation only ...
        datum.inc('A', 'A');
        Assert.assertTrue(datum.N == 3);
        Assert.assertTrue(datum.B == 1);
        // ... while differing bases also count a mismatch.
        datum.inc('A', 'C');
        Assert.assertTrue(datum.N == 4);
        Assert.assertTrue(datum.B == 2);
    }

    @Test
    public void testEmpQual() {
        logger.warn("Executing testEmpQual");
        // Empirical quality is the phred-scaled mismatch rate -10*log10(B/N).
        datum.B = 1;
        datum.N = 1;
        Assert.assertEquals(datum.empiricalQualDouble(), 0.0, 1e-5);
        Assert.assertEquals(datum.empiricalQualByte(), 1);
        datum.B = 1;
        datum.N = 2;
        Assert.assertEquals(datum.empiricalQualDouble(), 3.0103, 1e-5);
        Assert.assertEquals(datum.empiricalQualByte(), 3);
        datum.B = 1;
        datum.N = 3;
        Assert.assertEquals(datum.empiricalQualDouble(), 4.771213, 1e-5);
        Assert.assertEquals(datum.empiricalQualByte(), 5);
        // FIX: this step previously read "datum.B = 1; datum.B = 2;" — a
        // double assignment to B that silently relied on N still being 3 from
        // the step above. The expected value 1.760913 == -10*log10(2/3)
        // confirms B=2, N=3 was intended; make both assignments explicit.
        datum.B = 2;
        datum.N = 3;
        Assert.assertEquals(datum.empiricalQualDouble(), 1.760913, 1e-5);
        Assert.assertEquals(datum.empiricalQualByte(), 2);
        datum.B = 1;
        datum.N = 10;
        Assert.assertEquals(datum.empiricalQualDouble(), 10.0, 1e-5);
        Assert.assertEquals(datum.empiricalQualByte(), 10);
        datum.B = 1;
        datum.N = 100;
        Assert.assertEquals(datum.empiricalQualDouble(), 20.0, 1e-5);
        Assert.assertEquals(datum.empiricalQualByte(), 20);
        datum.B = 1;
        datum.N = 1000;
        Assert.assertEquals(datum.empiricalQualDouble(), 30.0, 1e-5);
        Assert.assertEquals(datum.empiricalQualByte(), 30);
        // Zero mismatches: the estimate is capped at MAX_REASONABLE_Q_SCORE.
        datum.B = 0;
        datum.N = 1000;
        Assert.assertEquals(datum.empiricalQualDouble(), QualityUtils.MAX_REASONABLE_Q_SCORE, 1e-5);
        Assert.assertEquals(datum.empiricalQualByte(), QualityUtils.MAX_REASONABLE_Q_SCORE);
    }

    // FIX: this method was missing @Test and was never executed by JUnit 4.
    @Test
    public void testtoCSVString() {
        logger.warn("Executing testtoCSVString");
        // toCSVString(true) collapses the position column to "*".
        Assert.assertEquals(datum.toCSVString(false), "0,AA,3,2,1,1,0");
        Assert.assertEquals(datum.toCSVString(true), "0,AA,3,*,1,1,0");
    }

    // FIX: this method was missing @Test and was never executed by JUnit 4.
    @Test
    public void testFromCSVString() {
        logger.warn("Executing testFromCSVString");
        // Round-trip through fromCSVString must reproduce the serialized form,
        // for both concrete and collapsed ("*" position, "**" dinuc) rows.
        Assert.assertEquals(RecalData.fromCSVString("0,AA,3,2,1,1,0").toCSVString(false), datum.toCSVString(false));
        Assert.assertEquals(RecalData.fromCSVString("0,AA,3,*,1,1,0").toCSVString(false), datum.toCSVString(true));
        Assert.assertEquals(RecalData.fromCSVString("0,**,3,2,1,1,0").toCSVString(false), starDatum.toCSVString(false));
        Assert.assertEquals(RecalData.fromCSVString("0,**,3,*,1,1,0").toCSVString(false), starDatum.toCSVString(true));
    }

    // FIX: this method was missing @Test and was never executed by JUnit 4.
    @Test
    public void testDinucIndex() {
        logger.warn("Executing testDinucIndex");
        HashSet<Integer> indices = new HashSet<Integer>();
        HashSet<Byte> unknownBytes = new HashSet<Byte>();
        byte bases[] = {'A', 'C', 'G', 'T', '*', 'N'};
        unknownBytes.add((byte)'*');
        unknownBytes.add((byte)'N');
        // Every ordered pair of bases must index consistently via the String
        // and the two-byte overloads; pairs containing '*' or 'N' map to -1,
        // and valid indices round-trip through dinucIndex2bases.
        for ( int i = 0; i < bases.length; i++ ) {
            for ( int j = 0; j < bases.length; j++ ) {
                byte[] bp = {bases[i], bases[j]};
                String s = new String(bp);
                int index = RecalData.dinucIndex(s);
                indices.add(index);
                Assert.assertEquals(index, RecalData.dinucIndex(bases[i], bases[j]));
                if ( index != -1 ) {
                    Assert.assertEquals(RecalData.dinucIndex2bases(index), s);
                } else {
                    Assert.assertTrue(unknownBytes.contains(bp[0]) || unknownBytes.contains(bp[1]) );
                }
            }
        }
        // 16 valid dinucs plus the shared -1 sentinel.
        Assert.assertEquals(indices.size(), 17);
    }
}

View File

@ -29,7 +29,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
" -T CountCovariates" +
" -I " + bam +
" -L 1:10,000,000-11,000,000" +
" --params %s",
" --validate_old_recalibrator" +
" --use_slx_platform" +
" -recalFile %s",
1, // just one output file
Arrays.asList(md5));
List<File> result = executeTest("testCountCovariates1", spec).getFirst();
@ -56,8 +58,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
" -T TableRecalibration" +
" -I " + bam +
" -L 1:10,000,000-20,000,000" +
" --outputBam %s" +
" --params " + paramsFile,
" --validate_old_recalibrator" +
" --use_slx_platform" +
" -outputBam %s" +
" -recalFile " + paramsFile,
1, // just one output file
Arrays.asList(md5));
executeTest("testTableRecalibrator1", spec);

View File

@ -1,71 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;
import java.util.HashMap;
import java.util.Map;
import java.util.Arrays;
import java.util.List;
import java.io.File;
/**
 * Integration tests for the refactored recalibration walkers
 * (CountCovariatesRefactored / TableRecalibrationRefactored), checked
 * against known-good output md5s per input bam.
 */
public class RefactoredRecalibrationWalkersIntegrationTest extends WalkerTest {
    // Recal param files produced by testCountCovariatesR, keyed by input bam
    // and consumed by testTableRecalibratorR (assumes that test runs second).
    static HashMap<String, String> paramsFiles = new HashMap<String, String>();

    @Test
    public void testCountCovariatesR() {
        HashMap<String, String> expectedMd5s = new HashMap<String, String>();
        expectedMd5s.put("/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.SLX.SRP000031.2009_06.chr1.10_20mb.bam", "7be0b7a624d22187e712131f12aae546");
        //expectedMd5s.put("/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "");
        expectedMd5s.put("/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "dc1a1b6e99f4a47525cc1dce7b6eb1dc");

        for ( Map.Entry<String, String> pair : expectedMd5s.entrySet() ) {
            final String bam = pair.getKey();
            final String args =
                    "-R /broad/1KG/reference/human_b36_both.fasta" +
                    " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                    " -T CountCovariatesRefactored" +
                    " -I " + bam +
                    " -L 1:10,000,000-11,000,000" +
                    " --validate_old_recalibrator" +
                    " --use_slx_platform" +
                    " -recalFile %s";
            WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                    args,
                    1, // just one output file
                    Arrays.asList(pair.getValue()));
            List<File> outputs = executeTest("testCountCovariatesR", spec).getFirst();
            // Remember where the recal table landed so the table-recalibration
            // test can reuse it.
            paramsFiles.put(bam, outputs.get(0).getAbsolutePath());
        }
    }

    @Test
    public void testTableRecalibratorR() {
        HashMap<String, String> expectedMd5s = new HashMap<String, String>();
        expectedMd5s.put("/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.SLX.SRP000031.2009_06.chr1.10_20mb.bam", "3ace4b9b8495429cc32e7008145f4996");
        //expectedMd5s.put("/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "");
        //expectedMd5s.put("/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "");

        for ( Map.Entry<String, String> pair : expectedMd5s.entrySet() ) {
            final String bam = pair.getKey();
            final String paramsFile = paramsFiles.get(bam);
            System.out.printf("PARAMS FOR %s is %s%n", bam, paramsFile);
            // Skip bams for which the covariate-counting test produced no
            // params file (e.g. when that test did not run first).
            if ( paramsFile == null )
                continue;
            final String args =
                    "-R /broad/1KG/reference/human_b36_both.fasta" +
                    " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                    " -T TableRecalibrationRefactored" +
                    " -I " + bam +
                    " -L 1:10,000,000-20,000,000" +
                    " --validate_old_recalibrator" +
                    " --use_slx_platform" +
                    " -outputBam %s" +
                    " -recalFile " + paramsFile;
            WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                    args,
                    1, // just one output file
                    Arrays.asList(pair.getValue()));
            executeTest("testTableRecalibratorR", spec);
        }
    }
}

View File

@ -8,7 +8,6 @@ import org.junit.Assert;
import org.junit.Test;
import org.junit.Before;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalData;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.HashSet;
import java.util.Arrays;

View File

@ -8,7 +8,6 @@ import org.junit.Assert;
import org.junit.Test;
import org.junit.Before;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalData;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.*;