Added some documentation to the code, mostly especially to CovariateCounterWalker but various comments added throughout. Also changed the HashMap data structure to accept an estimated initial capacity. This had a very modest improvement to the speed.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2001 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-11-09 20:13:56 +00:00
parent 74751a8ed3
commit 740a5484c4
12 changed files with 554 additions and 95 deletions

View File

@ -2,12 +2,41 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Oct 30, 2009
*
* The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read and corresponding reference bases
*/
public interface Covariate {
public Comparable<?> getValue(SAMRecord read, int offset, char[] refBases); // used to pick out the value from the read and etc
public Comparable<?> getValue(String str); // used to get value from input file
public Comparable getValue(SAMRecord read, int offset, char[] refBases); // used to pick out the value from the read and reference bases
public Comparable getValue(String str); // used to get value from input file
public int estimatedNumberOfBins(); // used to estimate the amount space required in the hashmap
}

View File

@ -16,10 +16,47 @@ import java.util.*;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 3, 2009
*
* This walker is designed to work as the first pass in a two-pass processing step.
* It does a by-locus traversal operating only at sites that are not in dbSNP.
* We assume that all mismatches we see are therefore errors.
* This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinuc)
* Since there is a large amount of data one can then calculate an empirical probability of error
* given the particular covariates seen at this site, where p(error) = num mismatches / num observations
* The output file is a CSV list of (several covariate values, num observations, num mismatches, empirical quality score)
* The first lines of the output file give the name of the covariate classes that were used for this calculation.
*
* Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and are automatically added first.
* Note: This walker is designed to be used in conjunction with TableRecalibrationWalker.
*/
@WalkerName("CountCovariatesRefactored")
@ -36,15 +73,18 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
@Argument(fullName="recal_file", shortName="rf", required=false, doc="Filename for the outputted covariates table recalibration file")
public String RECAL_FILE = "output.recal_data.csv";
protected static RecalDataManager dataManager;
protected static ArrayList<Covariate> requestedCovariates;
protected static RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
protected static ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
/**
* Parse the -cov arguments and create a list of covariates to be used here
* Based on the covariates' estimates for initial capacity allocate the data hashmap
*/
public void initialize() {
dataManager = new RecalDataManager();
// Get a list of all available covariates
List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);
int estimatedCapacity = 1; // start at one because capacity is multiplicative for each covariate
// Print and exit if that's what was requested
if ( LIST_ONLY ) {
@ -57,7 +97,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
System.exit( 0 ); // early exit here because user requested it
}
// Initialize the requested covariates
// Initialize the requested covariates by parsing the -cov argument
requestedCovariates = new ArrayList<Covariate>();
requestedCovariates.add( new ReadGroupCovariate() ); // Read Group Covariate is a required covariate for the recalibration calculation
requestedCovariates.add( new QualityScoreCovariate() ); // Quality Score Covariate is a required covariate for the recalibration calculation
@ -67,11 +107,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
boolean foundClass = false;
for( Class<?> covClass : classes ) {
if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) {
if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class
foundClass = true;
try {
Covariate covariate = (Covariate)covClass.newInstance();
requestedCovariates.add( covariate );
estimatedCapacity *= covariate.estimatedNumberOfBins(); // update the estimated initial capacity
if (covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate) {
throw new StingException( "ReadGroupCovariate and QualityScoreCovariate are required covariates and are therefore added for you. Please remove them from the -cov list" );
}
@ -92,27 +134,48 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
logger.info( "The covariates being used here: " );
logger.info( requestedCovariates );
dataManager = new RecalDataManager( estimatedCapacity );
}
//---------------------------------------------------------------------------------------------------------------
//
// map
//
//---------------------------------------------------------------------------------------------------------------
/**
* For each read at this locus get the various covariate values and increment that location in the map based on
* whether or not the base matches the reference at this particular location
* @param tracker the reference metadata tracker
* @param ref the reference context
* @param context the alignment context
* @return returns 1, but this value isn't used in the reduce step
*/
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
// Only use data from non-dbsnp sites
// Assume every mismatch at a non-dbsnp site is indicitive of poor quality
if( dbsnp == null ) {
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
SAMRecord read; // preallocate for use in for loop below
int offset; // preallocate for use in for loop below
SAMRecord read;
int offset;
// For each read at this locus
for( int iii = 0; iii < reads.size(); iii++ ) {
read = reads.get(iii);
offset = offsets.get(iii);
// Only use data from reads with mapping quality above given quality value and base quality greater than zero
// Only use data from reads with mapping quality above specified quality value and base quality greater than zero
byte[] quals = read.getBaseQualities();
if( read.getMappingQuality() >= MIN_MAPPING_QUALITY && quals[offset] > 0)
{
if( offset > 0 && offset < (read.getReadLength() - 1) ) { // skip first and last bases because they don't have a dinuc count
{
// Skip first and last bases because they don't have a dinuc count
if( offset > 0 && offset < (read.getReadLength() - 1) ) {
updateDataFromRead(read, offset, ref);
}
}
@ -124,43 +187,69 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
return 1;
}
/**
* Major workhorse routine for this walker.
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
* Using the list of covariate values as a key, pick out the RecalDatum and increment,
* adding one to the number of observations and potentially one to the number of mismatches
* @param read the read
* @param offset the offset in the read for this locus
* @param ref the reference context
*/
private void updateDataFromRead(SAMRecord read, int offset, ReferenceContext ref) {
List<Comparable<?>> key = new ArrayList<Comparable<?>>();
Comparable<?> keyElement; // preallocate for use in for loop below
List<Comparable> key = new ArrayList<Comparable>();
Comparable keyElement;
boolean badKey = false;
// Loop through the list of requested covariates and pick out the value from the read, offset, and reference
for( Covariate covariate : requestedCovariates ) {
keyElement = covariate.getValue( read, offset, ref.getBases() );
if( keyElement != null ) {
key.add( keyElement );
} else {
badKey = true;
badKey = true; // covariate returned bad value, for example dinuc returns null because base = 'N'
}
}
// Using the list of covariate values as a key, pick out the RecalDatum
RecalDatum datum = null;
if( !badKey ) {
datum = dataManager.data.get( key );
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
datum = new RecalDatum();
datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
dataManager.data.put( key, datum );
}
}
// Need the bases to determine whether or not we have a mismatch
byte[] bases = read.getReadBases();
char base = (char)bases[offset];
char refBase = ref.getBase();
// Get the complement base strand if we are a negative direction read
if ( read.getReadNegativeStrandFlag() ) {
refBase = BaseUtils.simpleComplement( refBase );
base = BaseUtils.simpleComplement( base );
}
if( datum != null ) {
// Add one to the number of observations and potentially one to the number of mismatches
datum.increment( base, refBase );
}
}
//---------------------------------------------------------------------------------------------------------------
//
// reduce
//
//---------------------------------------------------------------------------------------------------------------
/**
* Initialize the reudce step by creating a PrintStream from the filename specified as an argument to the walker.
* @return returns a PrintStream created from the -rf filename
*/
public PrintStream reduceInit() {
try {
return new PrintStream( RECAL_FILE );
@ -169,10 +258,20 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
}
}
/**
* The Reduce method doesn't do anything for this walker.
* @param value result of the map.
* @param recalTableStream the PrintStream
* @return returns the PrintStream used to output the CSV data
*/
public PrintStream reduce( Integer value, PrintStream recalTableStream ) {
return recalTableStream; // nothing to do here
}
/**
* Write out the full data hashmap to disk in CSV format
* @param recalTableStream the PrintStream to write out to
*/
public void onTraversalDone( PrintStream recalTableStream ) {
out.print( "Writing raw recalibration data..." );
for( Covariate cov : requestedCovariates ) {
@ -184,16 +283,19 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
recalTableStream.close();
}
/**
* For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format
* @param recalTableStream the PrintStream to write out to
*/
private void outputToCSV( PrintStream recalTableStream ) {
for( Map.Entry<List<? extends Comparable<?>>, RecalDatum> entry : dataManager.data.entrySet() ) {
//for( int iii = 0; iii < entry.getKey().size(); iii++ ) {
//index = Integer.parseInt( (entry.getKey().get( iii )).toString());
//recalTableStream.print( requestedCovariates.get( iii ).getValue( index ) + "," );
// recalTableStream.print( entry.getKey().get(iii) )
//}
for( Comparable<?> comp : entry.getKey() ) {
// For each entry in the data hashmap
for( Map.Entry<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySet() ) {
// For each Covariate in the key
for( Comparable comp : entry.getKey() ) {
// Output the Covariate's value
recalTableStream.print( comp + "," );
}
// Output the RecalDatum entry
recalTableStream.println( entry.getValue().outputToCSV() );
}
}

View File

@ -2,16 +2,42 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Oct 30, 2009
*/
public class CycleCovariate implements Covariate {
public String platform;
public CycleCovariate() { // empty constructor is required by CovariateCounterWalker
public CycleCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
platform = null;
}
@ -19,8 +45,8 @@ public class CycleCovariate implements Covariate {
platform = _platform;
}
public Comparable<?> getValue(SAMRecord read, int offset, char[] refBases) {
//BUGBUG: assumes Solexia platform
public Comparable getValue(SAMRecord read, int offset, char[] refBases) {
//BUGBUG: assumes Solexa platform
Integer cycle = offset;
if( read.getReadNegativeStrandFlag() ) {
cycle = read.getReadLength() - (offset + 1);
@ -28,10 +54,14 @@ public class CycleCovariate implements Covariate {
return cycle;
}
public Comparable<?> getValue(String str) {
public Comparable getValue(String str) {
return Integer.parseInt( str );
}
public int estimatedNumberOfBins() {
return 100;
}
public String toString() {
return "Cycle";
}

View File

@ -5,16 +5,42 @@ import org.broadinstitute.sting.utils.BaseUtils;
import java.util.*;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 3, 2009
*/
public class DinucCovariate implements Covariate {
public static ArrayList<String> BASES;
public DinucCovariate() { // empty constructor is required by CovariateCounterWalker
public DinucCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
BASES = new ArrayList<String>();
BASES.add("A");
BASES.add("G");
@ -26,7 +52,7 @@ public class DinucCovariate implements Covariate {
BASES.add("t");
}
public Comparable<?> getValue(SAMRecord read, int offset, char[] refBases) {
public Comparable getValue(SAMRecord read, int offset, char[] refBases) {
byte[] bases = read.getReadBases();
char base = (char)bases[offset];
char prevBase = (char)bases[offset - 1];
@ -36,16 +62,20 @@ public class DinucCovariate implements Covariate {
}
// Check if bad base, probably an 'N'
if( !BASES.contains( String.format( "%c", prevBase ) ) || !BASES.contains( String.format( "%c", base) ) ) {
return null; // CovariateCounterWalker and TableRecalibrationWalker will recognize that null means skip this particular location in the read
return null; // CovariateCounterWalker will recognize that null means skip this particular location in the read
} else {
return String.format("%c%c", prevBase, base);
}
}
public Comparable<?> getValue(String str) {
public Comparable getValue(String str) {
return str;
}
public int estimatedNumberOfBins() {
return 16;
}
public String toString() {
return "Dinucleotide";
}

View File

@ -2,23 +2,53 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 4, 2009
*/
public class MappingQualityCovariate implements Covariate {
public MappingQualityCovariate() { // empty constructor is required by CovariateCounterWalker
public MappingQualityCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
}
public Comparable<?> getValue(SAMRecord read, int offset, char[] refBases) {
return (Integer)(read.getMappingQuality());
public Comparable getValue(SAMRecord read, int offset, char[] refBases) {
return (Integer)(read.getMappingQuality()); // cast to Object Integer is required so that return values from the two getValue methods hash to the same code
}
public Comparable<?> getValue(String str) {
public Comparable getValue(String str) {
return Integer.parseInt( str );
}
public int estimatedNumberOfBins() {
return 100;
}
public String toString() {
return "Mapping Quality Score";

View File

@ -3,17 +3,43 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.QualityUtils;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 4, 2009
*/
public class MinimumNQSCovariate implements Covariate {
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ";
protected boolean USE_ORIGINAL_QUALS;
public MinimumNQSCovariate() { // empty constructor is required by CovariateCounterWalker
public MinimumNQSCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
USE_ORIGINAL_QUALS = false;
}
@ -21,7 +47,7 @@ public class MinimumNQSCovariate implements Covariate {
USE_ORIGINAL_QUALS = originalQuals;
}
public Comparable<?> getValue(SAMRecord read, int offset, char[] refBases) {
public Comparable getValue(SAMRecord read, int offset, char[] refBases) {
byte[] quals = read.getBaseQualities();
if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG);
@ -41,10 +67,14 @@ public class MinimumNQSCovariate implements Covariate {
return minQual;
}
public Comparable<?> getValue(String str) {
public Comparable getValue(String str) {
return Integer.parseInt( str );
}
public int estimatedNumberOfBins() {
return 40;
}
public String toString() {
return "Minimum Neighborhood Quality Score";
}

View File

@ -2,18 +2,93 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import java.util.*;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Oct 30, 2009
*/
public class NHashMap<T> extends HashMap<List<? extends Comparable<?>>, T> {
public class NHashMap<T> extends HashMap<List<? extends Comparable>, T> {
private static final long serialVersionUID = 1L; //BUGBUG: what should I do here?
private ArrayList<ArrayList<Comparable>> keyLists;
public NHashMap() {
super();
keyLists = new ArrayList<ArrayList<Comparable>>();
}
public static <T extends Comparable<?>> List<T> makeList(T... args) {
public NHashMap( int initialCapacity, float loadingFactor ) {
super( initialCapacity, loadingFactor );
keyLists = new ArrayList<ArrayList<Comparable>>();
}
/*
// This method is here only to help facilitate direct comparison of old and refactored recalibrator.
// The old recalibrator prints out it's mappings in a sorted order but the refactored recalibrator doesn't need to.
// Comment out this overriding method to improve performance
public T put(List<? extends Comparable> key, T value) {
ArrayList<Comparable> thisList;
for( int iii = 0; iii < key.size(); iii++ ) {
thisList = keyLists.get( iii );
if( thisList == null ) {
thisList = new ArrayList<Comparable>();
}
if( !thisList.contains( key.get( iii ) ) ) {
thisList.add( key.get(iii ) );
}
}
return super.put( key, value );
}
// This method is here only to help facilitate direct comparison of old and refactored recalibrator.
// The old recalibrator prints out it's mappings in a sorted order but the refactored recalibrator doesn't need to.
// Comment out this overriding method to improve performance
public Set<Map.Entry<List<? extends Comparable>, T>> entrySet() {
for( ArrayList<Comparable> list : keyLists ) {
Collections.sort(list);
}
for( ArrayList<Comparable> list : keyLists ) {
for( Comparable comp : list ) {
// BUGBUG: unfinished
}
}
return null;
}
*/
public static <T extends Comparable> List<T> makeList(T... args) {
List<T> list = new ArrayList<T>();
for (T arg : args)
{

View File

@ -3,17 +3,43 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.QualityUtils;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 3, 2009
*/
public class QualityScoreCovariate implements Covariate {
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ";
protected boolean USE_ORIGINAL_QUALS;
public QualityScoreCovariate() { // empty constructor is required by CovariateCounterWalker
public QualityScoreCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
USE_ORIGINAL_QUALS = false;
}
@ -21,7 +47,7 @@ public class QualityScoreCovariate implements Covariate {
USE_ORIGINAL_QUALS = originalQuals;
}
public Comparable<?> getValue(SAMRecord read, int offset, char[] refBases) {
public Comparable getValue(SAMRecord read, int offset, char[] refBases) {
byte[] quals = read.getBaseQualities();
if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG);
@ -32,12 +58,16 @@ public class QualityScoreCovariate implements Covariate {
}
}
return ((Integer)((int)quals[offset]));
return ((Integer)((int)quals[offset])); // cast to Object Integer is required so that return values from the two getValue methods hash to the same code
}
public Comparable<?> getValue(String str) {
public Comparable getValue(String str) {
return Integer.parseInt( str );
}
public int estimatedNumberOfBins() {
return 40;
}
public String toString() {
return "Reported Quality Score";

View File

@ -2,24 +2,54 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Oct 30, 2009
*/
public class ReadGroupCovariate implements Covariate{
public ReadGroupCovariate() { // empty constructor is required by CovariateCounterWalker
public ReadGroupCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
}
public Comparable<?> getValue(SAMRecord read, int offset, char[] refBases) {
public Comparable getValue(SAMRecord read, int offset, char[] refBases) {
return read.getReadGroup().getReadGroupId();
}
public Comparable<?> getValue(String str) {
public Comparable getValue(String str) {
return str;
}
public int estimatedNumberOfBins() {
return 300;
}
public String toString() {
return "Read Group";
}

View File

@ -5,6 +5,31 @@ import org.broadinstitute.sting.utils.QualityUtils;
import java.util.*;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
@ -19,8 +44,9 @@ public class RecalDataManager {
public boolean collapsedTablesCreated;
public NHashMap<Double> dataSumExpectedErrors;
RecalDataManager() {
data = new NHashMap<RecalDatum>();
RecalDataManager( int estimatedCapacity ) {
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.95f ); // second arg is the 'loading factor',
// a number to monkey around with when optimizing performace of the HashMap
collapsedTablesCreated = false;
}
@ -37,17 +63,17 @@ public class RecalDataManager {
// preallocate for use in for loops below
RecalDatum thisDatum;
RecalDatum collapsedDatum;
List<? extends Comparable<?>> key;
ArrayList<Comparable<?>> newKey;
List<? extends Comparable> key;
ArrayList<Comparable> newKey;
Double sumExpectedErrors;
// for every data point in the map
for( Map.Entry<List<? extends Comparable<?>>,RecalDatum> entry : data.entrySet() ) {
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : data.entrySet() ) {
thisDatum = entry.getValue();
key = entry.getKey();
// create dataCollapsedReadGroup, the table where everything except read group has been collapsed
newKey = new ArrayList<Comparable<?>>();
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // make a new key with just the read group
collapsedDatum = dataCollapsedReadGroup.get( newKey );
if( collapsedDatum == null ) {
@ -57,7 +83,7 @@ public class RecalDataManager {
collapsedDatum.increment( thisDatum );
}
newKey = new ArrayList<Comparable<?>>();
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // make a new key with just the read group
sumExpectedErrors = dataSumExpectedErrors.get( newKey );
if( sumExpectedErrors == null ) {
@ -69,7 +95,7 @@ public class RecalDataManager {
dataSumExpectedErrors.put( newKey, sumExpectedErrors );
}
newKey = new ArrayList<Comparable<?>>();
newKey = new ArrayList<Comparable>();
// create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
newKey.add( key.get(0) ); // make a new key with the read group ...
newKey.add( key.get(1) ); // and quality score
@ -83,7 +109,7 @@ public class RecalDataManager {
// create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted
newKey = new ArrayList<Comparable<?>>();
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // make a new key with the read group ...
newKey.add( key.get(1) ); // and quality score ...
newKey.add( key.get(iii + 2) ); // and the given covariate

View File

@ -5,6 +5,31 @@ import org.broadinstitute.sting.utils.QualityUtils;
import java.util.*;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin

View File

@ -13,6 +13,31 @@ import java.util.regex.Pattern;
import java.io.File;
import java.io.FileNotFoundException;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
@ -56,14 +81,14 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
int lineNumber = 0;
boolean foundAllCovariates = false;
dataManager = new RecalDataManager();
int estimatedCapacity = 1;
// Read in the covariates that were used from the input file
requestedCovariates = new ArrayList<Covariate>();
// Read in the data from the csv file and populate the map
out.print( "Reading in the data from input file..." );
logger.info( "Reading in the data from input file..." );
try {
for ( String line : new xReadLines(new File( RECAL_FILE )) ) {
@ -80,6 +105,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
try {
Covariate covariate = (Covariate)covClass.newInstance();
requestedCovariates.add( covariate );
estimatedCapacity *= covariate.estimatedNumberOfBins();
} catch ( InstantiationException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
} catch ( IllegalAccessException e ) {
@ -100,6 +127,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
foundAllCovariates = true;
logger.info( "The covariates being used here: " );
logger.info( requestedCovariates );
dataManager = new RecalDataManager( estimatedCapacity );
}
addCSVData(line);
}
@ -110,12 +139,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
} catch ( NumberFormatException e ) {
throw new RuntimeException("Error parsing recalibration data at line " + lineNumber, e);
}
out.println( "...done!" );
logger.info( "...done!" );
if( MODE == RecalibrationMode.SEQUENTIAL ) {
out.print( "Creating collapsed tables for use in sequential calculation..." );
logger.info( "Creating collapsed tables for use in sequential calculation..." );
dataManager.createCollapsedTables( requestedCovariates.size() );
out.println( "...done!" );
logger.info( "...done!" );
}
//System.out.println(dataManager.getCollapsedTable(1));
@ -123,7 +152,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private void addCSVData(String line) {
String[] vals = line.split(",");
ArrayList<Comparable<?>> key = new ArrayList<Comparable<?>>();
ArrayList<Comparable> key = new ArrayList<Comparable>();
Covariate cov; // preallocated for use in for loop below
int iii;
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
@ -149,53 +178,44 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
byte[] recalQuals = originalQuals.clone();
// For each base in the read
Comparable<?> keyElement; // preallocate for use in for loops below
for( int iii = 1; iii < read.getReadLength() - 1; iii++ ) { // skip first and last base because there is no dinuc
ArrayList<Comparable<?>> key = new ArrayList<Comparable<?>>();
boolean badKey = false;
List<Comparable> key = new ArrayList<Comparable>();
for( Covariate covariate : requestedCovariates ) {
keyElement = covariate.getValue( read, iii, refBases );
if ( keyElement != null ) {
key.add( keyElement );
} else {
badKey = true;
}
key.add( covariate.getValue( read, iii, refBases ) );
}
if( !badKey ) {
if( MODE == RecalibrationMode.COMBINATORIAL ) {
RecalDatum datum = dataManager.data.get( key );
if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing
recalQuals[iii] = datum.empiricalQualByte( SMOOTHING );
}
} else if( MODE == RecalibrationMode.SEQUENTIAL ) {
recalQuals[iii] = performSequentialQualityCalculation( key );
} else {
throw new StingException( "Specified RecalibrationMode is not supported: " + MODE );
if( MODE == RecalibrationMode.COMBINATORIAL ) {
RecalDatum datum = dataManager.data.get( key );
if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing
recalQuals[iii] = datum.empiricalQualByte( SMOOTHING );
}
} else if( MODE == RecalibrationMode.SEQUENTIAL ) {
recalQuals[iii] = performSequentialQualityCalculation( key );
} else {
throw new StingException( "Specified RecalibrationMode is not supported: " + MODE );
}
// Do some error checking on the new quality score
if ( recalQuals[iii] <= 0 || recalQuals[iii] > QualityUtils.MAX_REASONABLE_Q_SCORE ) {
throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] );
}
// Do some error checking on the new quality score
if ( recalQuals[iii] <= 0 || recalQuals[iii] > QualityUtils.MAX_REASONABLE_Q_SCORE ) {
throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] );
}
}
preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low
read.setBaseQualities(recalQuals); // overwrite old qualities with new recalibrated qualities
if ( read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // save the old qualities if there is room in the read
if ( read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // save the old qualities if the tag isn't already taken in the read
read.setAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals));
}
return read;
}
private byte performSequentialQualityCalculation( ArrayList<? extends Comparable<?>> key ) {
private byte performSequentialQualityCalculation( List<? extends Comparable> key ) {
byte qualFromRead = Byte.parseByte(key.get(1).toString());
ArrayList<Comparable<?>> newKey;
ArrayList<Comparable> newKey;
newKey = new ArrayList<Comparable<?>>();
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // read group
RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get( newKey );
double globalDeltaQ = 0.0;
@ -206,7 +226,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
newKey = new ArrayList<Comparable<?>>();
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // read group
newKey.add( key.get(1) ); // quality score
RecalDatum deltaQReportedDatum = dataManager.getCollapsedTable(1).get( newKey );
@ -219,7 +239,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
double deltaQCovariates = 0.0;
RecalDatum deltaQCovariateDatum;
for( int iii = 2; iii < key.size(); iii++ ) {
newKey = new ArrayList<Comparable<?>>();
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // read group
newKey.add( key.get(1) ); // quality score
newKey.add( key.get(iii) ); // given covariate
@ -236,6 +256,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
throw new StingException( "Illegal base quality score calculated: " + key +
String.format( " => %d + %.2f + %.2f + %.2f = %d", qualFromRead, globalDeltaQ, deltaQReported, deltaQCovariates, newQualityByte ) );
}
//System.out.println("returning: " + newQualityByte);
return newQualityByte;
}