Recalibrator now uses NestedHashMap instead of NHashMap. The keys are now nested hash maps instead of Lists of Comparables. This results in a big speed-up (thanks, Tim!). There is still a little bit of cleanup to do, but everything works now.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2474 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7826e144a1
commit
96c4929b3c
|
|
@ -177,7 +177,7 @@ class AnalyzeCovariatesCLP extends CommandLineProgram {
|
|||
key.add( cov.getValue( vals[iii] ) );
|
||||
}
|
||||
// Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ) );
|
||||
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
dataManager.addToAllTables( key, datum, IGNORE_QSCORES_LESS_THAN );
|
||||
|
||||
|
|
|
|||
|
|
@ -86,12 +86,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
@Argument(fullName="process_nth_locus", shortName="pN", required=false, doc="Only process every Nth covered locus we see.")
|
||||
private int PROCESS_EVERY_NTH_LOCUS = 1;
|
||||
|
||||
/////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="sorted_output", shortName="sorted", required=false, doc="The output table recalibration csv file will be in sorted order at the cost of added overhead. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
|
||||
private boolean SORTED_OUTPUT = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
|
|
@ -103,7 +97,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base
|
||||
private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted.
|
||||
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
|
||||
private static final String versionString = "v2.1.3"; // Major version, minor version, and build number
|
||||
private static final String versionString = "v2.2.0"; // Major version, minor version, and build number
|
||||
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
|
||||
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
|
||||
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
||||
|
|
@ -364,7 +358,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
*/
|
||||
private void updateDataFromRead(final SAMRecord read, final int offset, final byte refBase) {
|
||||
|
||||
List<Comparable> key = new ArrayList<Comparable>();
|
||||
List<Object> key = new ArrayList<Object>();
|
||||
|
||||
// Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||
for( Covariate covariate : requestedCovariates ) {
|
||||
|
|
@ -372,14 +366,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
}
|
||||
|
||||
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
||||
RecalDatum datum = dataManager.data.get( key );
|
||||
RecalDatum datum = (RecalDatum) dataManager.data.get( key.toArray() );
|
||||
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
|
||||
datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
|
||||
if( SORTED_OUTPUT ) {
|
||||
dataManager.data.sortedPut( key, datum );
|
||||
} else {
|
||||
dataManager.data.put( key, datum );
|
||||
}
|
||||
dataManager.data.put( datum, key.toArray() );
|
||||
}
|
||||
|
||||
// Need the bases to determine whether or not we have a mismatch
|
||||
|
|
@ -459,26 +449,15 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
}
|
||||
recalTableStream.println("nObservations,nMismatches,Qempirical");
|
||||
|
||||
|
||||
if( SORTED_OUTPUT && requestedCovariates.size() == 4 )
|
||||
{
|
||||
for( Pair<List<? extends Comparable>,RecalDatum> entry : dataManager.data.entrySetSorted() ) {
|
||||
for( Comparable comp : entry.first ) {
|
||||
recalTableStream.print( comp + "," );
|
||||
}
|
||||
recalTableStream.println( entry.second.outputToCSV() );
|
||||
}
|
||||
} else {
|
||||
// For each entry in the data hashmap
|
||||
for( Map.Entry<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySet() ) {
|
||||
// For each Covariate in the key
|
||||
for( Comparable comp : entry.getKey() ) {
|
||||
// Output the Covariate's value
|
||||
recalTableStream.print( comp + "," );
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
recalTableStream.println( entry.getValue().outputToCSV() );
|
||||
// For each entry in the data hashmap
|
||||
for( Pair<Object[], Object> entry : dataManager.data.entrySetSorted() ) {
|
||||
// For each Covariate in the key
|
||||
for( Object comp : entry.first ) {
|
||||
// Output the Covariate's value
|
||||
recalTableStream.print( comp + "," );
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
recalTableStream.println( ((RecalDatum)entry.second).outputToCSV() );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,11 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.NHashMap;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -48,14 +44,10 @@ import net.sf.samtools.SAMReadGroupRecord;
|
|||
|
||||
public class RecalDataManager {
|
||||
|
||||
public NHashMap<RecalDatum> data; // The full dataset
|
||||
private NHashMap<RecalDatum> dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
private NHashMap<RecalDatum> dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
|
||||
private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
|
||||
private NHashMap<Double> dataCollapsedReadGroupDouble; // Table of empirical qualities where everything except read group has been collapsed
|
||||
private NHashMap<Double> dataCollapsedQualityScoreDouble; // Table of empirical qualities where everything except read group and quality score has been collapsed
|
||||
private ArrayList<NHashMap<Double>> dataCollapsedByCovariateDouble; // Table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
|
||||
public NestedHashMap data; // The full dataset
|
||||
private NestedHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
private NestedHashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
|
||||
private ArrayList<NestedHashMap> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
|
||||
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
|
||||
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
|
||||
|
|
@ -65,76 +57,70 @@ public class RecalDataManager {
|
|||
private static boolean warnUserNoColorSpace = false;
|
||||
|
||||
RecalDataManager() {
|
||||
data = new NHashMap<RecalDatum>();
|
||||
data = new NestedHashMap();
|
||||
}
|
||||
|
||||
RecalDataManager( final int estimatedCapacity, final boolean createCollapsedTables, final int numCovariates ) {
|
||||
RecalDataManager( final boolean createCollapsedTables, final int numCovariates ) {
|
||||
if( createCollapsedTables ) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker
|
||||
dataCollapsedReadGroup = new NHashMap<RecalDatum>();
|
||||
dataCollapsedQualityScore = new NHashMap<RecalDatum>();
|
||||
dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>();
|
||||
dataCollapsedReadGroup = new NestedHashMap();
|
||||
dataCollapsedQualityScore = new NestedHashMap();
|
||||
dataCollapsedByCovariate = new ArrayList<NestedHashMap>();
|
||||
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
|
||||
dataCollapsedByCovariate.add( new NHashMap<RecalDatum>() );
|
||||
dataCollapsedByCovariate.add( new NestedHashMap() );
|
||||
}
|
||||
} else {
|
||||
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f);
|
||||
data = new NestedHashMap();
|
||||
}
|
||||
}
|
||||
|
||||
RecalDataManager( final int estimatedCapacity ) {
|
||||
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f ); // Second arg is the 'loading factor',
|
||||
// a number to monkey around with when optimizing performace of the HashMap
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the given mapping to all of the collapsed hash tables
|
||||
* @param key The list of comparables that is the key for this mapping
|
||||
* @param fullDatum The RecalDatum which is the data for this mapping
|
||||
* @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table
|
||||
*/
|
||||
public final void addToAllTables( final List<? extends Comparable> key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN ) {
|
||||
public final void addToAllTables( final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN ) {
|
||||
|
||||
// The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
|
||||
//data.put(key, thisDatum); // add the mapping to the main table
|
||||
|
||||
int qualityScore = Integer.parseInt( key.get(1).toString() );
|
||||
ArrayList<Comparable> newKey;
|
||||
int qualityScore = Integer.parseInt( key[1].toString() );
|
||||
final Object[] readGroupCollapsedKey = new Object[1];
|
||||
final Object[] qualityScoreCollapsedKey = new Object[2];
|
||||
final Object[] covariateCollapsedKey = new Object[3];
|
||||
RecalDatum collapsedDatum;
|
||||
|
||||
// Create dataCollapsedReadGroup, the table where everything except read group has been collapsed
|
||||
if( qualityScore >= PRESERVE_QSCORES_LESS_THAN ) {
|
||||
newKey = new ArrayList<Comparable>();
|
||||
newKey.add( key.get(0) ); // Make a new key with just the read group
|
||||
collapsedDatum = dataCollapsedReadGroup.get( newKey );
|
||||
readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group
|
||||
collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get( readGroupCollapsedKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) );
|
||||
dataCollapsedReadGroup.put( new RecalDatum(fullDatum), readGroupCollapsedKey );
|
||||
} else {
|
||||
collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported
|
||||
}
|
||||
}
|
||||
|
||||
newKey = new ArrayList<Comparable>();
|
||||
// Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
|
||||
newKey.add( key.get(0) ); // Make a new key with the read group ...
|
||||
newKey.add( key.get(1) ); // and quality score
|
||||
collapsedDatum = dataCollapsedQualityScore.get( newKey );
|
||||
qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ...
|
||||
qualityScoreCollapsedKey[1] = key[1]; // and quality score
|
||||
collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get( qualityScoreCollapsedKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedQualityScore.put( newKey, new RecalDatum(fullDatum) );
|
||||
dataCollapsedQualityScore.put( new RecalDatum(fullDatum), qualityScoreCollapsedKey );
|
||||
} else {
|
||||
collapsedDatum.increment( fullDatum );
|
||||
}
|
||||
|
||||
// Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) {
|
||||
newKey = new ArrayList<Comparable>();
|
||||
newKey.add( key.get(0) ); // Make a new key with the read group ...
|
||||
newKey.add( key.get(1) ); // and quality score ...
|
||||
Comparable theCovariateElement = key.get(iii + 2); // and the given covariate
|
||||
covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ...
|
||||
covariateCollapsedKey[1] = key[1]; // and quality score ...
|
||||
Object theCovariateElement = key[iii + 2]; // and the given covariate
|
||||
if( theCovariateElement != null ) {
|
||||
newKey.add( theCovariateElement );
|
||||
collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey );
|
||||
covariateCollapsedKey[2] = theCovariateElement;
|
||||
collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(iii).get( covariateCollapsedKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) );
|
||||
dataCollapsedByCovariate.get(iii).put( new RecalDatum(fullDatum), covariateCollapsedKey );
|
||||
} else {
|
||||
collapsedDatum.increment( fullDatum );
|
||||
}
|
||||
|
|
@ -145,43 +131,21 @@ public class RecalDataManager {
|
|||
/**
|
||||
* Loop over all the collapsed tables and turn the recalDatums found there into an empricial quality score
|
||||
* that will be used in the sequential calculation in TableRecalibrationWalker
|
||||
* @param numCovariates The number of covariates you have determines the number of tables to create
|
||||
* @param smoothing The smoothing paramter that goes into empirical quality score calculation
|
||||
*/
|
||||
public final void generateEmpiricalQualities( final int numCovariates, final int smoothing ) {
|
||||
public final void generateEmpiricalQualities( final int smoothing ) {
|
||||
|
||||
dataCollapsedReadGroupDouble = new NHashMap<Double>();
|
||||
dataCollapsedQualityScoreDouble = new NHashMap<Double>();
|
||||
dataCollapsedByCovariateDouble = new ArrayList<NHashMap<Double>>();
|
||||
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
|
||||
dataCollapsedByCovariateDouble.add( new NHashMap<Double>() );
|
||||
for( Pair<Object[],Object> entry : dataCollapsedReadGroup.entrySetSorted() ) {
|
||||
((RecalDatum)entry.second).calcCombinedEmpiricalQuality(smoothing);
|
||||
}
|
||||
|
||||
// Hash the empirical quality scores so we don't have to call Math.log at every base for every read
|
||||
// Looping over the entrySet is really expensive but worth it
|
||||
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) {
|
||||
dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) );
|
||||
for( Pair<Object[],Object> entry : dataCollapsedQualityScore.entrySetSorted() ) {
|
||||
((RecalDatum)entry.second).calcCombinedEmpiricalQuality(smoothing);
|
||||
}
|
||||
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) {
|
||||
dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) );
|
||||
}
|
||||
for( int iii = 0; iii < numCovariates - 2; iii++ ) {
|
||||
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedByCovariate.get(iii).entrySet() ) {
|
||||
dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) );
|
||||
for( NestedHashMap map : dataCollapsedByCovariate ) {
|
||||
for( Pair<Object[],Object> entry : map.entrySetSorted() ) {
|
||||
((RecalDatum)entry.second).calcCombinedEmpiricalQuality(smoothing);
|
||||
}
|
||||
}
|
||||
|
||||
dataCollapsedQualityScore.clear();
|
||||
for( int iii = 0; iii < numCovariates - 2; iii++ ) {
|
||||
dataCollapsedByCovariate.get(iii).clear();
|
||||
}
|
||||
dataCollapsedByCovariate.clear();
|
||||
dataCollapsedQualityScore = null; // Will never need this table again
|
||||
dataCollapsedByCovariate = null; // Will never need this table again
|
||||
if( data != null ) {
|
||||
data.clear();
|
||||
data = null; // Will never need this table again
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -189,7 +153,7 @@ public class RecalDataManager {
|
|||
* @param covariate Which covariate indexes the desired collapsed HashMap
|
||||
* @return The desired collapsed HashMap
|
||||
*/
|
||||
public final NHashMap<RecalDatum> getCollapsedTable( final int covariate ) {
|
||||
public final NestedHashMap getCollapsedTable( final int covariate ) {
|
||||
if( covariate == 0) {
|
||||
return dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
} else if( covariate == 1 ) {
|
||||
|
|
@ -199,21 +163,6 @@ public class RecalDataManager {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the appropriate collapsed table of emprical quality out of the set of all the tables held by this Object
|
||||
* @param covariate Which covariate indexes the desired collapsed NHashMap<Double>
|
||||
* @return The desired collapsed NHashMap<Double>
|
||||
*/
|
||||
public final NHashMap<Double> getCollapsedDoubleTable( final int covariate ) {
|
||||
if( covariate == 0) {
|
||||
return dataCollapsedReadGroupDouble;
|
||||
} else if( covariate == 1 ) {
|
||||
return dataCollapsedQualityScoreDouble; // Table of empirical qualities where everything except read group and quality score has been collapsed
|
||||
} else {
|
||||
return dataCollapsedByCovariateDouble.get( covariate - 2 ); // Table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string
|
||||
* @param read The read to adjust
|
||||
|
|
|
|||
|
|
@ -43,6 +43,7 @@ public class RecalDatum {
|
|||
private long numObservations; // number of bases seen in total
|
||||
private long numMismatches; // number of bases seen that didn't match the reference
|
||||
private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
|
||||
private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -53,18 +54,21 @@ public class RecalDatum {
|
|||
numObservations = 0L;
|
||||
numMismatches = 0L;
|
||||
estimatedQReported = 0.0;
|
||||
empiricalQuality = 0.0;
|
||||
}
|
||||
|
||||
public RecalDatum( final long _numObservations, final long _numMismatches, final double _estimatedQReported ) {
|
||||
/**
 * Construct a datum from pre-computed counts and quality values, as parsed from a recalibration csv line.
 * @param _numObservations total number of bases seen
 * @param _numMismatches number of bases seen that didn't match the reference
 * @param _estimatedQReported estimated reported quality score for the combined data
 * @param _empiricalQuality empirical quality for datums that have been collapsed together
 */
public RecalDatum( final long _numObservations, final long _numMismatches, final double _estimatedQReported, final double _empiricalQuality ) {
    this.numObservations = _numObservations;
    this.numMismatches = _numMismatches;
    this.estimatedQReported = _estimatedQReported;
    this.empiricalQuality = _empiricalQuality;
}
|
||||
|
||||
/**
 * Copy constructor: duplicate all counts and quality fields of another datum.
 * @param copy the datum whose state is duplicated
 */
public RecalDatum( final RecalDatum copy ) {
    numObservations = copy.numObservations;
    numMismatches = copy.numMismatches;
    estimatedQReported = copy.estimatedQReported;
    empiricalQuality = copy.empiricalQuality;
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -115,6 +119,8 @@ public class RecalDatum {
|
|||
}
|
||||
public final double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero
|
||||
|
||||
public final void calcCombinedEmpiricalQuality(final int smoothing){ this.empiricalQuality = empiricalQualDouble(smoothing); }
|
||||
|
||||
|
||||
public final byte empiricalQualByte( final int smoothing ) {
|
||||
double doubleMismatches = (double) ( numMismatches + smoothing );
|
||||
|
|
@ -148,6 +154,10 @@ public class RecalDatum {
|
|||
return estimatedQReported;
|
||||
}
|
||||
|
||||
public final double getEmpiricalQuality() {
|
||||
return empiricalQuality;
|
||||
}
|
||||
|
||||
private double calcExpectedErrors() {
|
||||
return (double)this.numObservations * qualToErrorProb( estimatedQReported );
|
||||
}
|
||||
|
|
|
|||
|
|
@ -94,7 +94,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
private static final String versionString = "v2.1.3"; // Major version, minor version, and build number
|
||||
private static final String versionString = "v2.2.0"; // Major version, minor version, and build number
|
||||
private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
|
||||
private Random coinFlip; // Random number generator is used to remove reference bias in solid bams
|
||||
private static final long RANDOM_SEED = 1032861495;
|
||||
|
|
@ -145,7 +145,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
requestedCovariates = new ArrayList<Covariate>();
|
||||
|
||||
// Read in the data from the csv file and populate the map
|
||||
logger.info( "Reading in the data from input file..." );
|
||||
logger.info( "Reading in the data from input csv file..." );
|
||||
|
||||
try {
|
||||
for ( String line : new xReadLines(new File( RAC.RECAL_FILE )) ) {
|
||||
|
|
@ -203,7 +203,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
cov.initialize( RAC );
|
||||
}
|
||||
// Initialize the data hashMaps
|
||||
dataManager = new RecalDataManager( estimatedCapacity, createCollapsedTables, requestedCovariates.size() );
|
||||
dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() );
|
||||
|
||||
}
|
||||
addCSVData(line); // Parse the line and add the data to the HashMap
|
||||
|
|
@ -224,7 +224,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
// Create the tables of empirical quality scores that will be used in the sequential calculation
|
||||
logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
|
||||
dataManager.generateEmpiricalQualities( requestedCovariates.size(), SMOOTHING );
|
||||
dataManager.generateEmpiricalQualities( SMOOTHING );
|
||||
logger.info( "...done!" );
|
||||
|
||||
// Take the header of the input SAM file and tweak it by adding in a new programRecord with the version number and list of covariates that were used
|
||||
|
|
@ -273,7 +273,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
|
||||
}
|
||||
|
||||
ArrayList<Comparable> key = new ArrayList<Comparable>();
|
||||
ArrayList<Object> key = new ArrayList<Object>();
|
||||
Covariate cov;
|
||||
int iii;
|
||||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
||||
|
|
@ -281,9 +281,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
key.add( cov.getValue( vals[iii] ) );
|
||||
}
|
||||
// Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ) );
|
||||
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN );
|
||||
dataManager.addToAllTables( key.toArray(), datum, PRESERVE_QSCORES_LESS_THAN );
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -361,28 +361,31 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
// The global quality shift (over the read group only)
|
||||
collapsedTableKey.add( readGroupKeyElement );
|
||||
Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get( collapsedTableKey );
|
||||
RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( collapsedTableKey.toArray() ));
|
||||
double globalDeltaQ = 0.0;
|
||||
if( globalDeltaQEmpirical != null ) {
|
||||
double aggregrateQReported = dataManager.getCollapsedTable(0).get( collapsedTableKey ).getEstimatedQReported();
|
||||
if( globalRecalDatum != null ) {
|
||||
double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
|
||||
double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
|
||||
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
|
||||
}
|
||||
|
||||
// The shift in quality between reported and empirical
|
||||
collapsedTableKey.add( qualityScoreKeyElement );
|
||||
Double deltaQReportedEmpirical = dataManager.getCollapsedDoubleTable(1).get( collapsedTableKey );
|
||||
RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( collapsedTableKey.toArray() ));
|
||||
double deltaQReported = 0.0;
|
||||
if( deltaQReportedEmpirical != null ) {
|
||||
if( qReportedRecalDatum != null ) {
|
||||
double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
|
||||
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
||||
}
|
||||
|
||||
// The shift in quality due to each covariate by itself in turn
|
||||
double deltaQCovariates = 0.0;
|
||||
Double deltaQCovariateEmpirical;
|
||||
double deltaQCovariateEmpirical;
|
||||
for( int iii = 2; iii < key.size(); iii++ ) {
|
||||
collapsedTableKey.add( key.get(iii) ); // The given covariate
|
||||
deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get( collapsedTableKey );
|
||||
if( deltaQCovariateEmpirical != null ) {
|
||||
RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( collapsedTableKey.toArray() ));
|
||||
if( covariateRecalDatum != null ) {
|
||||
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
|
||||
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
|
||||
}
|
||||
collapsedTableKey.remove( 2 ); // This new covariate is always added in at position 2 in the collapsedTableKey list
|
||||
|
|
|
|||
|
|
@ -0,0 +1,141 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Dec 29, 2009
|
||||
*/
|
||||
|
||||
public class NestedHashMap{
|
||||
|
||||
private final Map data = new HashMap<Object, Object>();
|
||||
private ArrayList<ArrayList<Comparable>> keyLists; // Used to output the mappings in sorted order
|
||||
|
||||
public Object get( final Object... keys ) {
|
||||
Map map = this.data;
|
||||
for( int iii = 0; iii < keys.length; iii++ ) {
|
||||
if( iii == keys.length - 1 ) {
|
||||
return map.get(keys[iii]);
|
||||
}
|
||||
else {
|
||||
map = (Map) map.get(keys[iii]);
|
||||
if( map == null ) { return null; }
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public void put( final Object value, final Object... keys ) {
|
||||
|
||||
if( keyLists == null ) {
|
||||
keyLists = new ArrayList<ArrayList<Comparable>>();
|
||||
for( Object obj : keys ) {
|
||||
keyLists.add( new ArrayList<Comparable>() );
|
||||
}
|
||||
}
|
||||
|
||||
ArrayList<Comparable> thisList;
|
||||
for( int iii = 0; iii < keys.length; iii++ ) {
|
||||
thisList = keyLists.get( iii );
|
||||
if( thisList == null ) {
|
||||
thisList = new ArrayList<Comparable>();
|
||||
}
|
||||
if( !thisList.contains( (Comparable)keys[iii] ) ) {
|
||||
thisList.add( (Comparable)keys[iii] );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Map map = this.data;
|
||||
for( int iii = 0; iii < keys.length; iii++ ) {
|
||||
if( iii == keys.length - 1 ) {
|
||||
map.put(keys[iii], value);
|
||||
}
|
||||
else {
|
||||
Map tmp = (Map) map.get(keys[iii]);
|
||||
if( tmp == null ) {
|
||||
tmp = new HashMap();
|
||||
map.put(keys[iii], tmp);
|
||||
}
|
||||
map = tmp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public ArrayList<Pair<Object[], Object>> entrySetSorted() {
|
||||
|
||||
ArrayList<Pair<Object[], Object>> theSet = new ArrayList<Pair<Object[], Object>>();
|
||||
|
||||
for( ArrayList<Comparable> list : keyLists ) {
|
||||
Collections.sort(list);
|
||||
}
|
||||
|
||||
int[] keyIndex = new int[ keyLists.size() ];
|
||||
int[] maxIndex = new int[ keyLists.size() ];
|
||||
for( int iii = 0; iii < keyLists.size(); iii++ ) {
|
||||
keyIndex[iii] = 0;
|
||||
maxIndex[iii] = keyLists.get(iii).size();
|
||||
}
|
||||
|
||||
// Try all the possible keys in sorted order, add them to the output set if they are in the hashMap
|
||||
// BUGBUG: make this more efficient
|
||||
boolean triedAllKeys = false;
|
||||
ArrayList<Object> newKey = null;
|
||||
while( !triedAllKeys ) {
|
||||
newKey = new ArrayList<Object>();
|
||||
for( int iii = 0; iii < keyLists.size(); iii++ ) {
|
||||
newKey.add(keyLists.get(iii).get(keyIndex[iii]));
|
||||
}
|
||||
Object value = this.get( newKey.toArray() );
|
||||
if( value!= null ) {
|
||||
theSet.add(new Pair<Object[], Object>( newKey.toArray(), value ) );
|
||||
}
|
||||
|
||||
// Increment the keyIndex
|
||||
keyIndex[keyLists.size() - 1]++;
|
||||
for( int iii = keyLists.size() - 1; iii >= 0; iii-- ) {
|
||||
if( keyIndex[iii] >= maxIndex[iii] ) { // Carry it forward
|
||||
keyIndex[iii] = 0;
|
||||
if( iii > 0 ) {
|
||||
keyIndex[iii-1]++;
|
||||
} else {
|
||||
triedAllKeys = true;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return theSet;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -36,7 +36,6 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
" -cov QualityScoreCovariate" +
|
||||
" -cov CycleCovariate" +
|
||||
" -cov DinucCovariate" +
|
||||
" --sorted_output" +
|
||||
" --solid_recal_mode SET_Q_ZERO" +
|
||||
" -recalFile %s",
|
||||
1, // just one output file
|
||||
|
|
@ -96,7 +95,6 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
" -cov QualityScoreCovariate" +
|
||||
" -cov CycleCovariate" +
|
||||
" -cov DinucCovariate" +
|
||||
" --sorted_output" +
|
||||
" --solid_recal_mode SET_Q_ZERO" +
|
||||
" -recalFile %s",
|
||||
1, // just one output file
|
||||
|
|
@ -125,7 +123,6 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
" -cov CycleCovariate" +
|
||||
" -cov DinucCovariate" +
|
||||
" --default_platform illumina" +
|
||||
" --sorted_output" +
|
||||
" --solid_recal_mode SET_Q_ZERO" +
|
||||
" -recalFile %s",
|
||||
1, // just one output file
|
||||
|
|
@ -166,7 +163,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testCountCovariatesNoIndex() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "906e5a08401722cc9a5528d2fd20ea6a" );
|
||||
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "850f2a2d5bc94cc22b3b038b424252c6" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -179,7 +176,6 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
" -I " + bam +
|
||||
" -cov ReadGroupCovariate" +
|
||||
" -cov QualityScoreCovariate" +
|
||||
" --sorted_output" +
|
||||
" --solid_recal_mode DO_NOTHING" +
|
||||
" -recalFile %s" +
|
||||
" -U",
|
||||
|
|
|
|||
Loading…
Reference in New Issue