Merge branch 'master' of ssh://gsa1.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Eric Banks 2012-02-11 23:02:19 -05:00
commit c8c06c7753
27 changed files with 166 additions and 180 deletions

View File

@ -107,7 +107,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
}
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
// which active regions in the work queue are now safe to process
// which active regions in the work queue are now safe to process
if( !locusView.hasNext() ) {
for( final PileupElement p : locus.getBasePileup() ) {
final GATKSAMRecord read = p.getRead();
@ -135,7 +135,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
}
}
// Since we've sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
while( workQueue.peek() != null && (workQueue.peek().getExtendedLoc().getStop() < minStart || !workQueue.peek().getExtendedLoc().getContig().equals(dataProvider.getLocus().getContig())) ) {
final ActiveRegion activeRegion = workQueue.remove();
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
@ -190,7 +190,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
reads.removeAll( placedReads ); // remove all the reads which have been placed into their active region
logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
final M x = walker.map( activeRegion, null ); // BUGBUG: tracker needs to be filled in and passed to the walker
final M x = walker.map( activeRegion, null );
return walker.reduce( x, sum );
}

View File

@ -5,14 +5,12 @@ import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter;
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -77,7 +75,7 @@ public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<Map
public abstract double isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context);
// Map over the ActiveRegion
public abstract MapType map(final ActiveRegion activeRegion, final ReadMetaDataTracker metaDataTracker);
public abstract MapType map(final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker);
public final GenomeLocSortedSet extendIntervals( final GenomeLocSortedSet intervals, final GenomeLocParser genomeLocParser, IndexedFastaSequenceFile reference ) {
final int activeRegionExtension = this.getClass().getAnnotation(ActiveRegionExtension.class).extension();

View File

@ -22,8 +22,8 @@ import java.util.*;
* User: chartl
* Date: 9/14/11
* Time: 12:24 PM
* To change this template use File | Settings | File Templates.
*/
public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation {
private MendelianViolation mendelianViolation = null;

View File

@ -98,4 +98,9 @@ public class ContextCovariate implements StandardCovariate {
return s;
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Comparable getValue(final String str) {
return str;
}
}

View File

@ -53,6 +53,7 @@ public interface Covariate {
*/
public CovariateValues getValues(GATKSAMRecord read);
public Comparable getValue(String str); // Used to get the covariate's value from input csv file during on-the-fly recalibration
}
interface RequiredCovariate extends Covariate {}

View File

@ -15,18 +15,18 @@ public class CovariateKeySet {
private int nextCovariateIndex;
private final String mismatchesCovariateName = "M";
private final String insertionsCovariateName = "I";
private final String deletionsCovariateName = "D";
public final static String mismatchesCovariateName = "M";
public final static String insertionsCovariateName = "I";
public final static String deletionsCovariateName = "D";
public CovariateKeySet(int readLength, int numberOfCovariates) {
numberOfCovariates++; // +1 because we are adding the mismatch covariate (to comply with the molten table format)
this.mismatchesKeySet = new Object[readLength][numberOfCovariates];
this.insertionsKeySet = new Object[readLength][numberOfCovariates];
this.deletionsKeySet = new Object[readLength][numberOfCovariates];
initializeCovariateKeySet(this.mismatchesKeySet, this.mismatchesCovariateName);
initializeCovariateKeySet(this.insertionsKeySet, this.insertionsCovariateName);
initializeCovariateKeySet(this.deletionsKeySet, this.deletionsCovariateName);
initializeCovariateKeySet(this.mismatchesKeySet, mismatchesCovariateName);
initializeCovariateKeySet(this.insertionsKeySet, insertionsCovariateName);
initializeCovariateKeySet(this.deletionsKeySet, deletionsCovariateName);
this.nextCovariateIndex = 0;
}

View File

@ -196,4 +196,9 @@ public class CycleCovariate implements StandardCovariate {
return new CovariateValues(cycles, cycles, cycles);
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Comparable getValue(final String str) {
return Integer.parseInt(str);
}
}

View File

@ -39,39 +39,35 @@ import java.util.Arrays;
public class QualityScoreCovariate implements RequiredCovariate {
private byte defaultMismatchesQuality; // walker parameter. Must be > 0 to be used, otherwise we use the quality from the read.
private byte defaultInsertionsQuality; // walker parameter. Must be > 0 to be used, otherwise we use the quality from the read.
private byte defaultDeletionsQuality; // walker parameter. Must be > 0 to be used, otherwise we use the quality from the read.
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {
defaultMismatchesQuality = RAC.MISMATCHES_DEFAULT_QUALITY;
defaultInsertionsQuality = RAC.INSERTIONS_DEFAULT_QUALITY;
defaultDeletionsQuality = RAC.DELETIONS_DEFAULT_QUALITY;
}
@Override
public CovariateValues getValues(final GATKSAMRecord read) {
int readLength = read.getReadLength();
Byte [] mismatches = new Byte[readLength];
Byte [] insertions = new Byte[readLength];
Byte [] deletions = new Byte[readLength];
Integer [] mismatches = new Integer[readLength];
Integer [] insertions = new Integer[readLength];
Integer [] deletions = new Integer[readLength];
byte [] baseQualities = read.getBaseQualities();
byte [] baseInsertionQualities = read.getBaseInsertionQualities();
byte [] baseDeletionQualities = read.getBaseDeletionQualities();
if (defaultMismatchesQuality >= 0)
Arrays.fill(mismatches, defaultMismatchesQuality); // if the user decides to override the base qualities in the read, use the flat value
else {
for (int i=0; i<baseQualities.length; i++)
mismatches[i] = baseQualities[i];
for (int i=0; i<baseQualities.length; i++) {
mismatches[i] = (int) baseQualities[i];
insertions[i] = (int) baseInsertionQualities[i];
deletions[i] = (int) baseDeletionQualities[i];
}
Arrays.fill(insertions, defaultInsertionsQuality); // Some day in the future when base insertion and base deletion quals exist the samtools API will
Arrays.fill( deletions, defaultDeletionsQuality); // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat value (parameter)
return new CovariateValues(mismatches, insertions, deletions);
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Comparable getValue(final String str) {
return Integer.parseInt(str);
}
}

View File

@ -52,6 +52,12 @@ public class ReadGroupCovariate implements RequiredCovariate {
Arrays.fill(readGroups, readGroupId);
return new CovariateValues(readGroups, readGroups, readGroups);
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Comparable getValue(final String str) {
return str;
}
}

View File

@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -52,20 +53,23 @@ import java.util.Map;
*/
public class RecalDataManager {
public final NestedHashMap nestedHashMap; // The full dataset
private final NestedHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed
private final NestedHashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
private final ArrayList<NestedHashMap> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
public final NestedHashMap nestedHashMap; // The full dataset
private final HashMap<BaseRecalibrationType, NestedHashMap> dataCollapsedReadGroup; // Table where everything except read group has been collapsed
private final HashMap<BaseRecalibrationType, NestedHashMap> dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
private final HashMap<BaseRecalibrationType, ArrayList<NestedHashMap>> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
private static boolean warnUserNullReadGroup = false;
private static boolean warnUserNullPlatform = false;
private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\
public enum BaseRecalibrationType {
BASE_SUBSTITUTION,
BASE_INSERTION,
BASE_DELETION
}
public enum SOLID_RECAL_MODE {
/**
@ -109,13 +113,18 @@ public class RecalDataManager {
}
public RecalDataManager(final boolean createCollapsedTables, final int numCovariates) {
if (createCollapsedTables) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker
if (createCollapsedTables) { // Initialize all the collapsed tables, only used by on-the-fly recalibration
nestedHashMap = null;
dataCollapsedReadGroup = new NestedHashMap();
dataCollapsedQualityScore = new NestedHashMap();
dataCollapsedByCovariate = new ArrayList<NestedHashMap>();
for (int iii = 0; iii < numCovariates - 2; iii++) { // readGroup and QualityScore aren't counted here, their tables are separate
dataCollapsedByCovariate.add(new NestedHashMap());
dataCollapsedReadGroup = new HashMap<BaseRecalibrationType, NestedHashMap>();
dataCollapsedQualityScore = new HashMap<BaseRecalibrationType, NestedHashMap>();
dataCollapsedByCovariate = new HashMap<BaseRecalibrationType, ArrayList<NestedHashMap>>();
for ( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) {
dataCollapsedReadGroup.put(errorModel, new NestedHashMap());
dataCollapsedQualityScore.put(errorModel, new NestedHashMap());
dataCollapsedByCovariate.put(errorModel, new ArrayList<NestedHashMap>());
for (int iii = 0; iii < numCovariates - 2; iii++) { // readGroup and QualityScore aren't counted here, their tables are separate
dataCollapsedByCovariate.get(errorModel).add(new NestedHashMap());
}
}
}
else {
@ -137,7 +146,7 @@ public class RecalDataManager {
* @param fullDatum The RecalDatum which is the data for this mapping
* @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table
*/
public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN) {
public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN, final BaseRecalibrationType errorModel ) {
// The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
//data.put(key, thisDatum); // add the mapping to the main table
@ -151,9 +160,9 @@ public class RecalDataManager {
// Create dataCollapsedReadGroup, the table where everything except read group has been collapsed
if (qualityScore >= PRESERVE_QSCORES_LESS_THAN) {
readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group
collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(readGroupCollapsedKey);
collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(errorModel).get(readGroupCollapsedKey);
if (collapsedDatum == null) {
dataCollapsedReadGroup.put(new RecalDatum(fullDatum), readGroupCollapsedKey);
dataCollapsedReadGroup.get(errorModel).put(new RecalDatum(fullDatum), readGroupCollapsedKey);
}
else {
collapsedDatum.combine(fullDatum); // using combine instead of increment in order to calculate overall aggregateQReported
@ -163,24 +172,24 @@ public class RecalDataManager {
// Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ...
qualityScoreCollapsedKey[1] = key[1]; // and quality score
collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(qualityScoreCollapsedKey);
collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(errorModel).get(qualityScoreCollapsedKey);
if (collapsedDatum == null) {
dataCollapsedQualityScore.put(new RecalDatum(fullDatum), qualityScoreCollapsedKey);
dataCollapsedQualityScore.get(errorModel).put(new RecalDatum(fullDatum), qualityScoreCollapsedKey);
}
else {
collapsedDatum.increment(fullDatum);
}
// Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
for (int iii = 0; iii < dataCollapsedByCovariate.size(); iii++) {
for (int iii = 0; iii < dataCollapsedByCovariate.get(errorModel).size(); iii++) {
covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ...
covariateCollapsedKey[1] = key[1]; // and quality score ...
final Object theCovariateElement = key[iii + 2]; // and the given covariate
if (theCovariateElement != null) {
covariateCollapsedKey[2] = theCovariateElement;
collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(iii).get(covariateCollapsedKey);
collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(errorModel).get(iii).get(covariateCollapsedKey);
if (collapsedDatum == null) {
dataCollapsedByCovariate.get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey);
dataCollapsedByCovariate.get(errorModel).get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey);
}
else {
collapsedDatum.increment(fullDatum);
@ -198,11 +207,13 @@ public class RecalDataManager {
*/
public final void generateEmpiricalQualities(final int smoothing, final int maxQual) {
recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.data, smoothing, maxQual);
recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.data, smoothing, maxQual);
for (NestedHashMap map : dataCollapsedByCovariate) {
recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual);
checkForSingletons(map.data);
for( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) {
recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.get(errorModel).data, smoothing, maxQual);
recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.get(errorModel).data, smoothing, maxQual);
for (NestedHashMap map : dataCollapsedByCovariate.get(errorModel)) {
recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual);
checkForSingletons(map.data);
}
}
}
@ -241,15 +252,15 @@ public class RecalDataManager {
* @param covariate Which covariate indexes the desired collapsed HashMap
* @return The desired collapsed HashMap
*/
public final NestedHashMap getCollapsedTable(final int covariate) {
public final NestedHashMap getCollapsedTable(final int covariate, final BaseRecalibrationType errorModel) {
if (covariate == 0) {
return dataCollapsedReadGroup; // Table where everything except read group has been collapsed
return dataCollapsedReadGroup.get(errorModel); // Table where everything except read group has been collapsed
}
else if (covariate == 1) {
return dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
return dataCollapsedQualityScore.get(errorModel); // Table where everything except read group and quality score has been collapsed
}
else {
return dataCollapsedByCovariate.get(covariate - 2); // Table where everything except read group, quality score, and given covariate has been collapsed
return dataCollapsedByCovariate.get(errorModel).get(covariate - 2); // Table where everything except read group, quality score, and given covariate has been collapsed
}
}
@ -260,7 +271,7 @@ public class RecalDataManager {
* @param RAC The list of shared command line arguments
*/
public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) {
GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup();
GATKSAMReadGroupRecord readGroup = read.getReadGroup();
if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) {
readGroup.setPlatform(RAC.FORCE_PLATFORM);

View File

@ -56,7 +56,7 @@ public class ContextCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
byte[] bases = read.getReadBases();
for (int i = 0; i < read.getReadLength(); i++)
comparable[i] = (i < CONTEXT_SIZE) ? allN : new String(Arrays.copyOfRange(bases, i - CONTEXT_SIZE, i));

View File

@ -378,7 +378,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
}
RecalDataManager.parseColorSpace(gatkRead);
gatkRead.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalDataManager.computeCovariates(gatkRead, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION));
gatkRead.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalDataManager.computeCovariates(gatkRead, requestedCovariates));
}
// Skip this position if base quality is zero

View File

@ -43,7 +43,7 @@ public interface Covariate {
public Comparable getValue(String str); // Used to get the covariate's value from input csv file in TableRecalibrationWalker
public void getValues(GATKSAMRecord read, Comparable[] comparable, BaseRecalibration.BaseRecalibrationType modelType);
public void getValues(GATKSAMRecord read, Comparable[] comparable);
//Takes an array of size (at least) read.getReadLength() and fills it with covariate
//values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows
//read-specific calculations to be done just once rather than for each offset.

View File

@ -66,7 +66,7 @@ public class CycleCovariate implements StandardCovariate {
// Used to pick out the covariate's value from attributes of the read
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
//-----------------------------
// Illumina, Solid, PacBio, and Complete Genomics

View File

@ -66,7 +66,7 @@ public class DinucCovariate implements StandardCovariate {
* Takes an array of size (at least) read.getReadLength() and fills it with the covariate values for each position in the read.
*/
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
final HashMap<Integer, Dinuc> dinucHashMapRef = this.dinucHashMap; //optimize access to dinucHashMap
final int readLength = read.getReadLength();
final boolean negativeStrand = read.getReadNegativeStrandFlag();

View File

@ -82,7 +82,7 @@ public class GCContentCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
for (int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
}

View File

@ -95,7 +95,7 @@ public class HomopolymerCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
for (int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
}

View File

@ -55,7 +55,7 @@ public class MappingQualityCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
for (int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read); // BUGBUG: this can be optimized
}

View File

@ -65,7 +65,7 @@ public class MinimumNQSCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
for (int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
}

View File

@ -55,7 +55,7 @@ public class PositionCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
for (int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
}

View File

@ -62,7 +62,7 @@ public class PrimerRoundCovariate implements ExperimentalCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
for (int iii = 0; iii < read.getReadLength(); iii++) {
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
}

View File

@ -46,16 +46,10 @@ public class QualityScoreCovariate implements RequiredCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
if (modelType == BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION) {
byte[] baseQualities = read.getBaseQualities();
for (int i = 0; i < read.getReadLength(); i++) {
comparable[i] = (int) baseQualities[i];
}
}
else { // model == BASE_INSERTION || model == BASE_DELETION
Arrays.fill(comparable, 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will
// be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
byte[] baseQualities = read.getBaseQualities();
for (int i = 0; i < read.getReadLength(); i++) {
comparable[i] = (int) baseQualities[i];
}
}

View File

@ -44,7 +44,7 @@ public class ReadGroupCovariate implements RequiredCovariate {
}
@Override
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
public void getValues(final GATKSAMRecord read, final Comparable[] comparable) {
final String readGroupId = read.getReadGroup().getReadGroupId();
for (int i = 0; i < read.getReadLength(); i++) {
comparable[i] = readGroupId;

View File

@ -63,7 +63,6 @@ public class RecalDataManager {
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
private static boolean warnUserNullReadGroup = false;
private static boolean warnUserNullPlatform = false;
public enum SOLID_RECAL_MODE {
@ -604,7 +603,7 @@ public class RecalDataManager {
* value for the ith position in the read and the jth covariate in
* reqeustedCovariates list.
*/
public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List<Covariate> requestedCovariates, final BaseRecalibration.BaseRecalibrationType modelType) {
public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List<Covariate> requestedCovariates) {
//compute all covariates for this read
final int numRequestedCovariates = requestedCovariates.size();
final int readLength = gatkRead.getReadLength();
@ -613,7 +612,7 @@ public class RecalDataManager {
final Comparable[] tempCovariateValuesHolder = new Comparable[readLength];
for (int i = 0; i < numRequestedCovariates; i++) { // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
requestedCovariates.get(i).getValues(gatkRead, tempCovariateValuesHolder, modelType);
requestedCovariates.get(i).getValues(gatkRead, tempCovariateValuesHolder);
for (int j = 0; j < readLength; j++)
covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; // copy values into a 2D array that allows all covar types to be extracted at once for an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types.
}

View File

@ -405,7 +405,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
//compute all covariate values for this read
final Comparable[][] covariateValues_offset_x_covar = RecalDataManager.computeCovariates(read, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION);
final Comparable[][] covariateValues_offset_x_covar = RecalDataManager.computeCovariates(read, requestedCovariates);
// For each base in the read
for (int offset = 0; offset < read.getReadLength(); offset++) {

View File

@ -25,14 +25,12 @@
package org.broadinstitute.sting.utils.recalibration;
import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.bqsr.*;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.collections.NestedHashMap;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.text.XReadLines;
@ -52,19 +50,13 @@ import java.util.regex.Pattern;
public class BaseRecalibration {
public enum BaseRecalibrationType {
BASE_SUBSTITUTION,
BASE_INSERTION,
BASE_DELETION
}
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // List of covariates to be used in this calculation
public static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
public static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
public static final String EOF_MARKER = "EOF";
private static final int MAX_QUALITY_SCORE = 65; //BUGBUG: what value to use here?
private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values.
private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(...) for all sets of covariate values.
public BaseRecalibration( final File RECAL_FILE ) {
// Get a list of all available covariates
@ -89,7 +81,7 @@ public class BaseRecalibration {
throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
} else { // Found the covariate list in input file, loop through all of them and instantiate them
String[] vals = line.split(",");
for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
for( int iii = 0; iii < vals.length - 4; iii++ ) { // There are n-4 covariates. The last four items are ErrorModel, nObservations, nMismatch, and Qempirical
boolean foundClass = false;
for( Class<?> covClass : classes ) {
if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
@ -160,7 +152,7 @@ public class BaseRecalibration {
final String[] vals = line.split(",");
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
if( vals.length != requestedCovariates.size() + 4 ) { // +4 because of ErrorModel, nObservations, nMismatch, and Qempirical
throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
}
@ -172,39 +164,63 @@ public class BaseRecalibration {
cov = requestedCovariates.get( iii );
key[iii] = cov.getValue( vals[iii] );
}
final String modelString = vals[iii++];
final RecalDataManager.BaseRecalibrationType errorModel = ( modelString.equals(CovariateKeySet.mismatchesCovariateName) ? RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION :
( modelString.equals(CovariateKeySet.insertionsCovariateName) ? RecalDataManager.BaseRecalibrationType.BASE_INSERTION :
( modelString.equals(CovariateKeySet.deletionsCovariateName) ? RecalDataManager.BaseRecalibrationType.BASE_DELETION : null ) ) );
// Create a new datum using the number of observations, number of mismatches, and reported quality score
final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
// Add that datum to all the collapsed tables which will be used in the sequential calculation
dataManager.addToAllTables( key, datum, QualityUtils.MIN_USABLE_Q_SCORE ); //BUGBUG: used to be Q5 now is Q6, probably doesn't matter
dataManager.addToAllTables( key, datum, QualityUtils.MIN_USABLE_Q_SCORE, errorModel ); //BUGBUG: used to be Q5 now is Q6, probably doesn't matter
}
public byte[] recalibrateRead( final GATKSAMRecord read, final byte[] originalQuals, final BaseRecalibrationType modelType ) {
public void recalibrateRead( final GATKSAMRecord read ) {
final byte[] recalQuals = originalQuals.clone();
//compute all covariate values for this read
final Comparable[][] covariateValues_offset_x_covar =
RecalDataManager.computeCovariates(read, requestedCovariates, modelType);
// For each base in the read
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
if(qualityScore == null)
{
qualityScore = performSequentialQualityCalculation( fullCovariateKey );
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
}
recalQuals[offset] = qualityScore;
}
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
RecalDataManager.computeCovariates(read, requestedCovariates);
final CovariateKeySet covariateKeySet = RecalDataManager.getAllCovariateValuesFor( read );
for( final RecalDataManager.BaseRecalibrationType errorModel : RecalDataManager.BaseRecalibrationType.values() ) {
final byte[] originalQuals = ( errorModel == RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION ? read.getBaseQualities() :
( errorModel == RecalDataManager.BaseRecalibrationType.BASE_INSERTION ? read.getBaseDeletionQualities() :
( errorModel == RecalDataManager.BaseRecalibrationType.BASE_DELETION ? read.getBaseDeletionQualities() : null ) ) );
final byte[] recalQuals = originalQuals.clone();
// For each base in the read
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
return recalQuals;
final Object[] fullCovariateKey =
( errorModel == RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION ? covariateKeySet.getMismatchesKeySet(offset) :
( errorModel == RecalDataManager.BaseRecalibrationType.BASE_INSERTION ? covariateKeySet.getInsertionsKeySet(offset) :
( errorModel == RecalDataManager.BaseRecalibrationType.BASE_DELETION ? covariateKeySet.getDeletionsKeySet(offset) : null ) ) );
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
if( qualityScore == null ) {
qualityScore = performSequentialQualityCalculation( errorModel, fullCovariateKey );
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
}
recalQuals[offset] = qualityScore;
}
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
switch (errorModel) {
case BASE_SUBSTITUTION:
read.setBaseQualities( recalQuals );
break;
case BASE_INSERTION:
read.setAttribute( GATKSAMRecord.BQSR_BASE_INSERTION_QUALITIES, recalQuals );
break;
case BASE_DELETION:
read.setAttribute( GATKSAMRecord.BQSR_BASE_DELETION_QUALITIES, recalQuals );
break;
default:
throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel );
}
}
}
/**
@ -222,7 +238,7 @@ public class BaseRecalibration {
* @param key The list of Comparables that were calculated from the covariates
* @return A recalibrated quality score as a byte
*/
private byte performSequentialQualityCalculation( final Object... key ) {
private byte performSequentialQualityCalculation( final RecalDataManager.BaseRecalibrationType errorModel, final Object... key ) {
final byte qualFromRead = (byte)Integer.parseInt(key[1].toString());
final Object[] readGroupCollapsedKey = new Object[1];
@ -231,7 +247,7 @@ public class BaseRecalibration {
// The global quality shift (over the read group only)
readGroupCollapsedKey[0] = key[0];
final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey ));
final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0, errorModel).get( readGroupCollapsedKey ));
double globalDeltaQ = 0.0;
if( globalRecalDatum != null ) {
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
@ -242,7 +258,7 @@ public class BaseRecalibration {
// The shift in quality between reported and empirical
qualityScoreCollapsedKey[0] = key[0];
qualityScoreCollapsedKey[1] = key[1];
final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey ));
final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1, errorModel).get( qualityScoreCollapsedKey ));
double deltaQReported = 0.0;
if( qReportedRecalDatum != null ) {
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
@ -256,7 +272,7 @@ public class BaseRecalibration {
covariateCollapsedKey[1] = key[1];
for( int iii = 2; iii < key.length; iii++ ) {
covariateCollapsedKey[2] = key[iii]; // The given covariate
final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey ));
final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii, errorModel).get( covariateCollapsedKey ));
if( covariateRecalDatum != null ) {
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
@ -265,18 +281,6 @@ public class BaseRecalibration {
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE );
// Verbose printouts used to validate with old recalibrator
//if(key.contains(null)) {
// System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d",
// qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte));
//}
//else {
// System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d",
// key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) );
//}
//return newQualityByte;
}
/**
@ -291,5 +295,4 @@ public class BaseRecalibration {
}
}
}
}

View File

@ -54,7 +54,6 @@ public class GATKSAMRecord extends BAMRecord {
// Base Quality Score Recalibrator specific attribute tags
public static final String BQSR_BASE_INSERTION_QUALITIES = "BI";
public static final String BQSR_BASE_DELETION_QUALITIES = "BD";
public static final String BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG = "BR";
// the SAMRecord data we're caching
private String mReadString = null;
@ -163,27 +162,6 @@ public class GATKSAMRecord extends BAMRecord {
return super.equals(o);
}
@Override
public byte[] getBaseQualities() {
return super.getBaseQualities();
/*
if( getAttribute( BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG ) != null ) {
return super.getBaseQualities();
} else {
// if the recal data was populated in the engine then recalibrate the quality scores on the fly
if( GenomeAnalysisEngine.hasBaseRecalibration() ) {
final byte[] quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, super.getBaseQualities() );
setBaseQualities(quals);
setAttribute( BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG, true );
return quals;
} else { // just use the qualities that are in the read since we don't have the sufficient information to recalibrate on the fly
return super.getBaseQualities();
}
}
*/
}
/**
* Accessors for base insertion and base deletion quality scores
*/
@ -191,13 +169,8 @@ public class GATKSAMRecord extends BAMRecord {
byte[] quals = getByteArrayAttribute( BQSR_BASE_INSERTION_QUALITIES );
if( quals == null ) {
quals = new byte[getBaseQualities().length];
Arrays.fill(quals, (byte) 45); // allow for differing default values between BaseInsertions and BaseDeletions
// if the recal data was populated in the engine then recalibrate the quality scores on the fly
// else give default values which are flat Q45
if( GenomeAnalysisEngine.hasBaseRecalibration() ) {
quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, quals, BaseRecalibration.BaseRecalibrationType.BASE_INSERTION ); // the original quals here are the flat base insertion/deletion quals, NOT the original base qualities
}
// add the qual array to the read so that we don't have to do the recalibration work again
Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will
// be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
setAttribute( BQSR_BASE_INSERTION_QUALITIES, quals );
}
return quals;
@ -207,13 +180,8 @@ public class GATKSAMRecord extends BAMRecord {
byte[] quals = getByteArrayAttribute( BQSR_BASE_DELETION_QUALITIES );
if( quals == null ) {
quals = new byte[getBaseQualities().length];
Arrays.fill(quals, (byte) 45);
// if the recal data was populated in the engine then recalibrate the quality scores on the fly
// else give default values which are flat Q45
if( GenomeAnalysisEngine.hasBaseRecalibration() ) {
quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, quals, BaseRecalibration.BaseRecalibrationType.BASE_DELETION ); // the original quals here are the flat base insertion/deletion quals, NOT the original base qualities
}
// add the qual array to the read so that we don't have to do the recalibration work again
Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will
// be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
setAttribute( BQSR_BASE_DELETION_QUALITIES, quals );
}
return quals;