Recalibrator makes use of the new GATKSAMRecord wrapper and now no longer has to hash the SAMRecord. Covariate's getValue method signature has changed to take the SAMRecord instead of the ReadHashDatum. ReadHashDatum removed completely.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2185 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-11-30 19:59:17 +00:00
parent e9a8156cfb
commit 817e2cb8c5
14 changed files with 134 additions and 227 deletions

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -37,7 +39,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
public interface Covariate {
public void initialize( RecalibrationArgumentCollection RAC ); // Initialize any member variables using the command-line arguments passed to the walkers
public Comparable getValue( ReadHashDatum readDatum, int offset ); // Used to pick out the covariate's value from attributes of the read
public Comparable getValue( SAMRecord read, int offset ); // Used to pick out the covariate's value from attributes of the read
public Comparable getValue( String str ); // Used to get the covariate's value from input csv file in TableRecalibrationWalker
public int estimatedNumberOfBins(); // Used to estimate the amount space required for the full data HashMap
}

View File

@ -96,13 +96,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
/////////////////////////////
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
private ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
private IdentityHashMap<SAMRecord, ReadHashDatum> readDatumHashMap; // A hash map that hashes the read object itself into properties commonly pulled out of the read. Done for optimization purposes.
private int sizeOfReadDatumHashMap = 0;
private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
private static final String versionString = "v2.0.8"; // Major version, minor version, and build number
private static final String versionString = "v2.0.9"; // Major version, minor version, and build number
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
@ -228,7 +226,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
estimatedCapacity = 300 * 40 * 200;
}
dataManager = new RecalDataManager( estimatedCapacity );
readDatumHashMap = new IdentityHashMap<SAMRecord, ReadHashDatum>();
}
@ -273,7 +270,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
byte refBase;
byte prevBase;
byte[] colorSpaceQuals;
ReadHashDatum readDatum;
final int numReads = reads.size();
// For each read at this locus
@ -281,49 +277,35 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
read = reads.get(iii);
offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct
// Try to pull the read out of the read IdentityHashMap
readDatum = readDatumHashMap.get( read );
if( readDatum == null ) {
// If the HashMap of read objects has grown too large then throw out the (mostly stale) reads
if( sizeOfReadDatumHashMap > 100000 ) { //BUGBUG: Can I make this number larger?
readDatumHashMap.clear();
sizeOfReadDatumHashMap = 0;
}
// This read isn't in the hashMap yet so fill out the datum and add it to the map so that we never have to do the work again
readDatum = ReadHashDatum.parseSAMRecord( read, RAC );
readDatumHashMap.put( read, readDatum );
sizeOfReadDatumHashMap++;
}
RecalDataManager.parseSAMRecord(read, RAC);
// Skip first and last base because there is no dinuc
// BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests.
if( offset > 0 ) {
if( offset < readDatum.length - 1 ) {
if( offset < read.getReadLength() - 1 ) {
// Skip if base quality is zero
if( readDatum.quals[offset] > 0 ) {
if( read.getBaseQualities()[offset] > 0 ) {
refBase = (byte)ref.getBase();
prevBase = readDatum.bases[offset - 1];
prevBase = read.getReadBases()[offset - 1];
// DinucCovariate is responsible for getting the complement bases if needed
if( readDatum.isNegStrand ) {
prevBase = readDatum.bases[offset + 1];
if( read.getReadNegativeStrandFlag() ) {
prevBase = read.getReadBases()[offset + 1];
}
// Skip if this base or the previous one was an 'N' or etc.
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(readDatum.bases[offset]) ) ) {
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(read.getReadBases()[offset]) ) ) {
// SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them
colorSpaceQuals = null;
if( readDatum.platform.equalsIgnoreCase("SOLID") ) {
if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) {
colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
}
if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet
{
// This base finally passed all the checks for a good base, so add it to the big data hashmap
updateDataFromRead( readDatum, offset, refBase );
updateDataFromRead( read, offset, refBase );
}
} else {
if( RAC.VALIDATE_OLD_RECALIBRATOR ) {
@ -331,9 +313,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
}
}
}
} else { // At the last base in the read so we can remove the read from our IdentityHashMap since we will never see it again
readDatumHashMap.remove( read );
sizeOfReadDatumHashMap--;
}
}
}
@ -405,17 +384,17 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
* adding one to the number of observations and potentially one to the number of mismatches
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
* because pulling things out of the SAMRecord is an expensive operation.
* @param readDatum The ReadHashDatum holding all the important properties of this read
* @param read The SAMRecord holding all the data for this read
* @param offset The offset in the read for this locus
* @param refBase The reference base at this locus
*/
private void updateDataFromRead(final ReadHashDatum readDatum, final int offset, final byte refBase) {
private void updateDataFromRead(final SAMRecord read, final int offset, final byte refBase) {
List<Comparable> key = new ArrayList<Comparable>();
// Loop through the list of requested covariates and pick out the value from the read, offset, and reference
for( Covariate covariate : requestedCovariates ) {
key.add( covariate.getValue( readDatum, offset ) );
key.add( covariate.getValue( read, offset ) );
}
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
@ -430,7 +409,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
}
// Need the bases to determine whether or not we have a mismatch
byte base = readDatum.bases[offset];
byte base = read.getReadBases()[offset];
// Add one to the number of observations and potentially one to the number of mismatches
datum.increment( (char)base, (char)refBase ); // Dangerous: If you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.StingException;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
@ -56,16 +57,16 @@ public class CycleCovariate implements Covariate {
}
// Used to pick out the covariate's value from attributes of the read
public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) {
public final Comparable getValue( final SAMRecord read, final int offset ) {
//-----------------------------
// ILLUMINA
//-----------------------------
if( readDatum.platform.equalsIgnoreCase( "ILLUMINA" ) ) {
if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) ) {
int cycle = offset;
if( readDatum.isNegStrand ) {
cycle = readDatum.bases.length - (offset + 1);
if( read.getReadNegativeStrandFlag() ) {
cycle = read.getReadLength() - (offset + 1);
}
return cycle;
}
@ -74,14 +75,14 @@ public class CycleCovariate implements Covariate {
// 454
//-----------------------------
else if( readDatum.platform.contains( "454" ) ) { // Some bams have "LS454" and others have just "454"
else if( read.getReadGroup().getPlatform().contains( "454" ) ) { // Some bams have "LS454" and others have just "454"
int cycle = 0;
//BUGBUG: should reverse directions on negative strand reads!
byte prevBase = readDatum.bases[0];
byte prevBase = read.getReadBases()[0];
for( int iii = 1; iii <= offset; iii++ ) {
if(readDatum.bases[iii] != prevBase) { // This base doesn't match the previous one so it is a new cycle
if(read.getReadBases()[iii] != prevBase) { // This base doesn't match the previous one so it is a new cycle
cycle++;
prevBase = readDatum.bases[iii];
prevBase = read.getReadBases()[iii];
}
}
return cycle;
@ -91,7 +92,7 @@ public class CycleCovariate implements Covariate {
// SOLID
//-----------------------------
else if( readDatum.platform.equalsIgnoreCase( "SOLID" ) ) {
else if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) {
// The ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
//BUGBUG: should reverse directions on negative strand reads!
return offset / 5; // integer division
@ -104,18 +105,19 @@ public class CycleCovariate implements Covariate {
else { // Platform is unrecognized so revert to the default platform but warn the user first
if( !warnedUserBadPlatform ) {
if( defaultPlatform != null) { // the user set a default platform
Utils.warnUser( "Platform string (" + readDatum.platform + ") unrecognized in CycleCovariate. " +
Utils.warnUser( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " +
"Reverting to " + defaultPlatform + " definition of machine cycle." );
} else { // the user did not set a default platform
Utils.warnUser( "Platform string (" + readDatum.platform + ") unrecognized in CycleCovariate. " +
Utils.warnUser( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " +
"Reverting to Illumina definition of machine cycle. Users may set the default platform using the --default_platform <String> argument." );
defaultPlatform = "Illumina";
}
warnedUserBadPlatform = true;
}
ReadHashDatum correctedReadDatum = new ReadHashDatum( readDatum );
correctedReadDatum.platform = defaultPlatform;
return getValue( correctedReadDatum, offset ); // a recursive call
//ReadHashDatum correctedReadDatum = new ReadHashDatum( readDatum );
//correctedReadDatum.platform = defaultPlatform;
return 0; // BUGBUG: broken at the moment
//return getValue( correctedReadDatum, offset ); // a recursive call
}
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
import java.util.HashMap;
import org.broadinstitute.sting.utils.BaseUtils;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
@ -55,17 +56,17 @@ public class DinucCovariate implements Covariate {
}
// Used to pick out the covariate's value from attributes of the read
public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) {
public final Comparable getValue( final SAMRecord read, final int offset ) {
byte base;
byte prevBase;
// If this is a negative strand read then we need to reverse the direction for our previous base
if( readDatum.isNegStrand ) {
base = (byte)BaseUtils.simpleComplement( (char)readDatum.bases[offset] );
prevBase = (byte)BaseUtils.simpleComplement( (char)readDatum.bases[offset + 1] );
if( read.getReadNegativeStrandFlag() ) {
base = (byte)BaseUtils.simpleComplement( (char)(read.getReadBases()[offset]) );
prevBase = (byte)BaseUtils.simpleComplement( (char)(read.getReadBases()[offset + 1]) );
} else {
base = readDatum.bases[offset];
prevBase = readDatum.bases[offset - 1];
base = read.getReadBases()[offset];
prevBase = read.getReadBases()[offset - 1];
}
//char[] charArray = {(char)prevBase, (char)base};
//return new String( charArray ); // This is an expensive call

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -40,9 +42,9 @@ public class MappingQualityCovariate implements Covariate {
}
// Used to pick out the covariate's value from attributes of the read
public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) {
public final Comparable getValue( final SAMRecord read, final int offset ) {
return readDatum.mappingQuality;
return read.getMappingQuality();
}
// Used to get the covariate's value from input csv file in TableRecalibrationWalker

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -44,15 +46,15 @@ public class MinimumNQSCovariate implements Covariate {
}
// Used to pick out the covariate's value from attributes of the read
public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) {
public final Comparable getValue( final SAMRecord read, final int offset ) {
// Loop over the list of base quality scores in the window and find the minimum
int minQual = readDatum.quals[offset];
int minQual = read.getBaseQualities()[offset];
int minIndex = Math.max(offset - windowReach, 0);
int maxIndex = Math.min(offset + windowReach, readDatum.quals.length - 1);
int maxIndex = Math.min(offset + windowReach, read.getBaseQualities().length - 1);
for ( int iii = minIndex; iii < maxIndex; iii++ ) {
if( readDatum.quals[iii] < minQual ) {
minQual = readDatum.quals[iii];
if( read.getBaseQualities()[iii] < minQual ) {
minQual = read.getBaseQualities()[iii];
}
}
return minQual;

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -41,10 +43,10 @@ public class PositionCovariate implements Covariate {
}
// Used to pick out the covariate's value from attributes of the read
public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) {
public final Comparable getValue( final SAMRecord read, final int offset ) {
int cycle = offset;
if( readDatum.isNegStrand ) {
cycle = readDatum.bases.length - (offset + 1);
if( read.getReadNegativeStrandFlag() ) {
cycle = read.getReadLength() - (offset + 1);
}
return cycle;
}

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -42,8 +44,8 @@ public class PrimerRoundCovariate implements Covariate {
}
// Used to pick out the covariate's value from attributes of the read
public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) {
if( readDatum.platform.equalsIgnoreCase( "SOLID" ) ) {
public final Comparable getValue( final SAMRecord read, final int offset ) {
if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) {
return offset % 5; // the primer round according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
} else {
return 1; // nothing to do here because it is always the same

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -40,9 +42,9 @@ public class QualityScoreCovariate implements Covariate {
}
// Used to pick out the covariate's value from attributes of the read
public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) {
public final Comparable getValue( final SAMRecord read, final int offset ) {
return (int)(readDatum.quals[offset]);
return (int)(read.getBaseQualities()[offset]);
}
// Used to get the covariate's value from input csv file in TableRecalibrationWalker

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -42,8 +44,8 @@ public class ReadGroupCovariate implements Covariate{
}
// Used to pick out the covariate's value from attributes of the read
public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) {
return readDatum.readGroup;
public final Comparable getValue( final SAMRecord read, final int offset ) {
return read.getReadGroup().getReadGroupId();
}
// Used to get the covariate's value from input csv file in TableRecalibrationWalker

View File

@ -1,133 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import edu.mit.broad.picard.illumina.parser.IlluminaUtil;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Nov 14, 2009
*/
public class ReadHashDatum {
public String readGroup;
public String platform;
public byte[] quals;
public byte[] bases;
public boolean isNegStrand;
public int mappingQuality;
public int length;
public Integer tile;
private static boolean warnUserNullReadGroup = false; // Has the walker warned the user about null read groups yet?
public ReadHashDatum(String _readGroup, String _platform, byte[] _quals, byte[] _bases, boolean _isNegStrand,
int _mappingQuality, int _length, Integer _tile) {
readGroup = _readGroup;
platform = _platform;
quals = _quals;
bases = _bases;
isNegStrand = _isNegStrand;
mappingQuality = _mappingQuality;
length = _length;
tile = _tile;
}
public ReadHashDatum(ReadHashDatum that) {
this.readGroup = that.readGroup;
this.platform = that.platform;
this.quals = that.quals;
this.bases = that.bases;
this.isNegStrand = that.isNegStrand;
this.mappingQuality = that.mappingQuality;
this.length = that.length;
this.tile = that.tile;
}
/**
* Pulls the important bits out of a SAMRecord and stuffs them into a leaner Object to be used by the Covariates and cached by the Recalibration walkers
* @param read The SAMRecord whose data we want to get at
* @param RAC The collection of arguments that are shared between CovariateCounterWalker and TableRecalibrationWalker
* @return A new ReadHashDatum holding all the important parts of the read
*/
public static ReadHashDatum parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC ) {
// Lots of lines just pulling things out of the SAMRecord and stuffing into local variables, many edge cases to worry about
byte[] quals = read.getBaseQualities();
// Check if we need to use the original quality scores instead
if ( RAC.USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
if ( obj instanceof String )
quals = QualityUtils.fastqToPhred((String)obj);
else {
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
}
}
final byte[] bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'.
final boolean isNegStrand = read.getReadNegativeStrandFlag();
final SAMReadGroupRecord readGroup = read.getReadGroup();
String readGroupId;
String platform;
// If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments
if( readGroup == null ) {
if( !warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null ) {
Utils.warnUser("The input .bam file contains reads with no read group. " +
"Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " +
"First observed at read with name = " + read.getReadName() );
warnUserNullReadGroup = true;
}
// There is no readGroup so defaulting to these values
readGroupId = RAC.DEFAULT_READ_GROUP;
platform = RAC.DEFAULT_PLATFORM;
} else {
readGroupId = readGroup.getReadGroupId();
platform = readGroup.getPlatform();
}
final int mappingQuality = read.getMappingQuality();
final int length = bases.length;
if( RAC.FORCE_READ_GROUP != null ) { // Collapse all the read groups into a single common String provided by the user
readGroupId = RAC.FORCE_READ_GROUP;
}
if( RAC.FORCE_PLATFORM != null ) {
platform = RAC.FORCE_PLATFORM;
}
Integer tile = IlluminaUtil.getTileFromReadName(read.getReadName());
return new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length, tile );
}
}

View File

@ -1,9 +1,14 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.*;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMReadGroupRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -51,6 +56,7 @@ public class RecalDataManager {
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
private static boolean warnUserNullReadGroup = false;
RecalDataManager() {
data = new NHashMap<RecalDatum>();
@ -212,4 +218,49 @@ public class RecalDataManager {
return dataCollapsedByCovariateDouble.get( covariate - 2 ); // Table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
}
}
/**
* Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string
* @param read The read to adjust
* @param RAC The list of shared command line arguments
*/
public static void parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC) {
// Check if we need to use the original quality scores instead
if ( RAC.USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
if ( obj instanceof String )
read.setBaseQualities( QualityUtils.fastqToPhred((String)obj) );
else {
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
}
}
SAMReadGroupRecord readGroup = read.getReadGroup();
// If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments
if( readGroup == null ) {
if( !warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null ) {
Utils.warnUser("The input .bam file contains reads with no read group. " +
"Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " +
"First observed at read with name = " + read.getReadName() );
warnUserNullReadGroup = true;
}
// There is no readGroup so defaulting to these values
readGroup = new SAMReadGroupRecord( RAC.DEFAULT_READ_GROUP );
readGroup.setPlatform( RAC.DEFAULT_PLATFORM );
((GATKSAMRecord)read).setReadGroup( readGroup );
}
if( RAC.FORCE_READ_GROUP != null && !read.getReadGroup().getReadGroupId().equals(RAC.FORCE_READ_GROUP) ) { // Collapse all the read groups into a single common String provided by the user
String oldPlatform = readGroup.getPlatform();
readGroup = new SAMReadGroupRecord( RAC.FORCE_READ_GROUP );
readGroup.setPlatform( oldPlatform );
((GATKSAMRecord)read).setReadGroup( readGroup );
}
if( RAC.FORCE_PLATFORM != null && !read.getReadGroup().getPlatform().equals(RAC.FORCE_PLATFORM)) {
read.getReadGroup().setPlatform( RAC.FORCE_PLATFORM );
}
}
}

View File

@ -93,7 +93,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
private static final String versionString = "v2.0.6"; // Major version, minor version, and build number
private static final String versionString = "v2.0.7"; // Major version, minor version, and build number
private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
//---------------------------------------------------------------------------------------------------------------
@ -301,27 +301,17 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// WARNING: refBases is always null because this walker doesn't have @Requires({DataSource.REFERENCE_BASES})
// This is done in order to speed up the code
RecalDataManager.parseSAMRecord(read, RAC);
byte[] originalQuals = read.getBaseQualities();
// Check if we need to use the original quality scores instead
if ( RAC.USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
if ( obj instanceof String )
originalQuals = QualityUtils.fastqToPhred((String)obj);
else {
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
}
}
byte[] recalQuals = originalQuals.clone();
// Fill out the readDatum with the useful information from the read
ReadHashDatum readDatum = ReadHashDatum.parseSAMRecord( read, RAC );
int startPos = 1;
int stopPos = readDatum.length;
int stopPos = read.getReadLength();
if( readDatum.isNegStrand ) { // DinucCovariate is responsible for getting the complement base if needed
if( read.getReadNegativeStrandFlag() ) { // DinucCovariate is responsible for getting the complement base if needed
startPos = 0;
stopPos = readDatum.length - 1;
stopPos = read.getReadLength() - 1;
}
// For each base in the read
@ -329,7 +319,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// Get the covariate values which make up the key
for( Covariate covariate : requestedCovariates ) {
fullCovariateKey.add( covariate.getValue( readDatum, iii ) ); // offset is zero based so passing iii is correct here
fullCovariateKey.add( covariate.getValue( read, iii ) ); // offset is zero based so passing iii is correct here
}
recalQuals[iii] = performSequentialQualityCalculation( fullCovariateKey );
@ -339,7 +329,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
// SOLID bams insert the reference base into the read if the color space quality is zero, so don't change their base quality scores
if( readDatum.platform.equalsIgnoreCase("SOLID") ) {
if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) {
byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
if(colorSpaceQuals != null) { preserveBadColorSpaceQualities_SOLID( originalQuals, recalQuals, colorSpaceQuals ); }
}

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.StingException;
import net.sf.samtools.SAMRecord;
import edu.mit.broad.picard.illumina.parser.IlluminaUtil;
/**
* @author alecw@broadinstitute.org
@ -35,11 +37,12 @@ public class TileCovariate implements Covariate {
}
// Used to pick out the covariate's value from attributes of the read
public Comparable getValue(final ReadHashDatum readDatum, final int offset) {
if (readDatum.tile == null) {
public Comparable getValue(final SAMRecord read, final int offset) {
Integer tile = IlluminaUtil.getTileFromReadName(read.getReadName());
if (tile == null) {
throw new StingException("Tile number not defined for read");
}
return readDatum.tile;
return tile;
}
// Used to get the covariate's value from input csv file in TableRecalibrationWalker