From 817e2cb8c5a0b52f00466542bf811be46e78f00a Mon Sep 17 00:00:00 2001 From: rpoplin Date: Mon, 30 Nov 2009 19:59:17 +0000 Subject: [PATCH] Recalibrator makes use of the new GATKSAMRecord wrapper and now no longer has to hash the SAMRecord. Covariate's getValue method signature has changed to take the SAMRecord instead of the ReadHashDatum. ReadHashDatum removed completely. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2185 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/recalibration/Covariate.java | 4 +- .../recalibration/CovariateCounterWalker.java | 49 ++----- .../walkers/recalibration/CycleCovariate.java | 30 ++-- .../walkers/recalibration/DinucCovariate.java | 13 +- .../MappingQualityCovariate.java | 6 +- .../recalibration/MinimumNQSCovariate.java | 12 +- .../recalibration/PositionCovariate.java | 8 +- .../recalibration/PrimerRoundCovariate.java | 6 +- .../recalibration/QualityScoreCovariate.java | 6 +- .../recalibration/ReadGroupCovariate.java | 6 +- .../walkers/recalibration/ReadHashDatum.java | 133 ------------------ .../recalibration/RecalDataManager.java | 51 +++++++ .../TableRecalibrationWalker.java | 28 ++-- .../walkers/recalibration/TileCovariate.java | 9 +- 14 files changed, 134 insertions(+), 227 deletions(-) delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadHashDatum.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java index 717bc7c64..cc298ee78 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; +import net.sf.samtools.SAMRecord; + /* * Copyright (c) 2009 The Broad Institute * @@ -37,7 +39,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; public interface Covariate { public void initialize( RecalibrationArgumentCollection RAC ); // Initialize any member variables using the command-line arguments passed to the walkers - public Comparable getValue( ReadHashDatum readDatum, int offset ); // Used to pick out the covariate's value from attributes of the read + public Comparable getValue( SAMRecord read, int offset ); // Used to pick out the covariate's value from attributes of the read public Comparable getValue( String str ); // Used to get the covariate's value from input csv file in TableRecalibrationWalker public int estimatedNumberOfBins(); // Used to estimate the amount space required for the full data HashMap } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index d8fba4d30..0770c358e 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -96,13 +96,11 @@ public class CovariateCounterWalker extends LocusWalker { ///////////////////////////// private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps private ArrayList requestedCovariates; // A list to hold the covariate objects that were requested - private IdentityHashMap readDatumHashMap; // A hash map that hashes the read object itself into properties commonly pulled out of the read. Done for optimization purposes. - private int sizeOfReadDatumHashMap = 0; private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site - private static final String versionString = "v2.0.8"; // Major version, minor version, and build number + private static final String versionString = "v2.0.9"; // Major version, minor version, and build number private Pair dbSNP_counts = new Pair(0L, 0L); // mismatch/base counts for dbSNP loci private Pair novel_counts = new Pair(0L, 0L); // mismatch/base counts for non-dbSNP loci private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) @@ -228,7 +226,6 @@ public class CovariateCounterWalker extends LocusWalker { estimatedCapacity = 300 * 40 * 200; } dataManager = new RecalDataManager( estimatedCapacity ); - readDatumHashMap = new IdentityHashMap(); } @@ -273,7 +270,6 @@ public class CovariateCounterWalker extends LocusWalker { byte refBase; byte prevBase; byte[] colorSpaceQuals; - ReadHashDatum readDatum; final int numReads = reads.size(); // For each read at this locus @@ -281,49 +277,35 @@ public class CovariateCounterWalker extends LocusWalker { read = reads.get(iii); offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct - // Try to pull the read out of the read IdentityHashMap - readDatum = readDatumHashMap.get( read ); - if( readDatum == null ) { - - // If the HashMap of read objects has grown too large then throw out the (mostly stale) reads - if( sizeOfReadDatumHashMap > 100000 ) { //BUGBUG: Can I make this number larger? - readDatumHashMap.clear(); - sizeOfReadDatumHashMap = 0; - } - - // This read isn't in the hashMap yet so fill out the datum and add it to the map so that we never have to do the work again - readDatum = ReadHashDatum.parseSAMRecord( read, RAC ); - readDatumHashMap.put( read, readDatum ); - sizeOfReadDatumHashMap++; - } + RecalDataManager.parseSAMRecord(read, RAC); // Skip first and last base because there is no dinuc // BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests. if( offset > 0 ) { - if( offset < readDatum.length - 1 ) { + if( offset < read.getReadLength() - 1 ) { // Skip if base quality is zero - if( readDatum.quals[offset] > 0 ) { + if( read.getBaseQualities()[offset] > 0 ) { refBase = (byte)ref.getBase(); - prevBase = readDatum.bases[offset - 1]; + prevBase = read.getReadBases()[offset - 1]; // DinucCovariate is responsible for getting the complement bases if needed - if( readDatum.isNegStrand ) { - prevBase = readDatum.bases[offset + 1]; + if( read.getReadNegativeStrandFlag() ) { + prevBase = read.getReadBases()[offset + 1]; } // Skip if this base or the previous one was an 'N' or etc. - if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(readDatum.bases[offset]) ) ) { + if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(read.getReadBases()[offset]) ) ) { // SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them colorSpaceQuals = null; - if( readDatum.platform.equalsIgnoreCase("SOLID") ) { + if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) { colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG)); } if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet { // This base finally passed all the checks for a good base, so add it to the big data hashmap - updateDataFromRead( readDatum, offset, refBase ); + updateDataFromRead( read, offset, refBase ); } } else { if( RAC.VALIDATE_OLD_RECALIBRATOR ) { @@ -331,9 +313,6 @@ public class CovariateCounterWalker extends LocusWalker { } } } - } else { // At the last base in the read so we can remove the read from our IdentityHashMap since we will never see it again - readDatumHashMap.remove( read ); - sizeOfReadDatumHashMap--; } } } @@ -405,17 +384,17 @@ public class CovariateCounterWalker extends LocusWalker { * adding one to the number of observations and potentially one to the number of mismatches * Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls * because pulling things out of the SAMRecord is an expensive operation. - * @param readDatum The ReadHashDatum holding all the important properties of this read + * @param read The SAMRecord holding all the data for this read * @param offset The offset in the read for this locus * @param refBase The reference base at this locus */ - private void updateDataFromRead(final ReadHashDatum readDatum, final int offset, final byte refBase) { + private void updateDataFromRead(final SAMRecord read, final int offset, final byte refBase) { List key = new ArrayList(); // Loop through the list of requested covariates and pick out the value from the read, offset, and reference for( Covariate covariate : requestedCovariates ) { - key.add( covariate.getValue( readDatum, offset ) ); + key.add( covariate.getValue( read, offset ) ); } // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap @@ -430,7 +409,7 @@ public class CovariateCounterWalker extends LocusWalker { } // Need the bases to determine whether or not we have a mismatch - byte base = readDatum.bases[offset]; + byte base = read.getReadBases()[offset]; // Add one to the number of observations and potentially one to the number of mismatches datum.increment( (char)base, (char)refBase ); // Dangerous: If you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index d359bf41d..cc78e91d2 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.StingException; +import net.sf.samtools.SAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -56,16 +57,16 @@ public class CycleCovariate implements Covariate { } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) { + public final Comparable getValue( final SAMRecord read, final int offset ) { //----------------------------- // ILLUMINA //----------------------------- - if( readDatum.platform.equalsIgnoreCase( "ILLUMINA" ) ) { + if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) ) { int cycle = offset; - if( readDatum.isNegStrand ) { - cycle = readDatum.bases.length - (offset + 1); + if( read.getReadNegativeStrandFlag() ) { + cycle = read.getReadLength() - (offset + 1); } return cycle; } @@ -74,14 +75,14 @@ public class CycleCovariate implements Covariate { // 454 //----------------------------- - else if( readDatum.platform.contains( "454" ) ) { // Some bams have "LS454" and others have just "454" + else if( read.getReadGroup().getPlatform().contains( "454" ) ) { // Some bams have "LS454" and others have just "454" int cycle = 0; //BUGBUG: should reverse directions on negative strand reads! - byte prevBase = readDatum.bases[0]; + byte prevBase = read.getReadBases()[0]; for( int iii = 1; iii <= offset; iii++ ) { - if(readDatum.bases[iii] != prevBase) { // This base doesn't match the previous one so it is a new cycle + if(read.getReadBases()[iii] != prevBase) { // This base doesn't match the previous one so it is a new cycle cycle++; - prevBase = readDatum.bases[iii]; + prevBase = read.getReadBases()[iii]; } } return cycle; @@ -91,7 +92,7 @@ public class CycleCovariate implements Covariate { // SOLID //----------------------------- - else if( readDatum.platform.equalsIgnoreCase( "SOLID" ) ) { + else if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) { // The ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf //BUGBUG: should reverse directions on negative strand reads! return offset / 5; // integer division @@ -104,18 +105,19 @@ public class CycleCovariate implements Covariate { else { // Platform is unrecognized so revert to the default platform but warn the user first if( !warnedUserBadPlatform ) { if( defaultPlatform != null) { // the user set a default platform - Utils.warnUser( "Platform string (" + readDatum.platform + ") unrecognized in CycleCovariate. " + + Utils.warnUser( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " + "Reverting to " + defaultPlatform + " definition of machine cycle." ); } else { // the user did not set a default platform - Utils.warnUser( "Platform string (" + readDatum.platform + ") unrecognized in CycleCovariate. " + + Utils.warnUser( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " + "Reverting to Illumina definition of machine cycle. Users may set the default platform using the --default_platform argument." ); defaultPlatform = "Illumina"; } warnedUserBadPlatform = true; } - ReadHashDatum correctedReadDatum = new ReadHashDatum( readDatum ); - correctedReadDatum.platform = defaultPlatform; - return getValue( correctedReadDatum, offset ); // a recursive call + //ReadHashDatum correctedReadDatum = new ReadHashDatum( readDatum ); + //correctedReadDatum.platform = defaultPlatform; + return 0; // BUGBUG: broken at the moment + //return getValue( correctedReadDatum, offset ); // a recursive call } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java index 636f5eb43..d6371b6c1 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import java.util.HashMap; import org.broadinstitute.sting.utils.BaseUtils; +import net.sf.samtools.SAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -55,17 +56,17 @@ public class DinucCovariate implements Covariate { } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) { + public final Comparable getValue( final SAMRecord read, final int offset ) { byte base; byte prevBase; // If this is a negative strand read then we need to reverse the direction for our previous base - if( readDatum.isNegStrand ) { - base = (byte)BaseUtils.simpleComplement( (char)readDatum.bases[offset] ); - prevBase = (byte)BaseUtils.simpleComplement( (char)readDatum.bases[offset + 1] ); + if( read.getReadNegativeStrandFlag() ) { + base = (byte)BaseUtils.simpleComplement( (char)(read.getReadBases()[offset]) ); + prevBase = (byte)BaseUtils.simpleComplement( (char)(read.getReadBases()[offset + 1]) ); } else { - base = readDatum.bases[offset]; - prevBase = readDatum.bases[offset - 1]; + base = read.getReadBases()[offset]; + prevBase = read.getReadBases()[offset - 1]; } //char[] charArray = {(char)prevBase, (char)base}; //return new String( charArray ); // This is an expensive call diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java index ae6fc552f..882785dde 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; +import net.sf.samtools.SAMRecord; + /* * Copyright (c) 2009 The Broad Institute * @@ -40,9 +42,9 @@ public class MappingQualityCovariate implements Covariate { } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) { + public final Comparable getValue( final SAMRecord read, final int offset ) { - return readDatum.mappingQuality; + return read.getMappingQuality(); } // Used to get the covariate's value from input csv file in TableRecalibrationWalker diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java index d0f9699f1..ef7855c39 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; +import net.sf.samtools.SAMRecord; + /* * Copyright (c) 2009 The Broad Institute * @@ -44,15 +46,15 @@ public class MinimumNQSCovariate implements Covariate { } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) { + public final Comparable getValue( final SAMRecord read, final int offset ) { // Loop over the list of base quality scores in the window and find the minimum - int minQual = readDatum.quals[offset]; + int minQual = read.getBaseQualities()[offset]; int minIndex = Math.max(offset - windowReach, 0); - int maxIndex = Math.min(offset + windowReach, readDatum.quals.length - 1); + int maxIndex = Math.min(offset + windowReach, read.getBaseQualities().length - 1); for ( int iii = minIndex; iii < maxIndex; iii++ ) { - if( readDatum.quals[iii] < minQual ) { - minQual = readDatum.quals[iii]; + if( read.getBaseQualities()[iii] < minQual ) { + minQual = read.getBaseQualities()[iii]; } } return minQual; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java index 6f4d6f787..17301460a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; +import net.sf.samtools.SAMRecord; + /* * Copyright (c) 2009 The Broad Institute * @@ -41,10 +43,10 @@ public class PositionCovariate implements Covariate { } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) { + public final Comparable getValue( final SAMRecord read, final int offset ) { int cycle = offset; - if( readDatum.isNegStrand ) { - cycle = readDatum.bases.length - (offset + 1); + if( read.getReadNegativeStrandFlag() ) { + cycle = read.getReadLength() - (offset + 1); } return cycle; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java index 5445deb18..329ba75e6 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; +import net.sf.samtools.SAMRecord; + /* * Copyright (c) 2009 The Broad Institute * @@ -42,8 +44,8 @@ public class PrimerRoundCovariate implements Covariate { } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) { - if( readDatum.platform.equalsIgnoreCase( "SOLID" ) ) { + public final Comparable getValue( final SAMRecord read, final int offset ) { + if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) { return offset % 5; // the primer round according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf } else { return 1; // nothing to do here because it is always the same diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java index ab083fefc..0b4a9e285 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; +import net.sf.samtools.SAMRecord; + /* * Copyright (c) 2009 The Broad Institute * @@ -40,9 +42,9 @@ public class QualityScoreCovariate implements Covariate { } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) { + public final Comparable getValue( final SAMRecord read, final int offset ) { - return (int)(readDatum.quals[offset]); + return (int)(read.getBaseQualities()[offset]); } // Used to get the covariate's value from input csv file in TableRecalibrationWalker diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java index 400571864..fa847bcd1 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; +import net.sf.samtools.SAMRecord; + /* * Copyright (c) 2009 The Broad Institute * @@ -42,8 +44,8 @@ public class ReadGroupCovariate implements Covariate{ } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) { - return readDatum.readGroup; + public final Comparable getValue( final SAMRecord read, final int offset ) { + return read.getReadGroup().getReadGroupId(); } // Used to get the covariate's value from input csv file in TableRecalibrationWalker diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadHashDatum.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadHashDatum.java deleted file mode 100755 index 3af1339c1..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadHashDatum.java +++ /dev/null @@ -1,133 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.recalibration; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMReadGroupRecord; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.Utils; -import edu.mit.broad.picard.illumina.parser.IlluminaUtil; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 14, 2009 - */ - -public class ReadHashDatum { - public String readGroup; - public String platform; - public byte[] quals; - public byte[] bases; - public boolean isNegStrand; - public int mappingQuality; - public int length; - public Integer tile; - private static boolean warnUserNullReadGroup = false; // Has the walker warned the user about null read groups yet? - - public ReadHashDatum(String _readGroup, String _platform, byte[] _quals, byte[] _bases, boolean _isNegStrand, - int _mappingQuality, int _length, Integer _tile) { - readGroup = _readGroup; - platform = _platform; - quals = _quals; - bases = _bases; - isNegStrand = _isNegStrand; - mappingQuality = _mappingQuality; - length = _length; - tile = _tile; - } - - public ReadHashDatum(ReadHashDatum that) { - this.readGroup = that.readGroup; - this.platform = that.platform; - this.quals = that.quals; - this.bases = that.bases; - this.isNegStrand = that.isNegStrand; - this.mappingQuality = that.mappingQuality; - this.length = that.length; - this.tile = that.tile; - } - - /** - * Pulls the important bits out of a SAMRecord and stuffs them into a leaner Object to be used by the Covariates and cached by the Recalibration walkers - * @param read The SAMRecord whose data we want to get at - * @param RAC The collection of arguments that are shared between CovariateCounterWalker and TableRecalibrationWalker - * @return A new ReadHashDatum holding all the important parts of the read - */ - public static ReadHashDatum parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC ) { - - // Lots of lines just pulling things out of the SAMRecord and stuffing into local variables, many edge cases to worry about - byte[] quals = read.getBaseQualities(); - - // Check if we need to use the original quality scores instead - if ( RAC.USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { - Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); - if ( obj instanceof String ) - quals = QualityUtils.fastqToPhred((String)obj); - else { - throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); - } - } - - final byte[] bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'. - final boolean isNegStrand = read.getReadNegativeStrandFlag(); - final SAMReadGroupRecord readGroup = read.getReadGroup(); - String readGroupId; - String platform; - - // If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments - if( readGroup == null ) { - if( !warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null ) { - Utils.warnUser("The input .bam file contains reads with no read group. " + - "Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " + - "First observed at read with name = " + read.getReadName() ); - warnUserNullReadGroup = true; - } - // There is no readGroup so defaulting to these values - readGroupId = RAC.DEFAULT_READ_GROUP; - platform = RAC.DEFAULT_PLATFORM; - } else { - readGroupId = readGroup.getReadGroupId(); - platform = readGroup.getPlatform(); - } - - final int mappingQuality = read.getMappingQuality(); - final int length = bases.length; - - if( RAC.FORCE_READ_GROUP != null ) { // Collapse all the read groups into a single common String provided by the user - readGroupId = RAC.FORCE_READ_GROUP; - } - - if( RAC.FORCE_PLATFORM != null ) { - platform = RAC.FORCE_PLATFORM; - } - - Integer tile = IlluminaUtil.getTileFromReadName(read.getReadName()); - - return new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length, tile ); - - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index f8ac343f6..a1a69dd61 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -1,9 +1,14 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMReadGroupRecord; + /* * Copyright (c) 2009 The Broad Institute * @@ -51,6 +56,7 @@ public class RecalDataManager { public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams + private static boolean warnUserNullReadGroup = false; RecalDataManager() { data = new NHashMap(); @@ -212,4 +218,49 @@ public class RecalDataManager { return dataCollapsedByCovariateDouble.get( covariate - 2 ); // Table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed } } + + /** + * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string + * @param read The read to adjust + * @param RAC The list of shared command line arguments + */ + public static void parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC) { + + // Check if we need to use the original quality scores instead + if ( RAC.USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { + Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); + if ( obj instanceof String ) + read.setBaseQualities( QualityUtils.fastqToPhred((String)obj) ); + else { + throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); + } + } + + SAMReadGroupRecord readGroup = read.getReadGroup(); + + // If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments + if( readGroup == null ) { + if( !warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null ) { + Utils.warnUser("The input .bam file contains reads with no read group. " + + "Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " + + "First observed at read with name = " + read.getReadName() ); + warnUserNullReadGroup = true; + } + // There is no readGroup so defaulting to these values + readGroup = new SAMReadGroupRecord( RAC.DEFAULT_READ_GROUP ); + readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); + ((GATKSAMRecord)read).setReadGroup( readGroup ); + } + + if( RAC.FORCE_READ_GROUP != null && !read.getReadGroup().getReadGroupId().equals(RAC.FORCE_READ_GROUP) ) { // Collapse all the read groups into a single common String provided by the user + String oldPlatform = readGroup.getPlatform(); + readGroup = new SAMReadGroupRecord( RAC.FORCE_READ_GROUP ); + readGroup.setPlatform( oldPlatform ); + ((GATKSAMRecord)read).setReadGroup( readGroup ); + } + + if( RAC.FORCE_PLATFORM != null && !read.getReadGroup().getPlatform().equals(RAC.FORCE_PLATFORM)) { + read.getReadGroup().setPlatform( RAC.FORCE_PLATFORM ); + } + } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 68f9815bf..399412ae5 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -93,7 +93,7 @@ public class TableRecalibrationWalker extends ReadWalker