diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java index a9a63c400..809ab7e9a 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java @@ -41,5 +41,5 @@ public interface Covariate { public Comparable getValue(SAMRecord read, int offset, String readGroup, byte[] quals, char[] bases, char refBase); //used to pick out the value from attributes of the read public Comparable getValue(String str); // used to get value from input file - public int estimatedNumberOfBins(); // used to estimate the amount space required in the HashMap + public int estimatedNumberOfBins(); // used to estimate the amount space required for the HashMap } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java index eaa0dcbef..fb7274604 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java @@ -6,10 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.PackageUtils; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.*; import java.io.PrintStream; import java.io.FileNotFoundException; @@ -70,9 +67,11 @@ public class CovariateCounterWalker extends LocusWalker { @Argument(fullName="min_mapping_quality", shortName="minmap", required=false, doc="Only use reads with at least this mapping quality score") private int MIN_MAPPING_QUALITY = 1; @Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false) - private boolean USE_ORIGINAL_QUALS = false; // BUGBUG need to pass this value to the proper covariates + private boolean USE_ORIGINAL_QUALS = false; @Argument(fullName = "platform", shortName="pl", doc="Which sequencing technology was used? This is important for the cycle covariate. Options are SLX, 454, and SOLID.", required=false) private String PLATFORM = "SLX"; + @Argument(fullName = "windowSizeNQS", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false) + private int WINDOW_SIZE = 1; @Argument(fullName="recal_file", shortName="rf", required=false, doc="Filename for the outputted covariates table recalibration file") private String RECAL_FILE = "output.recal_data.csv"; @Argument(fullName="noPrintHeader", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file") @@ -83,7 +82,7 @@ public class CovariateCounterWalker extends LocusWalker { private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps private ArrayList requestedCovariates; // A list to hold the covariate objects that were requested //private HashMap readGroupHashMap; // A hash map that hashes the read object itself into the read group name - // This is done for optimization purposes because pulling the read group out of the SAMRecord is expensive + // This is done for optimization purposes because pulling the read group out of the SAMRecord is expensive private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file @@ -119,7 +118,7 @@ public class CovariateCounterWalker extends LocusWalker { requestedCovariates = new ArrayList(); if( validateOldRecalibrator ) { requestedCovariates.add( new ReadGroupCovariate() ); - requestedCovariates.add( new CycleCovariate() ); // unfortunately the ordering here is different to match the output of the old recalibrator + requestedCovariates.add( new CycleCovariate( PLATFORM ) ); // unfortunately the ordering here is different to match the output of the old recalibrator requestedCovariates.add( new QualityScoreCovariate() ); requestedCovariates.add( new DinucCovariate() ); //estimatedCapacity = 300 * 200 * 40 * 16; @@ -139,6 +138,10 @@ public class CovariateCounterWalker extends LocusWalker { try { // Now that we've found a matching class, try to instantiate it Covariate covariate = (Covariate)covClass.newInstance(); + // some covariates need parameters (user supplied command line arguments) passed to them + if( covariate instanceof CycleCovariate ) { covariate = new CycleCovariate( PLATFORM ); } + else if( covariate instanceof PrimerRoundCovariate ) { covariate = new PrimerRoundCovariate( PLATFORM ); } + else if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); } requestedCovariates.add( covariate ); //estimatedCapacity *= covariate.estimatedNumberOfBins(); // update the estimated initial capacity } catch ( InstantiationException e ) { @@ -157,7 +160,7 @@ public class CovariateCounterWalker extends LocusWalker { logger.info( "Note: Using default set of covariates because none were specified." ); requestedCovariates.add( new ReadGroupCovariate() ); requestedCovariates.add( new QualityScoreCovariate() ); - requestedCovariates.add( new CycleCovariate() ); + requestedCovariates.add( new CycleCovariate( PLATFORM ) ); requestedCovariates.add( new DinucCovariate() ); //estimatedCapacity = 300 * 40 * 200 * 16; } @@ -222,6 +225,16 @@ public class CovariateCounterWalker extends LocusWalker { if( offset > 0 && offset < read.getReadLength() - 1) { quals = read.getBaseQualities(); + // Check if we need to use the original quality scores instead + if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { + Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); + if ( obj instanceof String ) + quals = QualityUtils.fastqToPhred((String)obj); + else { + throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); + } + } + // skip if base quality is zero if( quals[offset] > 0 ) { bases = read.getReadString().toCharArray(); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java index 492bbd557..7562a03db 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java @@ -36,11 +36,10 @@ import net.sf.samtools.SAMRecord; * * The Cycle covariate. * For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read) - * Yet to be implemented: - * - For 454 the cycle is the number of discontinuous nucleotides seen during the length of the read + * For 454 the cycle is the number of discontinuous nucleotides seen during the length of the read * For example, for the read: AAACCCCGAAATTTTTTT * the cycle would be 111222234445555555 - * - For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round + * For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round */ public class CycleCovariate implements Covariate { @@ -64,9 +63,18 @@ public class CycleCovariate implements Covariate { } return cycle; } else if( platform.equalsIgnoreCase( "454" ) ) { - return 0; //BUGBUG: not yet implemented + int cycle = 1; + char prevBase = bases[0]; + for( int iii = 1; iii <= offset; iii++ ) { + if(bases[iii] != prevBase) { // this base doesn't match the previous one so it is a new cycle + cycle++; + prevBase = bases[iii]; + } + } + return cycle; } else if( platform.equalsIgnoreCase( "SOLID" ) ) { - return 0; //BUGBUG: not yet implemented + // the ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf + return (offset / 5) + 1; // integer division } else { throw new StingException( "Requested platform (" + platform + ") not supported in CycleCovariate." ); } @@ -74,7 +82,7 @@ public class CycleCovariate implements Covariate { } public final Comparable getValue(final String str) { - return (int)Integer.parseInt( str ); + return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing } public final int estimatedNumberOfBins() { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java index 8d71dd831..340330bdb 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java @@ -43,11 +43,11 @@ public class MappingQualityCovariate implements Covariate { public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final byte[] quals, final char[] bases, final char refBase) { - return (Integer)(read.getMappingQuality()); // cast to Object Integer is required so that return values from the two getValue methods hash to the same code + return read.getMappingQuality(); } public final Comparable getValue(final String str) { - return Integer.parseInt( str ); + return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing } public final int estimatedNumberOfBins() { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java index f34cecfbc..67a0f9f7c 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java @@ -34,48 +34,38 @@ import org.broadinstitute.sting.utils.QualityUtils; * Date: Nov 4, 2009 * * The Minimum Neighborhood Quality Score covariate, originally described by Chris Hartl. - * This covariate is the minimum base quality score along the length of the read. - * BUGBUG: I don't think this is what was intended by Chris. Covariate interface must change to implement the real covariate. + * This covariate is the minimum base quality score in the read in a small window around the current base. */ public class MinimumNQSCovariate implements Covariate { - public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; - private boolean USE_ORIGINAL_QUALS; + private int windowReach; // how far in each direction from the current base to look public MinimumNQSCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker - USE_ORIGINAL_QUALS = false; + windowReach = 1; // window size = 3 was the best covariate according to Chris's analysis } - public MinimumNQSCovariate(final boolean originalQuals) { - USE_ORIGINAL_QUALS = originalQuals; + public MinimumNQSCovariate(final int windowSize) { + windowReach = windowSize / 2; // integer division } public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final byte[] quals, final char[] bases, final char refBase) { - //BUGBUG: Original qualities - //if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { - // Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG); - // if ( obj instanceof String ) - // quals = QualityUtils.fastqToPhred((String)obj); - // else { - // throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); - // } - //} - - // Loop over the list of qualities and find the minimum - Integer minQual = (int)(quals[0]); - for ( int qual : quals ) { - if( qual < minQual ) { - minQual = qual; + // Loop over the list of base quality scores in the window and find the minimum + int minQual = quals[offset]; + int minIndex = Math.max(offset - windowReach, 0); + int maxIndex = Math.min(offset + windowReach, quals.length - 1); + for ( int iii = minIndex; iii < maxIndex; iii++ ) { + if( quals[iii] < minQual ) { + minQual = quals[iii]; } } return minQual; } public final Comparable getValue(final String str) { - return Integer.parseInt( str ); + return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing } public final int estimatedNumberOfBins() { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/PrimerRoundCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/PrimerRoundCovariate.java new file mode 100755 index 000000000..da5ac7345 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/PrimerRoundCovariate.java @@ -0,0 +1,79 @@ +package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; + +import org.broadinstitute.sting.utils.StingException; + +import net.sf.samtools.SAMRecord; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 13, 2009 + * + * The Primer Round covariate. + * For Solexa and 454 this is the same value of the length of the read. + * For SOLiD this is different for each position according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf + */ + +public class PrimerRoundCovariate implements Covariate { + + private String platform; + + public PrimerRoundCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker + platform = "SLX"; + } + + public PrimerRoundCovariate(final String _platform) { + platform = _platform; + } + + public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, + final byte[] quals, final char[] bases, final char refBase) { + if( platform.equalsIgnoreCase( "SLX" ) ) { + return 1; // nothing to do here because it is always the same + } else if( platform.equalsIgnoreCase( "454" ) ) { + return 1; // nothing to do here because it is always the same + } else if( platform.equalsIgnoreCase( "SOLID" ) ) { + return offset % 5; // the primer round according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf + } else { + throw new StingException( "Requested platform (" + platform + ") not supported in PrimerRoundCovariate." ); + } + + } + + public final Comparable getValue(final String str) { + return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing + } + + public final int estimatedNumberOfBins() { + return 5; + } + + public String toString() { + return "Primer Round"; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java index 9e0f4fcfe..6a44e3d53 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java @@ -38,36 +38,17 @@ import org.broadinstitute.sting.utils.QualityUtils; public class QualityScoreCovariate implements Covariate { - public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; - protected boolean USE_ORIGINAL_QUALS; - public QualityScoreCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker - USE_ORIGINAL_QUALS = false; } - public QualityScoreCovariate(final boolean originalQuals) { - USE_ORIGINAL_QUALS = originalQuals; - } - - public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, + public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final byte[] quals, final char[] bases, final char refBase) { - //BUGBUG: original qualities - //if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { - // Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG); - // if ( obj instanceof String ) - // quals = QualityUtils.fastqToPhred((String)obj); - // else { - // throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); - // } - //} - - //Integer returnQual = (int)quals[offset]; - return (int)quals[offset]; // returning an Integer Object (as opposed to primitive int) is required so that return values from the two getValue methods hash to the same code + return (int)quals[offset]; } public final Comparable getValue(final String str) { - return (int)Integer.parseInt( str ); + return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing } public final int estimatedNumberOfBins() { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java index ca1ed97b5..3a315eec5 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java @@ -44,6 +44,8 @@ public class RecalDataManager { private boolean collapsedTablesCreated; public NHashMap dataSumExpectedErrors; + public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; + RecalDataManager() { data = new NHashMap(); collapsedTablesCreated = false; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java index 932819a0d..fd8a5b13c 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java @@ -63,7 +63,11 @@ public class TableRecalibrationWalker extends ReadWalker(); @@ -128,8 +131,12 @@ public class TableRecalibrationWalker extends ReadWalker