From 6e6f0f10e1073ed4adf6aa05631f3a65c08503ee Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 6 Feb 2012 12:31:20 -0500 Subject: [PATCH] BaseQualityScoreRecalibration walker (bqsr v2) first commit includes * Adding the context covariate standard in both modes (including old CountCovariates) with parameters * Updating all covariates and modules to use GATKSAMRecord throughout the code. * BQSR now processes indels in the pileup (but doesn't do anything with them yet) --- .../recalibration/ContextCovariate.java | 27 +- .../gatk/walkers/recalibration/Covariate.java | 25 +- .../walkers/recalibration/CycleCovariate.java | 134 +++-- .../walkers/recalibration/DinucCovariate.java | 57 +- .../recalibration/GCContentCovariate.java | 33 +- .../recalibration/HomopolymerCovariate.java | 24 +- .../MappingQualityCovariate.java | 16 +- .../recalibration/MinimumNQSCovariate.java | 17 +- .../recalibration/PositionCovariate.java | 15 +- .../recalibration/PrimerRoundCovariate.java | 24 +- .../recalibration/QualityScoreCovariate.java | 19 +- .../recalibration/ReadGroupCovariate.java | 12 +- .../recalibration/RecalDataManager.java | 537 ++++++++++-------- .../RecalibrationArgumentCollection.java | 25 +- 14 files changed, 547 insertions(+), 418 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ContextCovariate.java index 0edd5d03b..875782fdc 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ContextCovariate.java @@ -25,8 +25,9 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Arrays; @@ -38,28 +39,32 @@ import java.util.Arrays; public class ContextCovariate implements ExperimentalCovariate { - final int CONTEXT_SIZE = 8; - String allN = ""; + private int CONTEXT_SIZE; + private String allN = ""; // Initialize any member variables using the command-line arguments passed to the walkers @Override - public void initialize( final RecalibrationArgumentCollection RAC ) { - for( int iii = 0; iii < CONTEXT_SIZE; iii++ ) { + public void initialize(final RecalibrationArgumentCollection RAC) { + CONTEXT_SIZE = RAC.CONTEXT_SIZE; + + if (CONTEXT_SIZE <= 0) + throw new UserException("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead"); + + // initialize allN given the size of the context + for (int i = 0; i < CONTEXT_SIZE; i++) allN += "N"; - } } @Override - public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { byte[] bases = read.getReadBases(); - for(int i = 0; i < read.getReadLength(); i++) { - comparable[i] = ( i-CONTEXT_SIZE < 0 ? allN : new String(Arrays.copyOfRange(bases,i-CONTEXT_SIZE,i)) ); - } + for (int i = 0; i < read.getReadLength(); i++) + comparable[i] = (i < CONTEXT_SIZE) ? allN : new String(Arrays.copyOfRange(bases, i - CONTEXT_SIZE, i)); } // Used to get the covariate's value from input csv file in TableRecalibrationWalker @Override - public final Comparable getValue( final String str ) { + public final Comparable getValue(final String str) { return str; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java index 2e32dbb8c..e4edb8ca6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java @@ -1,7 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -39,19 +39,18 @@ import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; */ public interface Covariate { - public void initialize( RecalibrationArgumentCollection RAC ); // Initialize any member variables using the command-line arguments passed to the walkers - public Comparable getValue( String str ); // Used to get the covariate's value from input csv file in TableRecalibrationWalker - public void getValues( SAMRecord read, Comparable[] comparable, BaseRecalibration.BaseRecalibrationType modelType ); - //Takes an array of size (at least) read.getReadLength() and fills it with covariate - //values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows - //read-specific calculations to be done just once rather than for each offset. + public void initialize(RecalibrationArgumentCollection RAC); // Initialize any member variables using the command-line arguments passed to the walkers + + public Comparable getValue(String str); // Used to get the covariate's value from input csv file in TableRecalibrationWalker + + public void getValues(GATKSAMRecord read, Comparable[] comparable, BaseRecalibration.BaseRecalibrationType modelType); + //Takes an array of size (at least) read.getReadLength() and fills it with covariate + //values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows + //read-specific calculations to be done just once rather than for each offset. } -interface RequiredCovariate extends Covariate { -} +interface RequiredCovariate extends Covariate {} -interface StandardCovariate extends Covariate { -} +interface StandardCovariate extends Covariate {} -interface ExperimentalCovariate extends Covariate { -} +interface ExperimentalCovariate extends Covariate {} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index 00490d898..4244af7d1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -40,69 +39,69 @@ import java.util.EnumSet; * Date: Oct 30, 2009 * * The Cycle covariate. - * For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read) - * For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle - * For example, for the read: AAACCCCGAAATTTTTACTG - * the cycle would be 11111111222333333344 - * For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round + * For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read) + * For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle + * For example, for the read: AAACCCCGAAATTTTTACTG + * the cycle would be 11111111222333333344 + * For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round */ public class CycleCovariate implements StandardCovariate { private final static EnumSet DISCRETE_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.ILLUMINA, NGSPlatform.SOLID, NGSPlatform.PACBIO, NGSPlatform.COMPLETE_GENOMICS); - private final static EnumSet FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT); + private final static EnumSet FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT); // Initialize any member variables using the command-line arguments passed to the walkers @Override - public void initialize( final RecalibrationArgumentCollection RAC ) { - if( RAC.DEFAULT_PLATFORM != null ) { - if( RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "SLX" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "ILLUMINA" ) || - RAC.DEFAULT_PLATFORM.contains( "454" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "SOLID" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "ABI_SOLID" ) ) { + public void initialize(final RecalibrationArgumentCollection RAC) { + if (RAC.DEFAULT_PLATFORM != null) { + if (RAC.DEFAULT_PLATFORM.equalsIgnoreCase("SLX") || RAC.DEFAULT_PLATFORM.equalsIgnoreCase("ILLUMINA") || + RAC.DEFAULT_PLATFORM.contains("454") || RAC.DEFAULT_PLATFORM.equalsIgnoreCase("SOLID") || RAC.DEFAULT_PLATFORM.equalsIgnoreCase("ABI_SOLID")) { // nothing to do - } else { - throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM +") is not a recognized platform. Implemented options are illumina, 454, and solid"); + } + else { + throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM + ") is not a recognized platform. Implemented options are illumina, 454, and solid"); } } } // Used to pick out the covariate's value from attributes of the read @Override - public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { //----------------------------- // Illumina, Solid, PacBio, and Complete Genomics //----------------------------- - final NGSPlatform ngsPlatform = ((GATKSAMRecord)read).getNGSPlatform(); - if( DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform) ) { + final NGSPlatform ngsPlatform = read.getNGSPlatform(); + if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) { final int init; final int increment; - if( !read.getReadNegativeStrandFlag() ) { + if (!read.getReadNegativeStrandFlag()) { // Differentiate between first and second of pair. // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. // Therefore the cycle covariate must differentiate between first and second of pair reads. // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because // the current sequential model would consider the effects independently instead of jointly. - if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { + if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) { //second of pair, positive strand init = -1; increment = -1; } - else - { + else { //first of pair, positive strand init = 1; increment = 1; } - } else { - if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { + } + else { + if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) { //second of pair, negative strand init = -read.getReadLength(); increment = 1; } - else - { + else { //first of pair, negative strand init = read.getReadLength(); increment = -1; @@ -110,7 +109,7 @@ public class CycleCovariate implements StandardCovariate { } int cycle = init; - for(int i = 0; i < read.getReadLength(); i++) { + for (int i = 0; i < read.getReadLength(); i++) { comparable[i] = cycle; cycle += increment; } @@ -119,7 +118,7 @@ public class CycleCovariate implements StandardCovariate { //----------------------------- // 454 and Ion Torrent //----------------------------- - else if( FLOW_CYCLE_PLATFORMS.contains(ngsPlatform) ) { + else if (FLOW_CYCLE_PLATFORMS.contains(ngsPlatform)) { final int readLength = read.getReadLength(); final byte[] bases = read.getReadBases(); @@ -136,39 +135,78 @@ public class CycleCovariate implements StandardCovariate { // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change // For example, AAAAAAA was probably read in two flow cycles but here we count it as one - if( !read.getReadNegativeStrandFlag() ) { // Forward direction + if (!read.getReadNegativeStrandFlag()) { // Forward direction int iii = 0; - while( iii < readLength ) - { - while( iii < readLength && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii++; } - while( iii < readLength && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii++; } - while( iii < readLength && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii++; } - while( iii < readLength && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii++; } - if( iii < readLength ) { if (multiplyByNegative1) cycle--; else cycle++; } - if( iii < readLength && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii++; } + while (iii < readLength) { + while (iii < readLength && bases[iii] == (byte) 'T') { + comparable[iii] = cycle; + iii++; + } + while (iii < readLength && bases[iii] == (byte) 'A') { + comparable[iii] = cycle; + iii++; + } + while (iii < readLength && bases[iii] == (byte) 'C') { + comparable[iii] = cycle; + iii++; + } + while (iii < readLength && bases[iii] == (byte) 'G') { + comparable[iii] = cycle; + iii++; + } + if (iii < readLength) { + if (multiplyByNegative1) + cycle--; + else + cycle++; + } + if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) { + comparable[iii] = cycle; + iii++; + } } - } else { // Negative direction - int iii = readLength-1; - while( iii >= 0 ) - { - while( iii >= 0 && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii--; } - while( iii >= 0 && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii--; } - while( iii >= 0 && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii--; } - while( iii >= 0 && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii--; } - if( iii >= 0 ) { if (multiplyByNegative1) cycle--; else cycle++; } - if( iii >= 0 && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii--; } + } + else { // Negative direction + int iii = readLength - 1; + while (iii >= 0) { + while (iii >= 0 && bases[iii] == (byte) 'T') { + comparable[iii] = cycle; + iii--; + } + while (iii >= 0 && bases[iii] == (byte) 'A') { + comparable[iii] = cycle; + iii--; + } + while (iii >= 0 && bases[iii] == (byte) 'C') { + comparable[iii] = cycle; + iii--; + } + while (iii >= 0 && bases[iii] == (byte) 'G') { + comparable[iii] = cycle; + iii--; + } + if (iii >= 0) { + if (multiplyByNegative1) + cycle--; + else + cycle++; + } + if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) { + comparable[iii] = cycle; + iii--; + } } } } - else { + else { throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid"); } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker @Override - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); + public final Comparable getValue(final String str) { + return Integer.parseInt(str); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java index e60b1f795..2fa1b33ca 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java @@ -1,8 +1,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.HashMap; @@ -43,30 +43,30 @@ import java.util.HashMap; public class DinucCovariate implements StandardCovariate { - private static final byte NO_CALL = (byte)'N'; + private static final byte NO_CALL = (byte) 'N'; private static final Dinuc NO_DINUC = new Dinuc(NO_CALL, NO_CALL); private HashMap dinucHashMap; // Initialize any member variables using the command-line arguments passed to the walkers @Override - public void initialize( final RecalibrationArgumentCollection RAC ) { - final byte[] BASES = { (byte)'A', (byte)'C', (byte)'G', (byte)'T' }; + public void initialize(final RecalibrationArgumentCollection RAC) { + final byte[] BASES = {(byte) 'A', (byte) 'C', (byte) 'G', (byte) 'T'}; dinucHashMap = new HashMap(); - for( byte byte1 : BASES ) { - for( byte byte2: BASES ) { - dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); // This might seem silly, but Strings are too slow + for (byte byte1 : BASES) { + for (byte byte2 : BASES) { + dinucHashMap.put(Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2)); // This might seem silly, but Strings are too slow } } // Add the "no dinuc" entry too - dinucHashMap.put( Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC ); + dinucHashMap.put(Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC); } /** * Takes an array of size (at least) read.getReadLength() and fills it with the covariate values for each position in the read. */ @Override - public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { final HashMap dinucHashMapRef = this.dinucHashMap; //optimize access to dinucHashMap final int readLength = read.getReadLength(); final boolean negativeStrand = read.getReadNegativeStrandFlag(); @@ -76,37 +76,38 @@ public class DinucCovariate implements StandardCovariate { int offset = 0; // If this is a negative strand read then we need to reverse the direction for our previous base - if(negativeStrand) { + if (negativeStrand) { bases = BaseUtils.simpleReverseComplement(bases); //this is NOT in-place } comparable[0] = NO_DINUC; // No dinuc at the beginning of the read prevBase = bases[0]; offset++; - while(offset < readLength) { - // Note: We are using the previous base in the read, not the - // previous base in the reference. This is done in part to be consistent with unmapped reads. - base = bases[offset]; - if( BaseUtils.isRegularBase( prevBase ) ) { - comparable[offset] = dinucHashMapRef.get( Dinuc.hashBytes( prevBase, base ) ); - } else { - comparable[offset] = NO_DINUC; - } + while (offset < readLength) { + // Note: We are using the previous base in the read, not the + // previous base in the reference. This is done in part to be consistent with unmapped reads. + base = bases[offset]; + if (BaseUtils.isRegularBase(prevBase)) { + comparable[offset] = dinucHashMapRef.get(Dinuc.hashBytes(prevBase, base)); + } + else { + comparable[offset] = NO_DINUC; + } - offset++; - prevBase = base; + offset++; + prevBase = base; } - if(negativeStrand) { - reverse( comparable ); + if (negativeStrand) { + reverse(comparable); } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker @Override - public final Comparable getValue( final String str ) { + public final Comparable getValue(final String str) { byte[] bytes = str.getBytes(); - final Dinuc returnDinuc = dinucHashMap.get( Dinuc.hashBytes( bytes[0], bytes[1] ) ); - if( returnDinuc.compareTo(NO_DINUC) == 0 ) { + final Dinuc returnDinuc = dinucHashMap.get(Dinuc.hashBytes(bytes[0], bytes[1])); + if (returnDinuc.compareTo(NO_DINUC) == 0) { return null; } return returnDinuc; @@ -115,11 +116,11 @@ public class DinucCovariate implements StandardCovariate { /** * Reverses the given array in place. * - * @param array + * @param array any array */ private static void reverse(final Comparable[] array) { final int arrayLength = array.length; - for(int l = 0, r = arrayLength - 1; l < r; l++, r--) { + for (int l = 0, r = arrayLength - 1; l < r; l++, r--) { final Comparable temp = array[l]; array[l] = array[r]; array[r] = temp; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java index e4ff415fe..7b209ae5c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2010 The Broad Institute @@ -39,55 +40,57 @@ import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; public class GCContentCovariate implements ExperimentalCovariate { - int numBack = 7; + private int numBack = 7; // Initialize any member variables using the command-line arguments passed to the walkers @Override - public void initialize( final RecalibrationArgumentCollection RAC ) { + public void initialize(final RecalibrationArgumentCollection RAC) { numBack = RAC.HOMOPOLYMER_NBACK; } // Used to pick out the covariate's value from attributes of the read - private final Comparable getValue( final SAMRecord read, final int offset ) { + private Comparable getValue(final SAMRecord read, final int offset) { // ATTGCCCCGTAAAAAAAGAGAA // 0000123456654321001122 - if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SLX" ) ) { + if (read.getReadGroup().getPlatform().equalsIgnoreCase("ILLUMINA") || read.getReadGroup().getPlatform().equalsIgnoreCase("SLX")) { int numGC = 0; - int startPos = 0; - int stopPos = 0; + int startPos; + int stopPos; final byte[] bases = read.getReadBases(); - if( !read.getReadNegativeStrandFlag() ) { // Forward direction + if (!read.getReadNegativeStrandFlag()) { // Forward direction startPos = Math.max(offset - numBack, 0); stopPos = Math.max(offset - 1, 0); - } else { // Negative direction + } + else { // Negative direction startPos = Math.min(offset + 2, bases.length); stopPos = Math.min(offset + numBack + 1, bases.length); } - for( int iii = startPos; iii < stopPos; iii++ ) { - if( bases[iii] == (byte)'G' || bases[iii] == (byte)'C' ) { + for (int iii = startPos; iii < stopPos; iii++) { + if (bases[iii] == (byte) 'G' || bases[iii] == (byte) 'C') { numGC++; } } return numGC; - } else { // This effect is specific to the Illumina platform + } + else { // This effect is specific to the Illumina platform return -1; } } @Override - public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) { - for(int iii = 0; iii < read.getReadLength(); iii++) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { + for (int iii = 0; iii < read.getReadLength(); iii++) { comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker @Override - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); + public final Comparable getValue(final String str) { + return Integer.parseInt(str); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java index 24cb98a8d..fd67edc3b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -41,16 +42,16 @@ import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; public class HomopolymerCovariate implements ExperimentalCovariate { - int numBack = 7; + private int numBack; // Initialize any member variables using the command-line arguments passed to the walkers @Override - public void initialize( final RecalibrationArgumentCollection RAC ) { + public void initialize(final RecalibrationArgumentCollection RAC) { numBack = RAC.HOMOPOLYMER_NBACK; } // Used to pick out the covariate's value from attributes of the read - private final Comparable getValue( final SAMRecord read, final int offset ) { + private Comparable getValue(final SAMRecord read, final int offset) { // This block of code is for if you don't want to only count consecutive bases // ATTGCCCCGTAAAAAAAAATA @@ -77,13 +78,14 @@ public class HomopolymerCovariate implements ExperimentalCovariate { int numAgree = 0; // The number of consecutive bases that agree with you in the previous numBack bases of the read final byte[] bases = read.getReadBases(); int iii = offset; - if( !read.getReadNegativeStrandFlag() ) { // Forward direction - while( iii <= bases.length-2 && bases[iii] == bases[iii+1] && numAgree < numBack ) { + if (!read.getReadNegativeStrandFlag()) { // Forward direction + while (iii <= bases.length - 2 && bases[iii] == bases[iii + 1] && numAgree < numBack) { numAgree++; iii++; } - } else { // Negative direction - while( iii >= 1 && bases[iii] == bases[iii-1] && numAgree < numBack ) { + } + else { // Negative direction + while (iii >= 1 && bases[iii] == bases[iii - 1] && numAgree < numBack) { numAgree++; iii--; } @@ -93,15 +95,15 @@ public class HomopolymerCovariate implements ExperimentalCovariate { } @Override - public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) { - for(int iii = 0; iii < read.getReadLength(); iii++) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { + for (int iii = 0; iii < read.getReadLength(); iii++) { comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker @Override - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); + public final Comparable getValue(final String str) { + return Integer.parseInt(str); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java index ec5b357a4..e22049890 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java @@ -1,7 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -40,24 +40,24 @@ public class MappingQualityCovariate implements ExperimentalCovariate { // Initialize any member variables using the command-line arguments passed to the walkers @Override - public void initialize( final RecalibrationArgumentCollection RAC ) { + public void initialize(final RecalibrationArgumentCollection RAC) { } // Used to pick out the covariate's value from attributes of the read - private final Comparable getValue( final SAMRecord read, final int offset ) { + private Comparable getValue(final GATKSAMRecord read) { return read.getMappingQuality(); } // Used to get the covariate's value from input csv file in TableRecalibrationWalker @Override - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); + public final Comparable getValue(final String str) { + return Integer.parseInt(str); } @Override - public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) { - for(int iii = 0; iii < read.getReadLength(); iii++) { - comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized + public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { + for (int iii = 0; iii < read.getReadLength(); iii++) { + comparable[iii] = getValue(read); // BUGBUG: this can be optimized } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java index 21fd14e0c..1dfb915b9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -43,20 +44,20 @@ public class MinimumNQSCovariate implements ExperimentalCovariate { // Initialize any member variables using the command-line arguments passed to the walkers @Override - public void initialize( final RecalibrationArgumentCollection RAC ) { + public void initialize(final RecalibrationArgumentCollection RAC) { windowReach = RAC.WINDOW_SIZE / 2; // integer division } // Used to pick out the covariate's value from attributes of the read - private final Comparable getValue( final SAMRecord read, final int offset ) { + private Comparable getValue(final SAMRecord read, final int offset) { // Loop over the list of base quality scores in the window and find the minimum final byte[] quals = read.getBaseQualities(); int minQual = quals[offset]; final int minIndex = Math.max(offset - windowReach, 0); final int maxIndex = Math.min(offset + windowReach, quals.length - 1); - for ( int iii = minIndex; iii < maxIndex; iii++ ) { - if( quals[iii] < minQual ) { + for (int iii = minIndex; iii < maxIndex; iii++) { + if (quals[iii] < minQual) { minQual = quals[iii]; } } @@ -64,15 +65,15 @@ public class MinimumNQSCovariate implements ExperimentalCovariate { } @Override - public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) { - for(int iii = 0; iii < read.getReadLength(); iii++) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { + for (int iii = 0; iii < read.getReadLength(); iii++) { comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker @Override - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); + public final Comparable getValue(final String str) { + return Integer.parseInt(str); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java index 5c410ce5f..fbd1efc47 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -41,28 +42,28 @@ public class PositionCovariate implements ExperimentalCovariate { // Initialize any member variables using the command-line arguments passed to the walkers @Override - public void initialize( final RecalibrationArgumentCollection RAC ) { + public void initialize(final RecalibrationArgumentCollection RAC) { } // Used to pick out the covariate's value from attributes of the read - private final Comparable getValue( final SAMRecord read, final int offset ) { + private Comparable getValue(final SAMRecord read, final int offset) { int cycle = offset; - if( read.getReadNegativeStrandFlag() ) { + if (read.getReadNegativeStrandFlag()) { cycle = read.getReadLength() - (offset + 1); } return cycle; } @Override - public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) { - for(int iii = 0; iii < read.getReadLength(); iii++) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { + for (int iii = 0; iii < read.getReadLength(); iii++) { comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker @Override - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); + public final Comparable getValue(final String str) { + return Integer.parseInt(str); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java index e6aa44226..8dfa11884 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -34,41 +35,42 @@ import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; * Date: Nov 13, 2009 * * The Primer Round covariate. - * For Solexa and 454 this is the same value of the length of the read. - * For SOLiD this is different for each position according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf + * For Solexa and 454 this is the same value of the length of the read. + * For SOLiD this is different for each position according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf */ public class PrimerRoundCovariate implements ExperimentalCovariate { // Initialize any member variables using the command-line arguments passed to the walkers @Override - public void initialize( final RecalibrationArgumentCollection RAC ) { + public void initialize(final RecalibrationArgumentCollection RAC) { } // Used to pick out the covariate's value from attributes of the read - private final Comparable getValue( final SAMRecord read, final int offset ) { - if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "ABI_SOLID" ) ) { + private Comparable getValue(final SAMRecord read, final int offset) { + if (read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") || read.getReadGroup().getPlatform().equalsIgnoreCase("ABI_SOLID")) { int pos = offset; - if( read.getReadNegativeStrandFlag() ) { + if (read.getReadNegativeStrandFlag()) { pos = read.getReadLength() - (offset + 1); } return pos % 5; // the primer round according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf - } else { + } + else { return 1; // nothing to do here because it is always the same } } @Override - public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) { - for(int iii = 0; iii < read.getReadLength(); iii++) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { + for (int iii = 0; iii < read.getReadLength(); iii++) { comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker @Override - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); + public final Comparable getValue(final String str) { + return Integer.parseInt(str); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java index f85b52350..1ed4a6fe8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java @@ -1,7 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Arrays; @@ -42,25 +42,26 @@ public class QualityScoreCovariate implements RequiredCovariate { // Initialize any member variables using the command-line arguments passed to the walkers @Override - public void initialize( final RecalibrationArgumentCollection RAC ) { + public void initialize(final RecalibrationArgumentCollection RAC) { } @Override - public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) { - if( modelType == BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION ) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { + if (modelType == BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION) { byte[] baseQualities = read.getBaseQualities(); - for(int i = 0; i < read.getReadLength(); i++) { + for (int i = 0; i < read.getReadLength(); i++) { comparable[i] = (int) baseQualities[i]; } - } else { // model == BASE_INSERTION || model == BASE_DELETION + } + else { // model == BASE_INSERTION || model == BASE_DELETION Arrays.fill(comparable, 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will - // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 + // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker @Override - public final Comparable getValue( final String str ) { - return Integer.parseInt( str ); + public final Comparable getValue(final String str) { + return Integer.parseInt(str); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java index e27077128..27e1d8263 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java @@ -1,7 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -38,24 +38,22 @@ import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; public class ReadGroupCovariate implements RequiredCovariate { - public static final String defaultReadGroup = "DefaultReadGroup"; - // Initialize any member variables using the command-line arguments passed to the walkers @Override - public void initialize( final RecalibrationArgumentCollection RAC ) { + public void initialize(final RecalibrationArgumentCollection RAC) { } @Override - public void getValues( final SAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType ) { + public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) { final String readGroupId = read.getReadGroup().getReadGroupId(); - for(int i = 0; i < read.getReadLength(); i++) { + for (int i = 0; i < read.getReadLength(); i++) { comparable[i] = readGroupId; } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker @Override - public final Comparable getValue( final String str ) { + public final Comparable getValue(final String str) { return str; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index be02063de..18b33c0e8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMUtils; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.BaseUtils; @@ -67,22 +66,36 @@ public class RecalDataManager { private static boolean warnUserNullPlatform = false; public enum SOLID_RECAL_MODE { - /** Treat reference inserted bases as reference matching bases. Very unsafe! */ + /** + * Treat reference inserted bases as reference matching bases. Very unsafe! + */ DO_NOTHING, - /** Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option. */ + /** + * Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option. + */ SET_Q_ZERO, - /** In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV. */ + /** + * In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV. + */ SET_Q_ZERO_BASE_N, - /** Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. */ + /** + * Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. + */ REMOVE_REF_BIAS } public enum SOLID_NOCALL_STRATEGY { - /** When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option. */ + /** + * When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option. + */ THROW_EXCEPTION, - /** Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare. */ + /** + * Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare. + */ LEAVE_READ_UNRECALIBRATED, - /** Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. */ + /** + * Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. + */ PURGE_READ } @@ -93,16 +106,17 @@ public class RecalDataManager { dataCollapsedByCovariate = null; } - public RecalDataManager( final boolean createCollapsedTables, final int numCovariates ) { - if( createCollapsedTables ) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker + public RecalDataManager(final boolean createCollapsedTables, final int numCovariates) { + if (createCollapsedTables) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker data = null; dataCollapsedReadGroup = new NestedHashMap(); dataCollapsedQualityScore = new NestedHashMap(); dataCollapsedByCovariate = new ArrayList(); - for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate - dataCollapsedByCovariate.add( new NestedHashMap() ); + for (int iii = 0; iii < numCovariates - 2; iii++) { // readGroup and QualityScore aren't counted here, their tables are separate + dataCollapsedByCovariate.add(new NestedHashMap()); } - } else { + } + else { data = new NestedHashMap(); dataCollapsedReadGroup = null; dataCollapsedQualityScore = null; @@ -112,54 +126,58 @@ public class RecalDataManager { /** * Add the given mapping to all of the collapsed hash tables - * @param key The list of comparables that is the key for this mapping - * @param fullDatum The RecalDatum which is the data for this mapping + * + * @param key The list of comparables that is the key for this mapping + * @param fullDatum The RecalDatum which is the data for this mapping * @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table */ - public final void addToAllTables( final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN ) { + public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN) { // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around //data.put(key, thisDatum); // add the mapping to the main table - final int qualityScore = Integer.parseInt( key[1].toString() ); + final int qualityScore = Integer.parseInt(key[1].toString()); final Object[] readGroupCollapsedKey = new Object[1]; final Object[] qualityScoreCollapsedKey = new Object[2]; final Object[] covariateCollapsedKey = new Object[3]; RecalDatum collapsedDatum; // Create dataCollapsedReadGroup, the table where everything except read group has been collapsed - if( qualityScore >= PRESERVE_QSCORES_LESS_THAN ) { + if (qualityScore >= PRESERVE_QSCORES_LESS_THAN) { readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group - collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get( readGroupCollapsedKey ); - if( collapsedDatum == null ) { - dataCollapsedReadGroup.put( new RecalDatum(fullDatum), readGroupCollapsedKey ); - } else { - collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported + collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(readGroupCollapsedKey); + if (collapsedDatum == null) { + dataCollapsedReadGroup.put(new RecalDatum(fullDatum), readGroupCollapsedKey); + } + else { + collapsedDatum.combine(fullDatum); // using combine instead of increment in order to calculate overall aggregateQReported } } // Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ... qualityScoreCollapsedKey[1] = key[1]; // and quality score - collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get( qualityScoreCollapsedKey ); - if( collapsedDatum == null ) { - dataCollapsedQualityScore.put( new RecalDatum(fullDatum), qualityScoreCollapsedKey ); - } else { - collapsedDatum.increment( fullDatum ); + collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(qualityScoreCollapsedKey); + if (collapsedDatum == null) { + dataCollapsedQualityScore.put(new RecalDatum(fullDatum), qualityScoreCollapsedKey); + } + else { + collapsedDatum.increment(fullDatum); } // Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed - for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { + for (int iii = 0; iii < dataCollapsedByCovariate.size(); iii++) { covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ... covariateCollapsedKey[1] = key[1]; // and quality score ... final Object theCovariateElement = key[iii + 2]; // and the given covariate - if( theCovariateElement != null ) { + if (theCovariateElement != null) { covariateCollapsedKey[2] = theCovariateElement; - collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(iii).get( covariateCollapsedKey ); - if( collapsedDatum == null ) { - dataCollapsedByCovariate.get(iii).put( new RecalDatum(fullDatum), covariateCollapsedKey ); - } else { - collapsedDatum.increment( fullDatum ); + collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(iii).get(covariateCollapsedKey); + if (collapsedDatum == null) { + dataCollapsedByCovariate.get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey); + } + else { + collapsedDatum.increment(fullDatum); } } } @@ -167,150 +185,162 @@ public class RecalDataManager { /** * Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score - * that will be used in the sequential calculation in TableRecalibrationWalker + * that will be used in the sequential calculation in TableRecalibrationWalker + * * @param smoothing The smoothing parameter that goes into empirical quality score calculation - * @param maxQual At which value to cap the quality scores + * @param maxQual At which value to cap the quality scores */ - public final void generateEmpiricalQualities( final int smoothing, final int maxQual ) { + public final void generateEmpiricalQualities(final int smoothing, final int maxQual) { recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.data, smoothing, maxQual); recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.data, smoothing, maxQual); - for( NestedHashMap map : dataCollapsedByCovariate ) { + for (NestedHashMap map : dataCollapsedByCovariate) { recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual); checkForSingletons(map.data); } } - private void recursivelyGenerateEmpiricalQualities( final Map data, final int smoothing, final int maxQual ) { + private void recursivelyGenerateEmpiricalQualities(final Map data, final int smoothing, final int maxQual) { - for( Object comp : data.keySet() ) { + for (Object comp : data.keySet()) { final Object val = data.get(comp); - if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps - ((RecalDatum)val).calcCombinedEmpiricalQuality(smoothing, maxQual); - } else { // Another layer in the nested hash map - recursivelyGenerateEmpiricalQualities( (Map) val, smoothing, maxQual); + if (val instanceof RecalDatum) { // We are at the end of the nested hash maps + ((RecalDatum) val).calcCombinedEmpiricalQuality(smoothing, maxQual); + } + else { // Another layer in the nested hash map + recursivelyGenerateEmpiricalQualities((Map) val, smoothing, maxQual); } } } - private void checkForSingletons( final Map data ) { + private void checkForSingletons(final Map data) { // todo -- this looks like it's better just as a data.valueSet() call? - for( Object comp : data.keySet() ) { + for (Object comp : data.keySet()) { final Object val = data.get(comp); - if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps - if( data.keySet().size() == 1) { + if (val instanceof RecalDatum) { // We are at the end of the nested hash maps + if (data.keySet().size() == 1) { data.clear(); // don't TableRecalibrate a non-required covariate if it only has one element because that correction has already been done ... - // in a previous step of the sequential calculation model + // in a previous step of the sequential calculation model } - } else { // Another layer in the nested hash map - checkForSingletons( (Map) val ); + } + else { // Another layer in the nested hash map + checkForSingletons((Map) val); } } } /** * Get the appropriate collapsed table out of the set of all the tables held by this Object + * * @param covariate Which covariate indexes the desired collapsed HashMap * @return The desired collapsed HashMap */ - public final NestedHashMap getCollapsedTable( final int covariate ) { - if( covariate == 0) { + public final NestedHashMap getCollapsedTable(final int covariate) { + if (covariate == 0) { return dataCollapsedReadGroup; // Table where everything except read group has been collapsed - } else if( covariate == 1 ) { + } + else if (covariate == 1) { return dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed - } else { - return dataCollapsedByCovariate.get( covariate - 2 ); // Table where everything except read group, quality score, and given covariate has been collapsed + } + else { + return dataCollapsedByCovariate.get(covariate - 2); // Table where everything except read group, quality score, and given covariate has been collapsed } } /** * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string + * * @param read The read to adjust - * @param RAC The list of shared command line arguments + * @param RAC The list of shared command line arguments */ - public static void parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC ) { - GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord)read).getReadGroup(); + public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) { + GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup(); // If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments - if( readGroup == null ) { - if( RAC.DEFAULT_READ_GROUP != null && RAC.DEFAULT_PLATFORM != null) { - if( !warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null ) { + if (readGroup == null) { + if (RAC.DEFAULT_READ_GROUP != null && RAC.DEFAULT_PLATFORM != null) { + if (!warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null) { Utils.warnUser("The input .bam file contains reads with no read group. " + - "Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " + - "First observed at read with name = " + read.getReadName() ); + "Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " + + "First observed at read with name = " + read.getReadName()); warnUserNullReadGroup = true; } // There is no readGroup so defaulting to these values - readGroup = new GATKSAMReadGroupRecord( RAC.DEFAULT_READ_GROUP ); - readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); - ((GATKSAMRecord)read).setReadGroup( readGroup ); - } else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName() ); + readGroup = new GATKSAMReadGroupRecord(RAC.DEFAULT_READ_GROUP); + readGroup.setPlatform(RAC.DEFAULT_PLATFORM); + ((GATKSAMRecord) read).setReadGroup(readGroup); + } + else { + throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName()); } } - if( RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP) ) { // Collapse all the read groups into a single common String provided by the user + if (RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP)) { // Collapse all the read groups into a single common String provided by the user final String oldPlatform = readGroup.getPlatform(); - readGroup = new GATKSAMReadGroupRecord( RAC.FORCE_READ_GROUP ); - readGroup.setPlatform( oldPlatform ); - ((GATKSAMRecord)read).setReadGroup( readGroup ); + readGroup = new GATKSAMReadGroupRecord(RAC.FORCE_READ_GROUP); + readGroup.setPlatform(oldPlatform); + ((GATKSAMRecord) read).setReadGroup(readGroup); } - if( RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { - readGroup.setPlatform( RAC.FORCE_PLATFORM ); + if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { + readGroup.setPlatform(RAC.FORCE_PLATFORM); } - if ( readGroup.getPlatform() == null ) { - if( RAC.DEFAULT_PLATFORM != null ) { - if( !warnUserNullPlatform ) { + if (readGroup.getPlatform() == null) { + if (RAC.DEFAULT_PLATFORM != null) { + if (!warnUserNullPlatform) { Utils.warnUser("The input .bam file contains reads with no platform information. " + - "Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " + - "First observed at read with name = " + read.getReadName() ); + "Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " + + "First observed at read with name = " + read.getReadName()); warnUserNullPlatform = true; } - readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); - } else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName() ); + readGroup.setPlatform(RAC.DEFAULT_PLATFORM); + } + else { + throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName()); } } } /** * Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are inconsistent with the color space + * * @param read The SAMRecord to parse */ - public static void parseColorSpace( final SAMRecord read ) { + public static void parseColorSpace(final GATKSAMRecord read) { // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base - if( read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") ) { - if( read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null ) { // Haven't calculated the inconsistency array yet for this read + if (read.getReadGroup().getPlatform().toUpperCase().contains("SOLID")) { + if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); - if( attr != null ) { + if (attr != null) { byte[] colorSpace; - if( attr instanceof String ) { - colorSpace = ((String)attr).getBytes(); - } else { + if (attr instanceof String) { + colorSpace = ((String) attr).getBytes(); + } + else { throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); } // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read byte[] readBases = read.getReadBases(); - if( read.getReadNegativeStrandFlag() ) { - readBases = BaseUtils.simpleReverseComplement( read.getReadBases() ); + if (read.getReadNegativeStrandFlag()) { + readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); } final byte[] inconsistency = new byte[readBases.length]; int iii; byte prevBase = colorSpace[0]; // The sentinel - for( iii = 0; iii < readBases.length; iii++ ) { - final byte thisBase = getNextBaseFromColor( read, prevBase, colorSpace[iii + 1] ); - inconsistency[iii] = (byte)( thisBase == readBases[iii] ? 0 : 1 ); + for (iii = 0; iii < readBases.length; iii++) { + final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]); + inconsistency[iii] = (byte) (thisBase == readBases[iii] ? 0 : 1); prevBase = readBases[iii]; } - read.setAttribute( RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency ); + read.setAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency); - } else { + } + else { throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + - " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); } } } @@ -319,52 +349,57 @@ public class RecalDataManager { /** * Parse through the color space of the read and apply the desired --solid_recal_mode correction to the bases * This method doesn't add the inconsistent tag to the read like parseColorSpace does - * @param read The SAMRecord to parse + * + * @param read The SAMRecord to parse * @param originalQualScores The array of original quality scores to modify during the correction - * @param solidRecalMode Which mode of solid recalibration to apply - * @param refBases The reference for this read + * @param solidRecalMode Which mode of solid recalibration to apply + * @param refBases The reference for this read * @return A new array of quality scores that have been ref bias corrected */ - public static byte[] calcColorSpace( final SAMRecord read, byte[] originalQualScores, final SOLID_RECAL_MODE solidRecalMode, final byte[] refBases ) { + public static byte[] calcColorSpace(final GATKSAMRecord read, byte[] originalQualScores, final SOLID_RECAL_MODE solidRecalMode, final byte[] refBases) { final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); - if( attr != null ) { + if (attr != null) { byte[] colorSpace; - if( attr instanceof String ) { - colorSpace = ((String)attr).getBytes(); - } else { + if (attr instanceof String) { + colorSpace = ((String) attr).getBytes(); + } + else { throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); } // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read byte[] readBases = read.getReadBases(); final byte[] colorImpliedBases = readBases.clone(); - byte[] refBasesDirRead = AlignmentUtils.alignmentToByteArray( read.getCigar(), read.getReadBases(), refBases ); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases - if( read.getReadNegativeStrandFlag() ) { - readBases = BaseUtils.simpleReverseComplement( read.getReadBases() ); - refBasesDirRead = BaseUtils.simpleReverseComplement( refBasesDirRead.clone() ); + byte[] refBasesDirRead = AlignmentUtils.alignmentToByteArray(read.getCigar(), read.getReadBases(), refBases); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases + if (read.getReadNegativeStrandFlag()) { + readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); + refBasesDirRead = BaseUtils.simpleReverseComplement(refBasesDirRead.clone()); } final int[] inconsistency = new int[readBases.length]; byte prevBase = colorSpace[0]; // The sentinel - for( int iii = 0; iii < readBases.length; iii++ ) { - final byte thisBase = getNextBaseFromColor( read, prevBase, colorSpace[iii + 1] ); + for (int iii = 0; iii < readBases.length; iii++) { + final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]); colorImpliedBases[iii] = thisBase; - inconsistency[iii] = ( thisBase == readBases[iii] ? 0 : 1 ); + inconsistency[iii] = (thisBase == readBases[iii] ? 0 : 1); prevBase = readBases[iii]; } // Now that we have the inconsistency array apply the desired correction to the inconsistent bases - if( solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO ) { // Set inconsistent bases and the one before it to Q0 + if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO) { // Set inconsistent bases and the one before it to Q0 final boolean setBaseN = false; originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); - } else if( solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N ) { + } + else if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N) { final boolean setBaseN = true; originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); - } else if( solidRecalMode == SOLID_RECAL_MODE.REMOVE_REF_BIAS ) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases + } + else if (solidRecalMode == SOLID_RECAL_MODE.REMOVE_REF_BIAS) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead); } - } else { + } + else { throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); } @@ -372,26 +407,28 @@ public class RecalDataManager { return originalQualScores; } - public static boolean checkNoCallColorSpace( final SAMRecord read ) { - if( read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") ) { + public static boolean checkNoCallColorSpace(final GATKSAMRecord read) { + if (read.getReadGroup().getPlatform().toUpperCase().contains("SOLID")) { final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); - if( attr != null ) { + if (attr != null) { byte[] colorSpace; - if( attr instanceof String ) { - colorSpace = ((String)attr).substring(1).getBytes(); // trim off the Sentinel - } else { + if (attr instanceof String) { + colorSpace = ((String) attr).substring(1).getBytes(); // trim off the Sentinel + } + else { throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); } - for( byte color : colorSpace ) { - if( color != (byte)'0' && color != (byte)'1' && color != (byte)'2' && color != (byte)'3' ) { + for (byte color : colorSpace) { + if (color != (byte) '0' && color != (byte) '1' && color != (byte) '2' && color != (byte) '3') { return true; // There is a bad color in this SOLiD read and the user wants to skip over it } } - } else { + } + else { throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + - " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); } } @@ -400,90 +437,105 @@ public class RecalDataManager { /** * Perform the SET_Q_ZERO solid recalibration. Inconsistent color space bases and their previous base are set to quality zero - * @param read The SAMRecord to recalibrate - * @param readBases The bases in the read which have been RC'd if necessary - * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color + * + * @param read The SAMRecord to recalibrate + * @param readBases The bases in the read which have been RC'd if necessary + * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color * @param originalQualScores The array of original quality scores to set to zero if needed - * @param refBases The reference which has been RC'd if necessary - * @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar + * @param refBases The reference which has been RC'd if necessary + * @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar * @return The byte array of original quality scores some of which might have been set to zero */ - private static byte[] solidRecalSetToQZero( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores, - final byte[] refBases, final boolean setBaseN ) { + private static byte[] solidRecalSetToQZero(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores, final byte[] refBases, final boolean setBaseN) { final boolean negStrand = read.getReadNegativeStrandFlag(); - for( int iii = 1; iii < originalQualScores.length; iii++ ) { - if( inconsistency[iii] == 1 ) { - if( readBases[iii] == refBases[iii] ) { - if( negStrand ) { originalQualScores[originalQualScores.length-(iii+1)] = (byte)0; } - else { originalQualScores[iii] = (byte)0; } - if( setBaseN ) { readBases[iii] = (byte)'N'; } + for (int iii = 1; iii < originalQualScores.length; iii++) { + if (inconsistency[iii] == 1) { + if (readBases[iii] == refBases[iii]) { + if (negStrand) { + originalQualScores[originalQualScores.length - (iii + 1)] = (byte) 0; + } + else { + originalQualScores[iii] = (byte) 0; + } + if (setBaseN) { + readBases[iii] = (byte) 'N'; + } } // Set the prev base to Q0 as well - if( readBases[iii-1] == refBases[iii-1] ) { - if( negStrand ) { originalQualScores[originalQualScores.length-iii] = (byte)0; } - else { originalQualScores[iii-1] = (byte)0; } - if( setBaseN ) { readBases[iii-1] = (byte)'N'; } + if (readBases[iii - 1] == refBases[iii - 1]) { + if (negStrand) { + originalQualScores[originalQualScores.length - iii] = (byte) 0; + } + else { + originalQualScores[iii - 1] = (byte) 0; + } + if (setBaseN) { + readBases[iii - 1] = (byte) 'N'; + } } } } - if( negStrand ) { - readBases = BaseUtils.simpleReverseComplement( readBases.clone() ); // Put the bases back in reverse order to stuff them back in the read + if (negStrand) { + readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read } - read.setReadBases( readBases ); + read.setReadBases(readBases); return originalQualScores; } /** * Peform the REMOVE_REF_BIAS solid recalibration. Look at the color space qualities and probabilistically decide if the base should be change to match the color or left as reference - * @param read The SAMRecord to recalibrate - * @param readBases The bases in the read which have been RC'd if necessary - * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color + * + * @param read The SAMRecord to recalibrate + * @param readBases The bases in the read which have been RC'd if necessary + * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color * @param colorImpliedBases The bases implied by the color space, RC'd if necessary - * @param refBases The reference which has been RC'd if necessary + * @param refBases The reference which has been RC'd if necessary */ - private static void solidRecalRemoveRefBias( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases, - final byte[] refBases) { + private static void solidRecalRemoveRefBias(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases, final byte[] refBases) { final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG); - if( attr != null ) { + if (attr != null) { byte[] colorSpaceQuals; - if( attr instanceof String ) { - String x = (String)attr; + if (attr instanceof String) { + String x = (String) attr; colorSpaceQuals = x.getBytes(); SAMUtils.fastqToPhred(colorSpaceQuals); - } else { + } + else { throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName())); } - for( int iii = 1; iii < inconsistency.length - 1; iii++ ) { - if( inconsistency[iii] == 1 ) { - for( int jjj = iii - 1; jjj <= iii; jjj++ ) { // Correct this base and the one before it along the direction of the read - if( jjj == iii || inconsistency[jjj] == 0 ) { // Don't want to correct the previous base a second time if it was already corrected in the previous step - if( readBases[jjj] == refBases[jjj] ) { - if( colorSpaceQuals[jjj] == colorSpaceQuals[jjj+1] ) { // Equal evidence for the color implied base and the reference base, so flip a coin - final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt( 2 ); - if( rand == 0 ) { // The color implied base won the coin flip + for (int iii = 1; iii < inconsistency.length - 1; iii++) { + if (inconsistency[iii] == 1) { + for (int jjj = iii - 1; jjj <= iii; jjj++) { // Correct this base and the one before it along the direction of the read + if (jjj == iii || inconsistency[jjj] == 0) { // Don't want to correct the previous base a second time if it was already corrected in the previous step + if (readBases[jjj] == refBases[jjj]) { + if (colorSpaceQuals[jjj] == colorSpaceQuals[jjj + 1]) { // Equal evidence for the color implied base and the reference base, so flip a coin + final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(2); + if (rand == 0) { // The color implied base won the coin flip readBases[jjj] = colorImpliedBases[jjj]; } - } else { - final int maxQuality = Math.max((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]); - final int minQuality = Math.min((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]); + } + else { + final int maxQuality = Math.max((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]); + final int minQuality = Math.min((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]); int diffInQuality = maxQuality - minQuality; int numLow = minQuality; - if( numLow == 0 ) { + if (numLow == 0) { numLow++; diffInQuality++; } - final int numHigh = Math.round( numLow * (float)Math.pow(10.0f, (float) diffInQuality / 10.0f) ); // The color with higher quality is exponentially more likely - final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt( numLow + numHigh ); - if( rand >= numLow ) { // higher q score won - if( maxQuality == (int)colorSpaceQuals[jjj] ) { + final int numHigh = Math.round(numLow * (float) Math.pow(10.0f, (float) diffInQuality / 10.0f)); // The color with higher quality is exponentially more likely + final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(numLow + numHigh); + if (rand >= numLow) { // higher q score won + if (maxQuality == (int) colorSpaceQuals[jjj]) { readBases[jjj] = colorImpliedBases[jjj]; } // else ref color had higher q score, and won out, so nothing to do here - } else { // lower q score won - if( minQuality == (int)colorSpaceQuals[jjj] ) { + } + else { // lower q score won + if (minQuality == (int) colorSpaceQuals[jjj]) { readBases[jjj] = colorImpliedBases[jjj]; } // else ref color had lower q score, and won out, so nothing to do here } @@ -494,52 +546,56 @@ public class RecalDataManager { } } - if( read.getReadNegativeStrandFlag() ) { - readBases = BaseUtils.simpleReverseComplement( readBases.clone() ); // Put the bases back in reverse order to stuff them back in the read + if (read.getReadNegativeStrandFlag()) { + readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read } - read.setReadBases( readBases ); - } else { // No color space quality tag in file + read.setReadBases(readBases); + } + else { // No color space quality tag in file throw new UserException.MalformedBAM(read, "REMOVE_REF_BIAS recal mode requires color space qualities but they can't be found for read: " + read.getReadName()); } } /** * Given the base and the color calculate the next base in the sequence + * * @param prevBase The base - * @param color The color + * @param color The color * @return The next base in the sequence */ - private static byte getNextBaseFromColor( SAMRecord read, final byte prevBase, final byte color ) { - switch(color) { + private static byte getNextBaseFromColor(GATKSAMRecord read, final byte prevBase, final byte color) { + switch (color) { case '0': return prevBase; case '1': - return performColorOne( prevBase ); + return performColorOne(prevBase); case '2': - return performColorTwo( prevBase ); + return performColorTwo(prevBase); case '3': - return performColorThree( prevBase ); + return performColorThree(prevBase); default: - throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLID read, color = " + (char)color + - " Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias."); + throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLID read, color = " + (char) color + + " Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias."); } } /** * Check if this base is inconsistent with its color space. If it is then SOLID inserted the reference here and we should reduce the quality - * @param read The read which contains the color space to check against + * + * @param read The read which contains the color space to check against * @param offset The offset in the read at which to check * @return Returns true if the base was inconsistent with the color space */ - public static boolean isInconsistentColorSpace( final SAMRecord read, final int offset ) { + public static boolean isInconsistentColorSpace(final GATKSAMRecord read, final int offset) { final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG); - if( attr != null ) { - final byte[] inconsistency = (byte[])attr; + if (attr != null) { + final byte[] inconsistency = (byte[]) attr; // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference! - if( read.getReadNegativeStrandFlag() ) { // Negative direction - return inconsistency[inconsistency.length - offset - 1] != (byte)0; - } else { // Forward direction - return inconsistency[offset] != (byte)0; + if (read.getReadNegativeStrandFlag()) { // Negative direction + return inconsistency[inconsistency.length - offset - 1] != (byte) 0; + } + else { // Forward direction + return inconsistency[offset] != (byte) 0; } // This block of code is for if you want to check both the offset and the next base for color space inconsistency @@ -557,7 +613,8 @@ public class RecalDataManager { // } //} - } else { // No inconsistency array, so nothing is inconsistent + } + else { // No inconsistency array, so nothing is inconsistent return false; } } @@ -566,33 +623,32 @@ public class RecalDataManager { * Computes all requested covariates for every offset in the given read * by calling covariate.getValues(..). * - * @param gatkRead The read for which to compute covariate values. + * @param gatkRead The read for which to compute covariate values. * @param requestedCovariates The list of requested covariates. * @return An array of covariate values where result[i][j] is the covariate - * value for the ith position in the read and the jth covariate in - * reqeustedCovariates list. + * value for the ith position in the read and the jth covariate in + * reqeustedCovariates list. */ - public static Comparable[][] computeCovariates( final GATKSAMRecord gatkRead, final List requestedCovariates, final BaseRecalibration.BaseRecalibrationType modelType ) { - //compute all covariates for this read - final List requestedCovariatesRef = requestedCovariates; - final int numRequestedCovariates = requestedCovariatesRef.size(); - final int readLength = gatkRead.getReadLength(); + public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List requestedCovariates, final BaseRecalibration.BaseRecalibrationType modelType) { + //compute all covariates for this read + final int numRequestedCovariates = requestedCovariates.size(); + final int readLength = gatkRead.getReadLength(); - final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates]; - final Comparable[] tempCovariateValuesHolder = new Comparable[readLength]; + final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates]; + final Comparable[] tempCovariateValuesHolder = new Comparable[readLength]; - // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read - for( int i = 0; i < numRequestedCovariates; i++ ) { - requestedCovariatesRef.get(i).getValues( gatkRead, tempCovariateValuesHolder, modelType ); - for(int j = 0; j < readLength; j++) { - //copy values into a 2D array that allows all covar types to be extracted at once for - //an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types. - covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; - } - } + // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read + for (int i = 0; i < numRequestedCovariates; i++) { + requestedCovariates.get(i).getValues(gatkRead, tempCovariateValuesHolder, modelType); + for (int j = 0; j < readLength; j++) { + //copy values into a 2D array that allows all covar types to be extracted at once for + //an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types. + covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; + } + } - return covariateValues_offset_x_covar; - } + return covariateValues_offset_x_covar; + } /** * Perform a ceratin transversion (A <-> C or G <-> T) on the base. @@ -603,14 +659,19 @@ public class RecalDataManager { private static byte performColorOne(byte base) { switch (base) { case 'A': - case 'a': return 'C'; + case 'a': + return 'C'; case 'C': - case 'c': return 'A'; + case 'c': + return 'A'; case 'G': - case 'g': return 'T'; + case 'g': + return 'T'; case 'T': - case 't': return 'G'; - default: return base; + case 't': + return 'G'; + default: + return base; } } @@ -623,14 +684,19 @@ public class RecalDataManager { private static byte performColorTwo(byte base) { switch (base) { case 'A': - case 'a': return 'G'; + case 'a': + return 'G'; case 'C': - case 'c': return 'T'; + case 'c': + return 'T'; case 'G': - case 'g': return 'A'; + case 'g': + return 'A'; case 'T': - case 't': return 'C'; - default: return base; + case 't': + return 'C'; + default: + return base; } } @@ -643,14 +709,19 @@ public class RecalDataManager { private static byte performColorThree(byte base) { switch (base) { case 'A': - case 'a': return 'T'; + case 'a': + return 'T'; case 'C': - case 'c': return 'G'; + case 'c': + return 'G'; case 'G': - case 'g': return 'C'; + case 'g': + return 'C'; case 'T': - case 't': return 'A'; - default: return base; + case 't': + return 'A'; + default: + return base; } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java index 75de84cb4..ffdb0cca7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java @@ -43,36 +43,36 @@ public class RecalibrationArgumentCollection { // Shared Command Line Arguments ////////////////////////////////// @Hidden - @Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.") + @Argument(fullName = "default_read_group", shortName = "dRG", required = false, doc = "If a read has no read group then default to the provided String.") public String DEFAULT_READ_GROUP = null; @Hidden - @Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") + @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") public String DEFAULT_PLATFORM = null; @Hidden - @Argument(fullName="force_read_group", shortName="fRG", required=false, doc="If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.") + @Argument(fullName = "force_read_group", shortName = "fRG", required = false, doc = "If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.") public String FORCE_READ_GROUP = null; @Hidden - @Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") + @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; @Hidden - @Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false) + @Argument(fullName = "window_size_nqs", shortName = "nqs", doc = "The window size used by MinimumNQSCovariate for its calculation", required = false) public int WINDOW_SIZE = 5; /** * This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score. */ @Hidden - @Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false) + @Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false) public int HOMOPOLYMER_NBACK = 7; @Hidden - @Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required=false) + @Argument(fullName = "exception_if_no_tile", shortName = "throwTileException", doc = "If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required = false) public boolean EXCEPTION_IF_NO_TILE = false; /** * CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the * reads which have had the reference inserted because of color space inconsistencies. */ - @Argument(fullName="solid_recal_mode", shortName="sMode", required = false, doc="How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS") + @Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS") public RecalDataManager.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO; /** @@ -80,6 +80,13 @@ public class RecalibrationArgumentCollection { * no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in * their color space tag can not be recalibrated. */ - @Argument(fullName = "solid_nocall_strategy", shortName="solid_nocall_strategy", doc="Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required=false) + @Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false) public RecalDataManager.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; + + /** + * The context covariate will use a context of this size to calculate it's covariate value + */ + @Argument(fullName = "context_size", shortName = "cs", doc = "size of the k-mer context to be used", required = false) + int CONTEXT_SIZE = 8; + }