diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index 225628292..74956feef 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.BaseUtils; import net.sf.samtools.SAMRecord; /* @@ -36,9 +37,9 @@ import net.sf.samtools.SAMRecord; * * The Cycle covariate. * For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read) - * For 454 the cycle is the number of discontinuous nucleotides seen during the length of the read - * For example, for the read: AAACCCCGAAATTTTTTT - * the cycle would be 111222234445555555 + * For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle + * For example, for the read: AAACCCCGAAATTTTTACT + * the cycle would be 1111111122233333334 * For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round */ @@ -78,23 +79,34 @@ public class CycleCovariate implements StandardCovariate { else if( read.getReadGroup().getPlatform().contains( "454" ) ) { // Some bams have "LS454" and others have just "454" int cycle = 0; byte[] bases = read.getReadBases(); + + // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change + // For example, AAAAAAA was probably read in two flow cycles but here we count it as one if( !read.getReadNegativeStrandFlag() ) { // forward direction - byte prevBase = bases[0]; - for( int iii = 1; iii <= offset; iii++ ) { - if( bases[iii] != prevBase ) { // This base doesn't match the previous one so it is a new cycle - cycle++; - prevBase = bases[iii]; - } + int iii = 0; + while( iii <= offset ) + { + while( iii <= offset && bases[iii] == (byte)'T' ) { iii++; } + while( iii <= offset && bases[iii] == (byte)'A' ) { iii++; } + while( iii <= offset && bases[iii] == (byte)'C' ) { iii++; } + while( iii <= offset && bases[iii] == (byte)'G' ) { iii++; } + if( iii <= offset ) { cycle++; } + if( iii <= offset && !BaseUtils.isRegularBase(bases[iii]) ) { iii++; } + } } else { // negative direction - byte prevBase = bases[bases.length-1]; - for( int iii = bases.length-2; iii >= offset; iii-- ) { - if( bases[iii] != prevBase ) { // This base doesn't match the previous one so it is a new cycle - cycle++; - prevBase = bases[iii]; - } + int iii = bases.length-1; + while( iii >= offset ) + { + while( iii >= offset && bases[iii] == (byte)'T' ) { iii--; } + while( iii >= offset && bases[iii] == (byte)'A' ) { iii--; } + while( iii >= offset && bases[iii] == (byte)'C' ) { iii--; } + while( iii >= offset && bases[iii] == (byte)'G' ) { iii--; } + if( iii >= offset ) { cycle++; } + if( iii >= offset && !BaseUtils.isRegularBase(bases[iii]) ) { iii--; } } } + return cycle; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java index 780330210..ed355404d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java @@ -59,7 +59,7 @@ public class RecalibrationArgumentCollection { public int WINDOW_SIZE = 5; @Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false) public int HOMOPOLYMER_NBACK = 7; - + public boolean checkSolidRecalMode() { return ( SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") || SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO_BASE_N") || SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") ); diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java index 76567bd35..3436a075e 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java @@ -18,8 +18,8 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { HashMap e = new HashMap(); e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "16f87c9644b27c3c3fe7a963eed45d2d" ); e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "5a4873064e602e1f945f3881c6b4a3e5"); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "b039f405580ad844cd1dfd4d8ba25913" ); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "dbbb9c49f2e0b1f117b88eb386fb6d0f" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "42f78368cc85eb3963329ad48c244e80" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "f97ec61e98db1a580900d9f4985080d0" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -51,8 +51,8 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { HashMap e = new HashMap(); e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "d7f3e0db5ed9fefc917144a0f937d50d" ); e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "1f1770363f381d06e8db9fead03ad7bb"); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "3fe6ee95cfa6155f8cc8e3e0a6330ad2" ); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "732267e9cd02a72b8c85238f47487509" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "563cdb25b5a963121299ad319d5ec2ae" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "dd8b61060914b75199ca5babc60248fa" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey();