The CycleCovariate for 454 data is now the TACG flow cycle. That is, each flow grabs all the T's, A's, C's, and G's in order in a single cycle. This is changed from incrementing the cycle whenever there is a discontinuous nucleotide along the direction of the read.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2408 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-12-18 22:39:51 +00:00
parent c39675d2c1
commit fdf542c214
3 changed files with 32 additions and 20 deletions

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.BaseUtils;
import net.sf.samtools.SAMRecord;
/*
@ -36,9 +37,9 @@ import net.sf.samtools.SAMRecord;
*
* The Cycle covariate.
* For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read)
* For 454 the cycle is the number of discontinuous nucleotides seen during the length of the read
* For example, for the read: AAACCCCGAAATTTTTTT
* the cycle would be 111222234445555555
* For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle
* For example, for the read: AAACCCCGAAATTTTTACT
* the cycle would be 1111111122233333334
* For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round
*/
@ -78,23 +79,34 @@ public class CycleCovariate implements StandardCovariate {
else if( read.getReadGroup().getPlatform().contains( "454" ) ) { // Some bams have "LS454" and others have just "454"
int cycle = 0;
byte[] bases = read.getReadBases();
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
// For example, AAAAAAA was probably read in two flow cycles but here we count it as one
if( !read.getReadNegativeStrandFlag() ) { // forward direction
byte prevBase = bases[0];
for( int iii = 1; iii <= offset; iii++ ) {
if( bases[iii] != prevBase ) { // This base doesn't match the previous one so it is a new cycle
cycle++;
prevBase = bases[iii];
}
int iii = 0;
while( iii <= offset )
{
while( iii <= offset && bases[iii] == (byte)'T' ) { iii++; }
while( iii <= offset && bases[iii] == (byte)'A' ) { iii++; }
while( iii <= offset && bases[iii] == (byte)'C' ) { iii++; }
while( iii <= offset && bases[iii] == (byte)'G' ) { iii++; }
if( iii <= offset ) { cycle++; }
if( iii <= offset && !BaseUtils.isRegularBase(bases[iii]) ) { iii++; }
}
} else { // negative direction
byte prevBase = bases[bases.length-1];
for( int iii = bases.length-2; iii >= offset; iii-- ) {
if( bases[iii] != prevBase ) { // This base doesn't match the previous one so it is a new cycle
cycle++;
prevBase = bases[iii];
}
int iii = bases.length-1;
while( iii >= offset )
{
while( iii >= offset && bases[iii] == (byte)'T' ) { iii--; }
while( iii >= offset && bases[iii] == (byte)'A' ) { iii--; }
while( iii >= offset && bases[iii] == (byte)'C' ) { iii--; }
while( iii >= offset && bases[iii] == (byte)'G' ) { iii--; }
if( iii >= offset ) { cycle++; }
if( iii >= offset && !BaseUtils.isRegularBase(bases[iii]) ) { iii--; }
}
}
return cycle;
}

View File

@ -59,7 +59,7 @@ public class RecalibrationArgumentCollection {
public int WINDOW_SIZE = 5;
@Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false)
public int HOMOPOLYMER_NBACK = 7;
public boolean checkSolidRecalMode() {
return ( SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") ||
SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO_BASE_N") || SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") );

View File

@ -18,8 +18,8 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "16f87c9644b27c3c3fe7a963eed45d2d" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "5a4873064e602e1f945f3881c6b4a3e5");
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "b039f405580ad844cd1dfd4d8ba25913" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "dbbb9c49f2e0b1f117b88eb386fb6d0f" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "42f78368cc85eb3963329ad48c244e80" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "f97ec61e98db1a580900d9f4985080d0" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();
@ -51,8 +51,8 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "d7f3e0db5ed9fefc917144a0f937d50d" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "1f1770363f381d06e8db9fead03ad7bb");
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "3fe6ee95cfa6155f8cc8e3e0a6330ad2" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "732267e9cd02a72b8c85238f47487509" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "563cdb25b5a963121299ad319d5ec2ae" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "dd8b61060914b75199ca5babc60248fa" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();