The CycleCovariate for 454 data is now the TACG flow cycle. That is, each flow grabs all the T's, A's, C's, and G's in order in a single cycle. This is changed from incrementing the cycle whenever there is a discontinuous nucleotide along the direction of the read.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2408 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c39675d2c1
commit
fdf542c214
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
|
|||
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/*
|
||||
|
|
@ -36,9 +37,9 @@ import net.sf.samtools.SAMRecord;
|
|||
*
|
||||
* The Cycle covariate.
|
||||
* For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read)
|
||||
* For 454 the cycle is the number of discontinuous nucleotides seen during the length of the read
|
||||
* For example, for the read: AAACCCCGAAATTTTTTT
|
||||
* the cycle would be 111222234445555555
|
||||
* For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle
|
||||
* For example, for the read: AAACCCCGAAATTTTTACT
|
||||
* the cycle would be 1111111122233333334
|
||||
* For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round
|
||||
*/
|
||||
|
||||
|
|
@ -78,23 +79,34 @@ public class CycleCovariate implements StandardCovariate {
|
|||
else if( read.getReadGroup().getPlatform().contains( "454" ) ) { // Some bams have "LS454" and others have just "454"
|
||||
int cycle = 0;
|
||||
byte[] bases = read.getReadBases();
|
||||
|
||||
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
|
||||
// For example, AAAAAAA was probably read in two flow cycles but here we count it as one
|
||||
if( !read.getReadNegativeStrandFlag() ) { // forward direction
|
||||
byte prevBase = bases[0];
|
||||
for( int iii = 1; iii <= offset; iii++ ) {
|
||||
if( bases[iii] != prevBase ) { // This base doesn't match the previous one so it is a new cycle
|
||||
cycle++;
|
||||
prevBase = bases[iii];
|
||||
}
|
||||
int iii = 0;
|
||||
while( iii <= offset )
|
||||
{
|
||||
while( iii <= offset && bases[iii] == (byte)'T' ) { iii++; }
|
||||
while( iii <= offset && bases[iii] == (byte)'A' ) { iii++; }
|
||||
while( iii <= offset && bases[iii] == (byte)'C' ) { iii++; }
|
||||
while( iii <= offset && bases[iii] == (byte)'G' ) { iii++; }
|
||||
if( iii <= offset ) { cycle++; }
|
||||
if( iii <= offset && !BaseUtils.isRegularBase(bases[iii]) ) { iii++; }
|
||||
|
||||
}
|
||||
} else { // negative direction
|
||||
byte prevBase = bases[bases.length-1];
|
||||
for( int iii = bases.length-2; iii >= offset; iii-- ) {
|
||||
if( bases[iii] != prevBase ) { // This base doesn't match the previous one so it is a new cycle
|
||||
cycle++;
|
||||
prevBase = bases[iii];
|
||||
}
|
||||
int iii = bases.length-1;
|
||||
while( iii >= offset )
|
||||
{
|
||||
while( iii >= offset && bases[iii] == (byte)'T' ) { iii--; }
|
||||
while( iii >= offset && bases[iii] == (byte)'A' ) { iii--; }
|
||||
while( iii >= offset && bases[iii] == (byte)'C' ) { iii--; }
|
||||
while( iii >= offset && bases[iii] == (byte)'G' ) { iii--; }
|
||||
if( iii >= offset ) { cycle++; }
|
||||
if( iii >= offset && !BaseUtils.isRegularBase(bases[iii]) ) { iii--; }
|
||||
}
|
||||
}
|
||||
|
||||
return cycle;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ public class RecalibrationArgumentCollection {
|
|||
public int WINDOW_SIZE = 5;
|
||||
@Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false)
|
||||
public int HOMOPOLYMER_NBACK = 7;
|
||||
|
||||
|
||||
public boolean checkSolidRecalMode() {
|
||||
return ( SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") ||
|
||||
SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO_BASE_N") || SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") );
|
||||
|
|
|
|||
|
|
@ -18,8 +18,8 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "16f87c9644b27c3c3fe7a963eed45d2d" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "5a4873064e602e1f945f3881c6b4a3e5");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "b039f405580ad844cd1dfd4d8ba25913" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "dbbb9c49f2e0b1f117b88eb386fb6d0f" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "42f78368cc85eb3963329ad48c244e80" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "f97ec61e98db1a580900d9f4985080d0" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -51,8 +51,8 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "d7f3e0db5ed9fefc917144a0f937d50d" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "1f1770363f381d06e8db9fead03ad7bb");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "3fe6ee95cfa6155f8cc8e3e0a6330ad2" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "732267e9cd02a72b8c85238f47487509" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "563cdb25b5a963121299ad319d5ec2ae" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "dd8b61060914b75199ca5babc60248fa" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
|
|||
Loading…
Reference in New Issue