New GATKSAMRecord concept of a strandless read, update to FS

-- Strandless GATK reads are ones where they don't really have a meaningful strand value, such as Reduced Reads or fragment merged reads.  Added GATKSAMRecord support for such reads, along with unit tests
-- The merge overlapping fragments code in FragmentUtils now produces strandless merged fragments
-- FisherStrand annotation generalized to treat strandless as providing 1/2 the representative count for both strands.  This means that merged fragments are properly handled from the HC, so we don't hallucinate fake strand-bias just because we managed to merge a lot of reads together.
-- The previous getReducedCount() wouldn't work if a read was made into a reduced read after getReducedCount() had been called.  Added new GATKSAMRecord method setReducedReadCounts() that does the right thing.  Updated SlidingWindow and SyntheticRead to explicitly call this function, and so the readTag parameter is now gone.
-- Update MD5s for change to FS calculation.  Differences are just minor updates to the FS
This commit is contained in:
Mark DePristo 2013-03-11 14:54:20 -04:00
parent 925846c65f
commit b5b63eaac7
10 changed files with 117 additions and 30 deletions

View File

@ -47,6 +47,7 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import cern.jet.math.Arithmetic;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -74,6 +75,8 @@ import java.util.*;
* calculated for certain complex indel cases or for multi-allelic sites.
*/
public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
private final static Logger logger = Logger.getLogger(FisherStrand.class);
private static final String FS = "FS";
private static final double MIN_PVALUE = 1E-320;
private static final int MIN_QUAL_FOR_FILTERED_TEST = 17;
@ -95,6 +98,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
else if (stratifiedPerReadAlleleLikelihoodMap != null) {
// either SNP with no alignment context, or indels: per-read likelihood map needed
final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc);
// logger.info("VC " + vc);
// printTable(table, 0.0);
return pValueForBestTable(table, null);
}
else
@ -131,9 +136,6 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
/**
 * Builds the FS annotation map for a single contingency-table p-value.
 *
 * The p-value is clamped to MIN_PVALUE before phred-scaling so the resulting
 * FS value can never be infinite.
 *
 * @param pValue the Fisher exact test p-value for the best table
 * @return a singleton map from the FS key to the phred-scaled value, formatted to 3 decimals
 */
private Map<String, Object> annotationForOneTable(final double pValue) {
    // clamp to MIN_PVALUE to prevent INFINITYs after phred-scaling
    final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(Math.max(pValue, MIN_PVALUE)));
    return Collections.singletonMap(FS, value);
}
public List<String> getKeyNames() {
@ -192,7 +194,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
/**
 * Debugging helper: logs a 2x2 strand contingency table and its p-value.
 *
 * @param table  the 2x2 contingency table, indexed [allele][strand]
 * @param pValue the Fisher exact test p-value computed for the table
 */
private static void printTable(final int[][] table, final double pValue) {
    // emit through log4j only; writing the same line to stdout as well duplicated output
    logger.info(String.format("%d %d; %d %d : %f", table[0][0], table[0][1], table[1][0], table[1][1], pValue));
}
private static boolean rotateTable(int[][] table) {
@ -315,13 +317,21 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
final boolean matchesAlt = allele.equals(alt, true);
if ( matchesRef || matchesAlt ) {
final int row = matchesRef ? 0 : 1;
final boolean isFW = !read.getReadNegativeStrandFlag();
int row = matchesRef ? 0 : 1;
int column = isFW ? 0 : 1;
table[row][column] += representativeCount;
if ( read.isStrandless() ) {
// a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1
// (the 1 is to ensure that a strandless read always counts as an observation on both strands, even
// if the read is only seen once, because it's a merged read or other)
final int toAdd = Math.max(representativeCount / 2, 1);
table[row][0] += toAdd;
table[row][1] += toAdd;
} else {
// a normal read with an actual strand
final boolean isFW = !read.getReadNegativeStrandFlag();
final int column = isFW ? 0 : 1;
table[row][column] += representativeCount;
}
}
}
}

View File

@ -567,7 +567,7 @@ public class SlidingWindow {
ObjectArrayList<GATKSAMRecord> result = new ObjectArrayList<GATKSAMRecord>();
if (filteredDataConsensus == null)
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), hasIndelQualities, isNegativeStrand);
ListIterator<HeaderElement> headerElementIterator = header.listIterator(start);
for (int index = start; index < end; index++) {
@ -583,7 +583,7 @@ public class SlidingWindow {
if ( filteredDataConsensus.getRefStart() + filteredDataConsensus.size() != headerElement.getLocation() ) {
result.add(finalizeFilteredDataConsensus());
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), hasIndelQualities, isNegativeStrand);
}
genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS());
@ -606,7 +606,7 @@ public class SlidingWindow {
@Requires({"start >= 0 && (end >= start || end == 0)"})
private void addToRunningConsensus(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
if (runningConsensus == null)
runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), hasIndelQualities, isNegativeStrand);
Iterator<HeaderElement> headerElementIterator = header.listIterator(start);
for (int index = start; index < end; index++) {

View File

@ -124,8 +124,7 @@ public class SyntheticRead {
private final ObjectArrayList<SingleBaseInfo> basesCountsQuals;
private double mappingQuality; // the average of the rms of the mapping qualities of all the reads that contributed to this consensus
private String readTag;
private double mappingQuality;
// Information to produce a GATKSAMRecord
private SAMFileHeader header;
@ -147,14 +146,12 @@ public class SyntheticRead {
* @param contigIndex the read's contig index
* @param readName the read's name
* @param refStart the alignment start (reference based)
* @param readTag the reduce reads tag for the synthetic read
*/
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) {
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) {
final int initialCapacity = 10000;
basesCountsQuals = new ObjectArrayList<SingleBaseInfo>(initialCapacity);
mappingQuality = 0.0;
this.readTag = readTag;
this.header = header;
this.readGroupRecord = readGroupRecord;
this.contig = contig;
@ -165,13 +162,12 @@ public class SyntheticRead {
this.isNegativeStrand = isNegativeRead;
}
public SyntheticRead(ObjectArrayList<BaseIndex> bases, ByteArrayList counts, ByteArrayList quals, ByteArrayList insertionQuals, ByteArrayList deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) {
public SyntheticRead(ObjectArrayList<BaseIndex> bases, ByteArrayList counts, ByteArrayList quals, ByteArrayList insertionQuals, ByteArrayList deletionQuals, double mappingQuality, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) {
basesCountsQuals = new ObjectArrayList<SingleBaseInfo>(bases.size());
for (int i = 0; i < bases.size(); ++i) {
basesCountsQuals.add(new SingleBaseInfo(bases.get(i).getOrdinalByte(), counts.get(i), quals.get(i), insertionQuals.get(i), deletionQuals.get(i)));
}
this.mappingQuality = mappingQuality;
this.readTag = readTag;
this.header = header;
this.readGroupRecord = readGroupRecord;
this.contig = contig;
@ -228,7 +224,7 @@ public class SyntheticRead {
read.setReadBases(convertReadBases());
read.setMappingQuality((int) Math.ceil(mappingQuality / basesCountsQuals.size()));
read.setReadGroup(readGroupRecord);
read.setAttribute(readTag, convertBaseCounts());
read.setReducedReadCounts(convertBaseCounts());
if (hasIndelQualities) {
read.setBaseQualities(convertInsertionQualities(), EventType.BASE_INSERTION);

View File

@ -77,7 +77,7 @@ public void testBaseCounts() {
new TestRead(bases, quals, new byte[] {1, 127, 51, 126}, new byte [] {1, 126, 50, 125})};
for (TestRead testRead : testReads) {
SyntheticRead syntheticRead = new SyntheticRead(new ObjectArrayList<BaseIndex>(testRead.getBases()), new ByteArrayList(testRead.getCounts()), new ByteArrayList(testRead.getQuals()), new ByteArrayList(testRead.getInsQuals()), new ByteArrayList(testRead.getDelQuals()), artificialMappingQuality, GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false, false);
SyntheticRead syntheticRead = new SyntheticRead(new ObjectArrayList<BaseIndex>(testRead.getBases()), new ByteArrayList(testRead.getCounts()), new ByteArrayList(testRead.getQuals()), new ByteArrayList(testRead.getInsQuals()), new ByteArrayList(testRead.getDelQuals()), artificialMappingQuality, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false, false);
Assert.assertEquals(syntheticRead.convertBaseCounts(), testRead.getExpectedCounts());
}
}

View File

@ -63,7 +63,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleComplex() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a2232995ca9bec143e664748845a0045");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "b83b53741edb07218045d6f25f20a18b");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {

View File

@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "8f33e40686443b9a72de45d5a9da1861");
HCTest(CEUTRIO_BAM, "", "4a2880f0753e6e813b9e0c35209b3708");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "8f2b047cdace0ef122d6ad162e7bc5b9");
HCTest(NA12878_BAM, "", "588892934f2e81247bf32e457db88449");
}
@Test(enabled = false)
@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"9d4be26a2c956ba4b7b4044820eab030");
"fa1b92373c89d2238542a319ad25c257");
}
private void HCTestIndelQualityScores(String bam, String args, String md5) {
@ -112,7 +112,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestStructuralIndels() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("03557376242bdf78c5237703b762573b"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("9296f1af6cf1f1cc4b79494eb366e976"));
executeTest("HCTestStructuralIndels: ", spec);
}
@ -134,7 +134,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("adb08cb25e902cfe0129404a682b2169"));
Arrays.asList("cf0a1bfded656153578df6cf68aa68a2"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
@ -142,7 +142,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testReducedBamWithReadsNotFullySpanningDeletion() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
Arrays.asList("a43c595a617589388ff3d7e2ddc661e7"));
Arrays.asList("addceb63f5bfa9f11e15335d5bf641e9"));
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
}
}

View File

@ -263,6 +263,7 @@ public final class FragmentUtils {
}
final GATKSAMRecord returnRead = new GATKSAMRecord( firstRead.getHeader() );
returnRead.setIsStrandless(true);
returnRead.setAlignmentStart( firstRead.getSoftStart() );
returnRead.setReadBases( bases );
returnRead.setBaseQualities( quals );

View File

@ -74,6 +74,8 @@ public class GATKSAMRecord extends BAMRecord {
private int softEnd = UNINITIALIZED;
private Integer adapterBoundary = null;
private boolean isStrandlessRead = false;
// because some values can be null, we don't want to duplicate effort
private boolean retrievedReadGroup = false;
private boolean retrievedReduceReadCounts = false;
@ -141,6 +143,45 @@ public class GATKSAMRecord extends BAMRecord {
return ArtificialSAMUtils.createArtificialRead(cigar);
}
///////////////////////////////////////////////////////////////////////////////
// *** support for reads without meaningful strand information ***//
///////////////////////////////////////////////////////////////////////////////
/**
 * Tells whether this read carries meaningful strand information.
 *
 * Composite reads — for instance reads produced by merging overlapping
 * fragments — have no single originating strand. Code that depends on
 * strandedness (such as the FisherStrand / FS annotation) must handle
 * such reads specially.
 *
 * @return true if this read has no meaningful strand value
 */
public boolean isStrandless() {
    return this.isStrandlessRead;
}
/**
 * Marks this read as strandless, or restores normal stranded behavior.
 *
 * @param isStrandless true if this read doesn't have a meaningful strandedness value
 */
public void setIsStrandless(final boolean isStrandless) {
    isStrandlessRead = isStrandless;
}
/**
 * {@inheritDoc}
 *
 * A strandless read always reports the forward strand: since it has no
 * meaningful strand value, it never claims to be on the negative strand.
 */
@Override
public boolean getReadNegativeStrandFlag() {
    if ( isStrandless() )
        return false;
    return super.getReadNegativeStrandFlag();
}
/**
 * {@inheritDoc}
 *
 * @throws IllegalStateException if this read is strandless, since a
 *         strandless read cannot be given a strand
 */
@Override
public void setReadNegativeStrandFlag(boolean flag) {
    if ( ! isStrandless() ) {
        super.setReadNegativeStrandFlag(flag);
    } else {
        throw new IllegalStateException("Cannot set the strand of a strandless read");
    }
}
///////////////////////////////////////////////////////////////////////////////
// *** The following methods are overloaded to cache the appropriate data ***//
///////////////////////////////////////////////////////////////////////////////
@ -313,6 +354,15 @@ public class GATKSAMRecord extends BAMRecord {
return getReducedReadCounts() != null;
}
/**
 * Stores the reduced read counts for this record in the reduced-read tag.
 *
 * Also clears the cached-retrieval flag so that a later read of the counts
 * picks up the freshly written attribute rather than a stale cached value.
 *
 * @param counts the count array
 */
public void setReducedReadCounts(final byte[] counts) {
    setAttribute(REDUCED_READ_CONSENSUS_TAG, counts);
    // invalidate the cache so the new attribute value is re-read on next access
    retrievedReduceReadCounts = false;
}
/**
* The number of bases corresponding the i'th base of the reduced read.
*

View File

@ -229,6 +229,7 @@ public class FragmentUtilsUnitTest extends BaseTest {
if ( expectedMerged == null ) {
Assert.assertNull(actual, "Expected reads not to merge, but got non-null result from merging");
} else {
Assert.assertTrue(actual.isStrandless(), "Merged reads should be strandless");
Assert.assertNotNull(actual, "Expected reads to merge, but got null result from merging");
// I really care about the bases, the quals, the CIGAR, and the read group tag
Assert.assertEquals(actual.getCigarString(), expectedMerged.getCigarString());

View File

@ -64,6 +64,7 @@ public class GATKSAMRecordUnitTest extends BaseTest {
for (int i = 0; i < reducedRead.getReadLength(); i++) {
Assert.assertEquals(reducedRead.getReducedCount(i), REDUCED_READ_COUNTS[i], "Reduced read count not set to the expected value at " + i);
}
Assert.assertEquals(reducedRead.isStrandless(), false, "Reduced reads don't have meaningful strandedness information");
}
@Test
@ -103,7 +104,35 @@ public class GATKSAMRecordUnitTest extends BaseTest {
read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, null);
Assert.assertEquals(read.getAlignmentStart(), read.getOriginalAlignmentStart());
Assert.assertEquals(read.getAlignmentEnd() - alignmentShift, read.getOriginalAlignmentEnd());
}
/**
 * Verifies that a normal read is never strandless, that its strand flag
 * round-trips, and that once marked strandless the negative-strand flag
 * always reads back false.
 */
@Test
public void testStrandlessReads() {
    // NOTE(review): bases/quals have 8 entries but the cigar is "6M" — confirm
    // ArtificialSAMUtils tolerates this mismatch
    final byte [] bases = {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'};
    final byte [] quals = {20 , 20 , 20 , 20 , 20 , 20 , 20 , 20 };
    GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "6M");
    Assert.assertEquals(read.isStrandless(), false);
    read.setReadNegativeStrandFlag(false);
    Assert.assertEquals(read.isStrandless(), false);
    Assert.assertEquals(read.getReadNegativeStrandFlag(), false);
    read.setReadNegativeStrandFlag(true);
    Assert.assertEquals(read.isStrandless(), false);
    Assert.assertEquals(read.getReadNegativeStrandFlag(), true);
    // the underlying flag is still true from above; marking the read strandless
    // must mask it (the redundant second setReadNegativeStrandFlag(true) was removed)
    read.setIsStrandless(true);
    Assert.assertEquals(read.isStrandless(), true);
    Assert.assertEquals(read.getReadNegativeStrandFlag(), false, "negative strand flag should return false even though it's set for a strandless read");
}
/**
 * Attempting to assign a strand to a strandless read must be rejected
 * with an IllegalStateException.
 */
@Test(expectedExceptions = IllegalStateException.class)
public void testStrandlessReadsFailSetStrand() {
    final byte[] readBases = {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'};
    final byte[] readQuals = {20, 20, 20, 20, 20, 20, 20, 20};
    final GATKSAMRecord strandlessRead = ArtificialSAMUtils.createArtificialRead(readBases, readQuals, "6M");
    strandlessRead.setIsStrandless(true);
    // this call is expected to throw
    strandlessRead.setReadNegativeStrandFlag(true);
}
}