Do not allow the use of Ns in reads for graph construction.

Ns are treated as wildcards in the PairHMM so creating haplotypes with Ns gives them artificial advantages over other ones.
This was the cause of at least one FN where there were Ns at a SNP position.
This commit is contained in:
Eric Banks 2013-05-30 14:00:43 -04:00
parent e4e7d39e2c
commit 77868d034f
3 changed files with 41 additions and 6 deletions

View File

@ -50,6 +50,7 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerCounter;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.collections.PrimitivePair;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -611,7 +612,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
int lastGood = -1; // the index of the last good base we've seen
for( int end = 0; end <= sequence.length; end++ ) {
if ( end == sequence.length || qualities[end] < minBaseQualityToUseInAssembly ) {
if ( end == sequence.length || ! baseIsUsableForAssembly(sequence[end], qualities[end]) ) {
// the first good base is at lastGood, can be -1 if last base was bad
final int start = lastGood;
// the stop base is end - 1 (if we're not at the end of the sequence)
@ -631,6 +632,18 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
}
}
/**
* Determines whether a base can safely be used for assembly.
* Currently disallows Ns and/or those with low quality
*
* @param base the base under consideration
* @param qual the quality of that base
* @return true if the base can be used for assembly, false otherwise
*/
protected boolean baseIsUsableForAssembly(final byte base, final byte qual) {
return base != BaseUtils.Base.N.base && qual >= minBaseQualityToUseInAssembly;
}
/**
* Get the set of non-unique kmers in this graph. For debugging purposes
* @return a non-null set of kmers

View File

@ -251,7 +251,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest {
for ( int snpPos = 0; snpPos < windowSize; snpPos++) {
if ( snpPos > excludeVariantsWithXbp && (windowSize - snpPos) >= excludeVariantsWithXbp ) {
final byte[] altBases = ref.getBytes();
altBases[snpPos] = 'N';
altBases[snpPos] = altBases[snpPos] == 'A' ? (byte)'C' : (byte)'A';
final String alt = new String(altBases);
tests.add(new Object[]{"SNP at " + snpPos, assembler, refLoc, ref, alt});
}

View File

@ -48,10 +48,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
@ -180,7 +178,31 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
Assert.assertFalse(rtgraph75.hasCycles());
}
// TODO -- update to use determineKmerSizeAndNonUniques directly
@Test(enabled = !DEBUG)
public void testNsInReadsAreNotUsedForGraph() {
final int length = 100;
final byte[] ref = Utils.dupBytes((byte)'A', length);
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(25);
rtgraph.addSequence("ref", ref, null, true);
// add reads with Ns at any position
for ( int i = 0; i < length; i++ ) {
final byte[] bases = ref.clone();
bases[i] = 'N';
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, Utils.dupBytes((byte) 30, length), length + "M");
rtgraph.addRead(read);
}
rtgraph.buildGraphIfNecessary();
final SeqGraph graph = rtgraph.convertToSequenceGraph();
final KBestPaths<SeqVertex,BaseEdge> pathFinder = new KBestPaths<>(false);
Assert.assertEquals(pathFinder.getKBestPaths(graph, length, graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex()).size(), 1);
}
// TODO -- update to use determineKmerSizeAndNonUniques directly
// @DataProvider(name = "KmerSizeData")
// public Object[][] makeKmerSizeDataProvider() {
// List<Object[]> tests = new ArrayList<Object[]>();