Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Guillermo del Angel 2012-04-04 20:17:18 -04:00
commit 6913710e89
12 changed files with 238 additions and 116 deletions

View File

@ -165,6 +165,9 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false)
private boolean requireStrictAlleleMatch = false;
@Argument(fullName="keepAC0", shortName="keepAC0", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false)
private boolean keepSitesWithAC0 = false;
/**
* If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying
* variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf -eval chr2.vcf etc.
@ -580,4 +583,8 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
public GenomeAnalysisEngine getToolkit() {
return super.getToolkit();
}
/**
 * @return true when polymorphic-site modules should skip sites with AC == 0 (the default);
 *         returns false when the user supplied -keepAC0 (see the keepSitesWithAC0 argument).
 */
public boolean ignoreAC0Sites() {
return ! keepSitesWithAC0;
}
}

View File

@ -93,7 +93,7 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
// So in order to maintain consistency with the previous implementation (and the intention of the original author), I've
// added in a proxy check for monomorphic status here.
// Protect against the case when vc only has no-calls too - can happen if we stratify by sample and the sample has a single no-call.
if ( vc1.isMonomorphicInSamples() ) {
if ( getWalker().ignoreAC0Sites() && vc1.isMonomorphicInSamples() ) {
nRefLoci++;
} else {
switch (vc1.getType()) {

View File

@ -53,6 +53,7 @@ public class IndelLengthHistogram extends VariantEvaluator implements StandardEv
public TreeMap<Object, Object> results;
public final static int MAX_SIZE_FOR_HISTOGRAM = 10;
private final static boolean INCLUDE_LONG_EVENTS_AT_MAX_SIZE = false;
public IndelLengthHistogram() {
initializeCounts(MAX_SIZE_FOR_HISTOGRAM);
@ -85,10 +86,13 @@ public class IndelLengthHistogram extends VariantEvaluator implements StandardEv
@Override
public void update1(final VariantContext eval, final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
if ( eval.isIndel() && ! eval.isComplexIndel() ) {
for ( Allele alt : eval.getAlternateAlleles() ) {
final int alleleSize = alt.length() - eval.getReference().length();
if ( alleleSize == 0 ) throw new ReviewedStingException("Allele size not expected to be zero for indel: alt = " + alt + " ref = " + eval.getReference());
updateLengthHistogram(eval.getReference(), alt);
if ( ! ( getWalker().ignoreAC0Sites() && eval.isMonomorphicInSamples() )) {
// only if we are actually polymorphic in the subsetted samples should we count the allele
for ( Allele alt : eval.getAlternateAlleles() ) {
final int alleleSize = alt.length() - eval.getReference().length();
if ( alleleSize == 0 ) throw new ReviewedStingException("Allele size not expected to be zero for indel: alt = " + alt + " ref = " + eval.getReference());
updateLengthHistogram(eval.getReference(), alt);
}
}
}
}
@ -96,16 +100,22 @@ public class IndelLengthHistogram extends VariantEvaluator implements StandardEv
/**
 * Update the histogram with the implied length of the indel allele between ref and alt (alt.len - ref.len).
 *
 * If this size is outside of MAX_SIZE_FOR_HISTOGRAM, the size is capped to MAX_SIZE_FOR_HISTOGRAM
 * when INCLUDE_LONG_EVENTS_AT_MAX_SIZE is set; otherwise such events are skipped entirely.
 *
 * @param ref the reference allele
 * @param alt the alternate allele
 */
public void updateLengthHistogram(final Allele ref, final Allele alt) {
    int len = alt.length() - ref.length();
    if ( INCLUDE_LONG_EVENTS_AT_MAX_SIZE ) {
        // cap extreme events at +/- MAX_SIZE_FOR_HISTOGRAM instead of dropping them
        if ( len > MAX_SIZE_FOR_HISTOGRAM ) len = MAX_SIZE_FOR_HISTOGRAM;
        if ( len < -MAX_SIZE_FOR_HISTOGRAM ) len = -MAX_SIZE_FOR_HISTOGRAM;
    }
    // without capping, events longer than the histogram range are not recorded at all
    if ( Math.abs(len) > MAX_SIZE_FOR_HISTOGRAM )
        return;
    nIndels++;
    counts.put(len, counts.get(len) + 1);
}

View File

@ -134,7 +134,7 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
@Override public int getComparisonOrder() { return 2; }
public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( eval == null || eval.isMonomorphicInSamples() )
if ( eval == null || (getWalker().ignoreAC0Sites() && eval.isMonomorphicInSamples()) )
return;
// update counts

View File

@ -92,7 +92,7 @@ public class MultiallelicSummary extends VariantEvaluator implements StandardEva
@Override public int getComparisonOrder() { return 2; }
public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( eval == null || eval.isMonomorphicInSamples() )
if ( eval == null || (getWalker().ignoreAC0Sites() && eval.isMonomorphicInSamples()) )
return;
// update counts

View File

@ -33,7 +33,7 @@ public class ThetaVariantEvaluator extends VariantEvaluator {
}
public void update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if (vc == null || !vc.isSNP() || !vc.hasGenotypes() || vc.isMonomorphicInSamples()) {
if (vc == null || !vc.isSNP() || (getWalker().ignoreAC0Sites() && vc.isMonomorphicInSamples())) {
return;
}

View File

@ -207,7 +207,8 @@ public class VariantSummary extends VariantEvaluator implements StandardEval {
}
public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( eval == null || eval.isMonomorphicInSamples() ) return;
if ( eval == null || (getWalker().ignoreAC0Sites() && eval.isMonomorphicInSamples()) )
return;
final Type type = getType(eval);

View File

@ -31,6 +31,7 @@ import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.*;
@ -41,6 +42,8 @@ public class Haplotype {
private GenomeLoc genomeLocation = null;
private HashMap<String, double[]> readLikelihoodsPerSample = null;
private boolean isRef = false;
private Cigar cigar;
private int alignmentStartHapwrtRef;
/**
* Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual
@ -112,11 +115,7 @@ public class Haplotype {
@Override
public String toString() {
    // NOTE(review): new String(byte[]) decodes with the platform default charset;
    // equivalent to the old per-byte char cast for ASCII bases — confirm bases are ASCII.
    return new String(bases);
}
public double[] getQuals() {
@ -134,11 +133,27 @@ public class Haplotype {
return genomeLocation.getStop();
}
@Requires({"refInsertLocation >= 0", "hapStartInRefCoords >= 0"})
public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation, final int hapStartInRefCoords, final Cigar haplotypeCigar ) {
/** @return the alignment start of this haplotype with respect to the reference, as set via setAlignmentStartHapwrtRef */
public int getAlignmentStartHapwrtRef() {
return alignmentStartHapwrtRef;
}

public void setAlignmentStartHapwrtRef( final int alignmentStartHapwrtRef ) {
this.alignmentStartHapwrtRef = alignmentStartHapwrtRef;
}

/** @return the cigar describing this haplotype's alignment, as set via setCigar */
public Cigar getCigar() {
return cigar;
}

public void setCigar( final Cigar cigar ) {
this.cigar = cigar;
}
@Requires({"refInsertLocation >= 0"})
public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation ) {
if( refAllele.length() != altAllele.length() ) { refInsertLocation++; }
int haplotypeInsertLocation = getHaplotypeCoordinateForReferenceCoordinate(hapStartInRefCoords, haplotypeCigar, refInsertLocation);
int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true);
if( haplotypeInsertLocation == -1 ) { // desired change falls inside deletion so don't bother creating a new haplotype
return bases.clone();
}
@ -233,85 +248,4 @@ public class Haplotype {
return haplotypeMap;
}
// BUGBUG: copied from ReadClipper and slightly modified since we don't have the data in a GATKSAMRecord
/**
 * Walk the haplotype's cigar to translate a reference coordinate into a haplotype (read-space) coordinate.
 *
 * @param haplotypeStart reference coordinate at which the haplotype's alignment starts
 * @param haplotypeCigar cigar describing the haplotype's alignment to the reference
 * @param refCoord       reference coordinate to translate
 * @return the haplotype coordinate corresponding to refCoord, or -1 if refCoord falls inside
 *         a deletion or cannot be reached with this cigar
 */
private static Integer getHaplotypeCoordinateForReferenceCoordinate( final int haplotypeStart, final Cigar haplotypeCigar, final int refCoord ) {
    int readBases = 0;
    int refBases = 0;
    boolean fallsInsideDeletion = false;

    int goal = refCoord - haplotypeStart; // The goal is to move this many reference bases
    boolean goalReached = refBases == goal;

    Iterator<CigarElement> cigarElementIterator = haplotypeCigar.getCigarElements().iterator();
    while (!goalReached && cigarElementIterator.hasNext()) {
        CigarElement cigarElement = cigarElementIterator.next();
        int shift = 0;

        // soft clips are counted against the reference goal here even though they consume no aligned reference bases
        if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) {
            if (refBases + cigarElement.getLength() < goal)
                shift = cigarElement.getLength();
            else
                shift = goal - refBases;

            refBases += shift;
        }
        goalReached = refBases == goal;

        if (!goalReached && cigarElement.getOperator().consumesReadBases())
            readBases += cigarElement.getLength();

        if (goalReached) {
            // Is this base's reference position within this cigar element? Or did we use it all?
            boolean endsWithinCigar = shift < cigarElement.getLength();

            // If it isn't, we need to check the next one. There should *ALWAYS* be a next one
            // since we checked if the goal coordinate is within the read length, so this is just a sanity check.
            if (!endsWithinCigar && !cigarElementIterator.hasNext())
                return -1;

            CigarElement nextCigarElement;

            // if we end inside the current cigar element, we just have to check if it is a deletion
            if (endsWithinCigar)
                fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION;

            // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion.
            else {
                nextCigarElement = cigarElementIterator.next();

                // if it's an insertion, we need to clip the whole insertion before looking at the next element
                if (nextCigarElement.getOperator() == CigarOperator.INSERTION) {
                    readBases += nextCigarElement.getLength();
                    if (!cigarElementIterator.hasNext())
                        return -1;

                    nextCigarElement = cigarElementIterator.next();
                }

                // if it's a deletion, we will pass the information on to be handled downstream.
                fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION;
            }

            // If we reached our goal outside a deletion, add the shift
            if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases())
                readBases += shift;

            // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need
            // to add the shift of the current cigar element but go back to its last element to return the last
            // base before the deletion (see warning in function contracts)
            else if (fallsInsideDeletion && !endsWithinCigar)
                readBases += shift - 1;

            // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion
            else if (fallsInsideDeletion && endsWithinCigar)
                readBases--;
        }
    }

    if (!goalReached)
        return -1;

    // a reference position inside a deletion has no corresponding haplotype base; signal with -1
    return (fallsInsideDeletion ? -1 : readBases);
}
}

View File

@ -49,6 +49,7 @@ public class ReadUtils {
}
private static int DEFAULT_ADAPTOR_SIZE = 100;
public static int CLIPPING_GOAL_NOT_REACHED = -1;
/**
* A marker to tell which end of the read has been clipped
@ -362,7 +363,11 @@ public class ReadUtils {
@Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd() || (read.getUnclippedEnd() < read.getUnclippedStart())"})
@Ensures({"result >= 0", "result < read.getReadLength()"})
public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord, ClippingTail tail) {
Pair<Integer, Boolean> result = getReadCoordinateForReferenceCoordinate(read, refCoord);
return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, tail, false);
}
public static int getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final ClippingTail tail, final boolean allowGoalNotReached) {
Pair<Integer, Boolean> result = getReadCoordinateForReferenceCoordinate(alignmentStart, cigar, refCoord, allowGoalNotReached);
int readCoord = result.getFirst();
// Corner case one: clipping the right tail and falls on deletion, move to the next
@ -374,9 +379,9 @@ public class ReadUtils {
// clipping the left tail and first base is insertion, go to the next read coordinate
// with the same reference coordinate. Advance to the next cigar element, or to the
// end of the read if there is no next element.
Pair<Boolean, CigarElement> firstElementIsInsertion = readStartsWithInsertion(read);
Pair<Boolean, CigarElement> firstElementIsInsertion = readStartsWithInsertion(cigar);
if (readCoord == 0 && tail == ClippingTail.LEFT_TAIL && firstElementIsInsertion.getFirst())
readCoord = Math.min(firstElementIsInsertion.getSecond().getLength(), read.getReadLength() - 1);
readCoord = Math.min(firstElementIsInsertion.getSecond().getLength(), cigar.getReadLength() - 1);
return readCoord;
}
@ -400,14 +405,25 @@ public class ReadUtils {
@Requires({"refCoord >= read.getSoftStart()", "refCoord <= read.getSoftEnd()"})
@Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"})
public static Pair<Integer, Boolean> getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord) {
return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, false);
}
public static Pair<Integer, Boolean> getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final boolean allowGoalNotReached) {
int readBases = 0;
int refBases = 0;
boolean fallsInsideDeletion = false;
int goal = refCoord - read.getSoftStart(); // The goal is to move this many reference bases
int goal = refCoord - alignmentStart; // The goal is to move this many reference bases
if (goal < 0) {
if (allowGoalNotReached) {
return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false);
} else {
throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?");
}
}
boolean goalReached = refBases == goal;
Iterator<CigarElement> cigarElementIterator = read.getCigar().getCigarElements().iterator();
Iterator<CigarElement> cigarElementIterator = cigar.getCigarElements().iterator();
while (!goalReached && cigarElementIterator.hasNext()) {
CigarElement cigarElement = cigarElementIterator.next();
int shift = 0;
@ -431,8 +447,13 @@ public class ReadUtils {
// If it isn't, we need to check the next one. There should *ALWAYS* be a next one
// since we checked if the goal coordinate is within the read length, so this is just a sanity check.
if (!endsWithinCigar && !cigarElementIterator.hasNext())
throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio");
if (!endsWithinCigar && !cigarElementIterator.hasNext()) {
if (allowGoalNotReached) {
return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false);
} else {
throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio");
}
}
CigarElement nextCigarElement;
@ -447,8 +468,13 @@ public class ReadUtils {
// if it's an insertion, we need to clip the whole insertion before looking at the next element
if (nextCigarElement.getOperator() == CigarOperator.INSERTION) {
readBases += nextCigarElement.getLength();
if (!cigarElementIterator.hasNext())
throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio");
if (!cigarElementIterator.hasNext()) {
if (allowGoalNotReached) {
return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false);
} else {
throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio");
}
}
nextCigarElement = cigarElementIterator.next();
}
@ -473,8 +499,13 @@ public class ReadUtils {
}
}
if (!goalReached)
throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?");
if (!goalReached) {
if (allowGoalNotReached) {
return new Pair<Integer, Boolean>(CLIPPING_GOAL_NOT_REACHED, false);
} else {
throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?");
}
}
return new Pair<Integer, Boolean>(readBases, fallsInsideDeletion);
}
@ -527,7 +558,11 @@ public class ReadUtils {
* @return A pair with the answer (true/false) and the element or null if it doesn't exist
*/
public static Pair<Boolean, CigarElement> readStartsWithInsertion(GATKSAMRecord read) {
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
return readStartsWithInsertion(read.getCigar());
}
public static Pair<Boolean, CigarElement> readStartsWithInsertion(final Cigar cigar) {
for (CigarElement cigarElement : cigar.getCigarElements()) {
if (cigarElement.getOperator() == CigarOperator.INSERTION)
return new Pair<Boolean, CigarElement>(true, cigarElement);

View File

@ -488,7 +488,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
"-o %s"
),
1,
Arrays.asList("7c01565531cf82c8c03cf042903b96cf")
Arrays.asList("41a37636868a838a632559949c5216cf")
);
executeTest("testModernVCFWithLargeIndels", spec);
}
@ -506,4 +506,21 @@ public class VariantEvalIntegrationTest extends WalkerTest {
UserException.class);
executeTest("testIncompatibleEvalAndStrat", spec);
}
/**
 * Shared driver for the AC == 0 handling tests: runs VariantEval over ac0.vcf,
 * optionally passing -keepAC0, and checks the VariantSummary output md5.
 */
public void testIncludingAC0(boolean includeAC0, final String md5) {
    // append -keepAC0 only when AC == 0 sites should be retained
    final String keepAC0Arg = includeAC0 ? " -keepAC0" : "";
    final WalkerTestSpec testSpec = new WalkerTestSpec(
            buildCommandLine(
                    "-T VariantEval",
                    "-R " + b37KGReference,
                    "-eval " + testDir + "/ac0.vcf",
                    "-L 20:81006 -noST -noEV -EV VariantSummary -o %s" + keepAC0Arg
            ),
            1,
            Arrays.asList(md5));
    executeTest("testIncludingAC0 keep ac 0 = " + includeAC0, testSpec);
}
// Same input with and without -keepAC0; the expected md5s differ accordingly.
@Test public void testWithAC0() { testIncludingAC0(true, "0ed2c8e4b4e06973a06838bc930a132d"); }
@Test public void testWithoutAC0() { testIncludingAC0(false, "79d28ddd0ab9584776b6cbefe48331df"); }
}

View File

@ -152,7 +152,9 @@ public class HaplotypeUnitTest extends BaseTest {
final Haplotype h = new Haplotype(hap.getBytes());
final Allele h1refAllele = Allele.create(ref, true);
final Allele h1altAllele = Allele.create(alt, false);
final Haplotype h1 = new Haplotype( h.insertAllele(h1refAllele, h1altAllele, loc - INDEL_PADDING_BASE, 0, cigar) );
h.setAlignmentStartHapwrtRef(0);
h.setCigar(cigar);
final Haplotype h1 = new Haplotype( h.insertAllele(h1refAllele, h1altAllele, loc - INDEL_PADDING_BASE) );
final Haplotype h1expected = new Haplotype(newHap.getBytes());
Assert.assertEquals(h1, h1expected);
}

116
public/testdata/ac0.vcf vendored 100644

File diff suppressed because one or more lines are too long