Vast improvements to AssessNA12878 code and functionality
-- AssessNA12878 now breaks out multi-allelics into bi-allelic components. This means that we can properly assess multi-allelic calls against the bi-allelic KB -- Refactor AssessNA12878, moving into assess package in KB. Split out previously private classes in the walker itself into separate classes. Added real docs for all of the classes. -- Vastly expand (from 0) unit tests for NA12878 assessments -- Allow sites only VCs to be evaluated by Assessor -- Move utility for creating simple VCs from a list of string alleles from GATKVariantContextUtilsUnitTest to GATKVariantContextUtils -- Assessor bugfix for discordant records at a site. Previous version didn't handle properly the case where one had a non-matching call in the callset w.r.t. the KB, so that the KB element was eaten during the analysis. Fixed. UnitTested -- See GSA-781 -- Handle multi-allelic variants in KB for more information -- Bugfix for missing site counting in AssessNA12878. Previous version would count N misses for every missed value at a site. Not that this has much impact but it's worth fixing -- UnitTests for BadSitesWriter -- UnitTests for filtered and filtering sites in the Assessor -- Cleanup end report generation code (simply the code). Note that instead of "indel" the new code will print out "INDELS" -- Assessor DoC calculations now us LIBS and RBPs for the depth calculation. The previous version was broken for reduced reads. Added unit test that reads a complex reduced read example and matches the DoC of this BAM with the output of the GATK DoC tool here. -- Added convenience constructor for LIBS using just SAMFileReader and an iterator. It's now easy to create a LIBS from a BAM at a locus. Added advanceToLocus function that moves the LIBS to a specific position. UnitTested via the assessor (which isn't ideal, but is a proper test)
This commit is contained in:
parent
29319bf222
commit
8ac6d3521f
|
|
@ -28,13 +28,18 @@ package org.broadinstitute.sting.utils.locusiterator;
|
||||||
import com.google.java.contract.Ensures;
|
import com.google.java.contract.Ensures;
|
||||||
import com.google.java.contract.Requires;
|
import com.google.java.contract.Requires;
|
||||||
import net.sf.samtools.CigarOperator;
|
import net.sf.samtools.CigarOperator;
|
||||||
|
import net.sf.samtools.SAMFileReader;
|
||||||
|
import net.sf.samtools.SAMRecordIterator;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||||
|
import org.broadinstitute.sting.gatk.iterators.GATKSAMIterator;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
import org.broadinstitute.sting.utils.pileup.*;
|
import org.broadinstitute.sting.utils.SampleUtils;
|
||||||
|
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||||
|
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||||
|
|
||||||
|
|
@ -136,6 +141,25 @@ public final class LocusIteratorByState extends LocusIterator {
|
||||||
readInformation.keepUniqueReadListInLIBS());
|
readInformation.keepUniqueReadListInLIBS());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new LocusIteratorByState based on a SAMFileReader using reads in an iterator it
|
||||||
|
*
|
||||||
|
* Simple constructor that uses the samples in the reader, doesn't do any downsampling,
|
||||||
|
* and makes a new GenomeLocParser using the reader. This constructor will be slow(ish)
|
||||||
|
* if you continually invoke this constructor, but it's easy to make.
|
||||||
|
*
|
||||||
|
* @param reader a non-null reader
|
||||||
|
* @param it an iterator from reader that has the reads we want to use to create ReadBackPileups
|
||||||
|
*/
|
||||||
|
public LocusIteratorByState(final SAMFileReader reader, final SAMRecordIterator it) {
|
||||||
|
this(new GATKSAMIterator(it),
|
||||||
|
new LIBSDownsamplingInfo(false, 0),
|
||||||
|
true,
|
||||||
|
new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()),
|
||||||
|
SampleUtils.getSAMFileSamples(reader.getFileHeader()),
|
||||||
|
false);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new LocusIteratorByState
|
* Create a new LocusIteratorByState
|
||||||
*
|
*
|
||||||
|
|
@ -149,7 +173,8 @@ public final class LocusIteratorByState extends LocusIterator {
|
||||||
* be mapped to this null sample
|
* be mapped to this null sample
|
||||||
* @param maintainUniqueReadsList if true, we will keep the unique reads from off the samIterator and make them
|
* @param maintainUniqueReadsList if true, we will keep the unique reads from off the samIterator and make them
|
||||||
* available via the transferReadsFromAllPreviousPileups interface
|
* available via the transferReadsFromAllPreviousPileups interface
|
||||||
*/ protected LocusIteratorByState(final Iterator<GATKSAMRecord> samIterator,
|
*/
|
||||||
|
protected LocusIteratorByState(final Iterator<GATKSAMRecord> samIterator,
|
||||||
final LIBSDownsamplingInfo downsamplingInfo,
|
final LIBSDownsamplingInfo downsamplingInfo,
|
||||||
final boolean includeReadsWithDeletionAtLoci,
|
final boolean includeReadsWithDeletionAtLoci,
|
||||||
final GenomeLocParser genomeLocParser,
|
final GenomeLocParser genomeLocParser,
|
||||||
|
|
@ -221,6 +246,29 @@ public final class LocusIteratorByState extends LocusIterator {
|
||||||
return currentAlignmentContext;
|
return currentAlignmentContext;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move this LIBS until we are over position
|
||||||
|
*
|
||||||
|
* Will return null if cannot reach position (because we run out of data in the locus)
|
||||||
|
*
|
||||||
|
* @param position the start position of the AlignmentContext we want back
|
||||||
|
* @return a AlignmentContext at position, or null if this isn't possible
|
||||||
|
*/
|
||||||
|
public AlignmentContext advanceToLocus(final int position) {
|
||||||
|
while ( hasNext() ) {
|
||||||
|
final AlignmentContext context = next();
|
||||||
|
|
||||||
|
if ( context == null )
|
||||||
|
// we ran out of data
|
||||||
|
return null;
|
||||||
|
|
||||||
|
if ( context.getPosition() == position)
|
||||||
|
return context;
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates the next alignment context from the given state. Note that this is implemented as a
|
* Creates the next alignment context from the given state. Note that this is implemented as a
|
||||||
* lazy load method. nextAlignmentContext MUST BE null in order for this method to advance to the
|
* lazy load method. nextAlignmentContext MUST BE null in order for this method to advance to the
|
||||||
|
|
|
||||||
|
|
@ -589,6 +589,8 @@ public class GATKVariantContextUtils {
|
||||||
* simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use
|
* simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use
|
||||||
* SampleUtils.verifyUniqueSamplesNames to check that before using sempleMerge.
|
* SampleUtils.verifyUniqueSamplesNames to check that before using sempleMerge.
|
||||||
*
|
*
|
||||||
|
* For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/
|
||||||
|
*
|
||||||
* @param unsortedVCs collection of unsorted VCs
|
* @param unsortedVCs collection of unsorted VCs
|
||||||
* @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
|
* @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
|
||||||
* @param filteredRecordMergeType merge type for filtered records
|
* @param filteredRecordMergeType merge type for filtered records
|
||||||
|
|
@ -1292,4 +1294,29 @@ public class GATKVariantContextUtils {
|
||||||
return Integer.valueOf(getIndex(vc1)).compareTo(getIndex(vc2));
|
return Integer.valueOf(getIndex(vc1)).compareTo(getIndex(vc2));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For testing purposes only. Create a site-only VariantContext at contig:start containing alleles
|
||||||
|
*
|
||||||
|
* @param name the name of the VC
|
||||||
|
* @param contig the contig for the VC
|
||||||
|
* @param start the start of the VC
|
||||||
|
* @param alleleStrings a non-null, non-empty list of strings for the alleles. The first will be the ref allele, and others the
|
||||||
|
* alt. Will compute the stop of the VC from the length of the reference allele
|
||||||
|
* @return a non-null VariantContext
|
||||||
|
*/
|
||||||
|
public static VariantContext makeFromAlleles(final String name, final String contig, final int start, final List<String> alleleStrings) {
|
||||||
|
if ( alleleStrings == null || alleleStrings.isEmpty() )
|
||||||
|
throw new IllegalArgumentException("alleleStrings must be non-empty, non-null list");
|
||||||
|
|
||||||
|
final List<Allele> alleles = new LinkedList<Allele>();
|
||||||
|
final int length = alleleStrings.get(0).length();
|
||||||
|
|
||||||
|
boolean first = true;
|
||||||
|
for ( final String alleleString : alleleStrings ) {
|
||||||
|
alleles.add(Allele.create(alleleString, first));
|
||||||
|
first = false;
|
||||||
|
}
|
||||||
|
return new VariantContextBuilder(name, contig, start, start+length-1, alleles).make();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2012 The Broad Institute
|
* Copyright (c) 2012 The Broad Institute
|
||||||
*
|
*
|
||||||
* Permission is hereby granted, free of charge, to any person
|
* Permission is hereby granted, free of charge, to any person
|
||||||
* obtaining a copy of this software and associated documentation
|
* obtaining a copy of this software and associated documentation
|
||||||
* files (the "Software"), to deal in the Software without
|
* files (the "Software"), to deal in the Software without
|
||||||
|
|
@ -9,10 +9,10 @@
|
||||||
* copies of the Software, and to permit persons to whom the
|
* copies of the Software, and to permit persons to whom the
|
||||||
* Software is furnished to do so, subject to the following
|
* Software is furnished to do so, subject to the following
|
||||||
* conditions:
|
* conditions:
|
||||||
*
|
*
|
||||||
* The above copyright notice and this permission notice shall be
|
* The above copyright notice and this permission notice shall be
|
||||||
* included in all copies or substantial portions of the Software.
|
* included in all copies or substantial portions of the Software.
|
||||||
*
|
*
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
|
@ -966,21 +966,12 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
|
||||||
|
|
||||||
@Test(dataProvider = "ClipAlleleTest")
|
@Test(dataProvider = "ClipAlleleTest")
|
||||||
public void testClipAlleles(final List<String> alleleStrings, final List<String> expected, final int numLeftClipped) {
|
public void testClipAlleles(final List<String> alleleStrings, final List<String> expected, final int numLeftClipped) {
|
||||||
final List<Allele> alleles = new LinkedList<Allele>();
|
|
||||||
final int length = alleleStrings.get(0).length();
|
|
||||||
boolean first = true;
|
|
||||||
for ( final String alleleString : alleleStrings ) {
|
|
||||||
alleles.add(Allele.create(alleleString, first));
|
|
||||||
first = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
final int start = 10;
|
final int start = 10;
|
||||||
final VariantContextBuilder builder = new VariantContextBuilder("test", "20", start, start+length-1, alleles);
|
final VariantContext unclipped = GATKVariantContextUtils.makeFromAlleles("test", "20", start, alleleStrings);
|
||||||
final VariantContext unclipped = builder.make();
|
|
||||||
final VariantContext clipped = GATKVariantContextUtils.trimAlleles(unclipped, true, true);
|
final VariantContext clipped = GATKVariantContextUtils.trimAlleles(unclipped, true, true);
|
||||||
|
|
||||||
Assert.assertEquals(clipped.getStart(), unclipped.getStart() + numLeftClipped);
|
Assert.assertEquals(clipped.getStart(), unclipped.getStart() + numLeftClipped);
|
||||||
for ( int i = 0; i < alleles.size(); i++ ) {
|
for ( int i = 0; i < unclipped.getAlleles().size(); i++ ) {
|
||||||
final Allele trimmed = clipped.getAlleles().get(i);
|
final Allele trimmed = clipped.getAlleles().get(i);
|
||||||
Assert.assertEquals(trimmed.getBaseString(), expected.get(i));
|
Assert.assertEquals(trimmed.getBaseString(), expected.get(i));
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue