Merge branch 'master' of ssh://gsa1/humgen/gsa-scr1/gsa-engineering/git/unstable

commit 65c614fb4b
@@ -215,7 +215,7 @@
    <target name="git.describe">
        <exec executable="git" outputproperty="git.describe.output" resultproperty="git.describe.exit.value" failonerror="false">
-           <arg line="describe" />
+           <arg line="describe --long" />
        </exec>
        <condition property="git.describe.succeeded">
            <equals arg1="${git.describe.exit.value}" arg2="0" />

ivy.xml
@@ -76,7 +76,7 @@
    <dependency org="org.apache.poi" name="poi-ooxml" rev="3.8-beta3" />

    <!-- snpEff annotator for pipelines -->
-   <dependency org="net.sf.snpeff" name="snpeff" rev="2.0.4rc3" />
+   <dependency org="net.sf.snpeff" name="snpeff" rev="2.0.5" />

    <!-- Exclude dependencies on sun libraries where the downloads aren't available but included in the jvm. -->
    <exclude org="javax.servlet" />
@@ -443,7 +443,7 @@ public class GenomeAnalysisEngine {
        if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
            throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available.");

-       if(walker instanceof LocusWalker) {
+       if(walker instanceof LocusWalker || walker instanceof ActiveRegionWalker) {
            if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
                throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
            if(intervals == null)
@@ -11,7 +11,6 @@ import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor;

import java.util.Collection;
@@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
import org.broadinstitute.sting.gatk.io.OutputTracker;
+import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.SampleUtils;

@@ -55,7 +56,6 @@ public class LinearMicroScheduler extends MicroScheduler {

        traversalEngine.startTimersIfNecessary();
        if(shard.getShardType() == Shard.ShardType.LOCUS) {
            LocusWalker lWalker = (LocusWalker)walker;
            WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(),
                    getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine));
            for(WindowMaker.WindowMakerIterator iterator: windowMaker) {

@@ -77,6 +77,12 @@ public class LinearMicroScheduler extends MicroScheduler {
            done = walker.isDone();
        }

+       // Special function call to empty out the work queue. Ugly for now but will be cleaned up when we push this functionality more into the engine
+       if( traversalEngine instanceof TraverseActiveRegions ) {
+           final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit());
+           accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator
+       }
+
        Object result = accumulator.finishTraversal();

        printOnTraversalDone(result);

@@ -128,6 +128,8 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
            traversalEngine = new TraverseDuplicates();
        } else if (walker instanceof ReadPairWalker) {
            traversalEngine = new TraverseReadPairs();
+       } else if (walker instanceof ActiveRegionWalker) {
+           traversalEngine = new TraverseActiveRegions();
        } else {
            throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
        }
@@ -0,0 +1,213 @@
package org.broadinstitute.sting.gatk.traversals;

import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.WalkerManager;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;

import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.Queue;

/**
 * Created by IntelliJ IDEA.
 * User: rpoplin
 * Date: 12/9/11
 */

public class TraverseActiveRegions<M,T> extends TraversalEngine<M,T,ActiveRegionWalker<M,T>,LocusShardDataProvider> {
    /**
     * our log, which we want to capture anything from this class
     */
    protected static Logger logger = Logger.getLogger(TraversalEngine.class);

    private final Queue<ActiveRegion> workQueue = new LinkedList<ActiveRegion>();
    private final LinkedHashSet<SAMRecord> myReads = new LinkedHashSet<SAMRecord>();

    @Override
    protected String getTraversalType() {
        return "active regions";
    }

    @Override
    public T traverse( final ActiveRegionWalker<M,T> walker,
                       final LocusShardDataProvider dataProvider,
                       T sum) {
        logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));

        LocusView locusView = getLocusView( walker, dataProvider );

        int minStart = Integer.MAX_VALUE;
        final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );

        if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all

            final ArrayList<ActiveRegion> isActiveList = new ArrayList<ActiveRegion>();

            //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
            ReferenceOrderedView referenceOrderedDataView = null;
            if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
                referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider );
            else
                referenceOrderedDataView = (RodLocusView)locusView;

            // We keep processing while the next reference location is within the interval
            while( locusView.hasNext() ) {
                final AlignmentContext locus = locusView.next();
                GenomeLoc location = locus.getLocation();

                dataProvider.getShard().getReadMetrics().incrementNumIterations();

                if ( locus.hasExtendedEventPileup() ) {
                    // if the alignment context we received holds an "extended" pileup (i.e. a pileup of insertions/deletions
                    // associated with the current site), we need to update the location. The updated location still starts
                    // at the current genomic position, but it has to span the length of the longest deletion (if any).
                    location = engine.getGenomeLocParser().setStop(location,location.getStop()+locus.getExtendedEventPileup().getMaxDeletionLength());

                    // it is possible that the new expanded location spans the current shard boundary; the next method ensures
                    // that when that is the case, the reference sequence held by the ReferenceView will be reloaded so that
                    // the view has all the bases we are going to need. If the location fits within the current view bounds,
                    // the next call will not do anything to the view:
                    referenceView.expandBoundsToAccomodateLoc(location);
                }

                // create the reference context. Note that if we have a pileup of "extended events", the context will
                // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
                final ReferenceContext refContext = referenceView.getReferenceContext(location);

                // Iterate forward to get all reference ordered data covering this location
                final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);

                // Call the walker's isActive function for this locus and add the result to the list to be integrated later
                final boolean isActive = walker.isActive( tracker, refContext, locus );
                isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser()) );

                // Grab all the previously unseen reads from this pileup and add them to the massive read list
                for( final PileupElement p : locus.getBasePileup() ) {
                    final SAMRecord read = p.getRead();
                    if( !myReads.contains(read) ) {
                        myReads.add(read);
                    }
                }

                // If this is the last pileup for this shard then we need to calculate the minimum alignment start so that
                // we know which active regions in the work queue are now safe to process
                if( !locusView.hasNext() ) {
                    for( final PileupElement p : locus.getBasePileup() ) {
                        final SAMRecord read = p.getRead();
                        if( read.getAlignmentStart() < minStart ) { minStart = read.getAlignmentStart(); }
                    }
                }
                printProgress(dataProvider.getShard(),locus.getLocation());
            }

            // Take the individual isActive calls and integrate them into contiguous active regions and
            // add these blocks of work to the work queue
            final ArrayList<ActiveRegion> activeRegions = integrateActiveList( isActiveList );
            logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." );
            workQueue.addAll( activeRegions );
        }

        while( workQueue.peek().getLocation().getStop() < minStart ) {
            final ActiveRegion activeRegion = workQueue.remove();
            sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
        }

        return sum;
    }
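The drain loop at the end of traverse relies on an ordering invariant: a queued region may be mapped over only once no not-yet-seen read can still overlap it, which is why minStart is computed from the shard's final pileup. A minimal sketch of that predicate (helper name invented for illustration; note that the while condition also assumes the queue is non-empty, since workQueue.peek() is dereferenced directly):

    // Sketch only: a region is safe to process once every read that could
    // overlap it has already been collected into myReads.
    private boolean safeToProcess(final ActiveRegion region, final int minStart) {
        return region.getLocation().getStop() < minStart;
    }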
    // Special function called in LinearMicroScheduler to empty out the work queue. Ugly for now but will be cleaned up when we push this functionality more into the engine
    public T endTraversal( final Walker<M,T> walker, T sum) {
        while( workQueue.peek() != null ) {
            final ActiveRegion activeRegion = workQueue.remove();
            sum = processActiveRegion( activeRegion, myReads, workQueue, sum, (ActiveRegionWalker<M,T>) walker );
        }

        return sum;
    }

    private T processActiveRegion( final ActiveRegion activeRegion, final LinkedHashSet<SAMRecord> reads, final Queue<ActiveRegion> workQueue, final T sum, final ActiveRegionWalker<M,T> walker ) {
        final ArrayList<SAMRecord> placedReads = new ArrayList<SAMRecord>();
        for( final SAMRecord read : reads ) {
            final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );
            if( activeRegion.getLocation().overlapsP( readLoc ) ) {
                // The region with the highest amount of overlap is chosen as the primary region for the read (ties are broken by taking the right-most region)
                long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc );
                ActiveRegion bestRegion = activeRegion;
                for( final ActiveRegion otherRegionToTest : workQueue ) {
                    if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) {
                        maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap(readLoc);
                        bestRegion = otherRegionToTest;
                    }
                }
                bestRegion.add( (GATKSAMRecord) read, true );

                // The read is also added to all other regions it overlaps, but marked as non-primary
                if( !bestRegion.equals(activeRegion) ) {
                    activeRegion.add( (GATKSAMRecord) read, false );
                }
                for( final ActiveRegion otherRegionToTest : workQueue ) {
                    if( !bestRegion.equals(otherRegionToTest) && otherRegionToTest.getLocation().overlapsP( readLoc ) ) {
                        activeRegion.add( (GATKSAMRecord) read, false );
                    }
                }
                placedReads.add( read );
            }
        }
        reads.removeAll( placedReads ); // remove all the reads which have been placed into their active region

        logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLocation());
        final M x = walker.map( activeRegion, null ); // BUGBUG: tracker needs to be filled in and passed to the walker
        return walker.reduce( x, sum );
    }
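To make the placement rule concrete, a short trace with invented coordinates (parser stands in for the engine's GenomeLocParser):

    final GenomeLoc readLoc = parser.createGenomeLoc("1", 100, 150); // the read being placed
    final GenomeLoc regionA = parser.createGenomeLoc("1", 90, 120);  // sizeOfOverlap(readLoc) = 21
    final GenomeLoc regionB = parser.createGenomeLoc("1", 121, 160); // sizeOfOverlap(readLoc) = 30 -> primary
    // regionB wins and receives the read as primary; regionA still gets it as
    // non-primary. On equal overlaps the ">=" comparison keeps the
    // later-tested, i.e. right-most, region.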
    /**
     * Gets the best view of loci for this walker given the available data.
     * @param walker walker to interrogate.
     * @param dataProvider Data with which to drive the locus view.
     * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
     */
    private LocusView getLocusView( Walker<M,T> walker, LocusShardDataProvider dataProvider ) {
        DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
        if( dataSource == DataSource.READS )
            return new CoveredLocusView(dataProvider);
        else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers )
            return new AllLocusView(dataProvider);
        else if( dataSource == DataSource.REFERENCE_ORDERED_DATA )
            return new RodLocusView(dataProvider);
        else
            throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
    }

    // integrate the per-locus active regions into contiguous chunks based on their active status
    private ArrayList<ActiveRegion> integrateActiveList( final ArrayList<ActiveRegion> activeList ) {
        final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
        ActiveRegion prevLocus = activeList.remove(0);
        ActiveRegion startLocus = prevLocus;
        for( final ActiveRegion thisLocus : activeList ) {
            if( prevLocus.isActive != thisLocus.isActive ) {
                returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()),
                        prevLocus.isActive, engine.getGenomeLocParser() ) );
                startLocus = thisLocus;
            }
            prevLocus = thisLocus;
        }
        // output the last region if necessary
        if( startLocus != prevLocus ) {
            returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()),
                    prevLocus.isActive, engine.getGenomeLocParser() ) );
        }
        return returnList;
    }
}
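Tracing integrateActiveList with an invented status sequence makes the flip-based merging easier to follow:

    // Positions 1..5 on one contig, T = isActive true, F = false:
    //   T T F F T
    // emits [1-2, active] at the T->F flip and [3-4, inactive] at the F->T
    // flip. The trailing single-locus run at position 5 has
    // startLocus == prevLocus, so the final check does not emit it.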
@@ -0,0 +1,29 @@
package org.broadinstitute.sting.gatk.walkers;

import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;

/**
 * Created by IntelliJ IDEA.
 * User: rpoplin
 * Date: 12/7/11
 */

@By(DataSource.READS)
@Requires({DataSource.READS, DataSource.REFERENCE_BASES})
@PartitionBy(PartitionType.READ)
public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
    // Do we actually want to operate on the context?
    public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
        return true; // We are keeping all the reads
    }

    // Determine the active status over the AlignmentContext
    public abstract boolean isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context);

    // Map over the ActiveRegion
    public abstract MapType map(final ActiveRegion activeRegion, final ReadMetaDataTracker metaDataTracker);
}
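For orientation, a minimal hypothetical subclass showing how the new contract composes with the standard Walker reduce machinery. The class name, the depth threshold, and the depthOfCoverage() accessor are illustrative assumptions, not part of this commit:

    public class DepthActiveRegionWalker extends ActiveRegionWalker<Integer, Integer> {
        @Override
        public boolean isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
            // hypothetical criterion: mark loci with more than 10 covering bases as active
            return context.getBasePileup().depthOfCoverage() > 10;
        }

        @Override
        public Integer map(final ActiveRegion activeRegion, final ReadMetaDataTracker metaDataTracker) {
            return activeRegion.getReads().size(); // count the reads handed to each region
        }

        @Override
        public Integer reduceInit() { return 0; }

        @Override
        public Integer reduce(final Integer value, final Integer sum) { return value + sum; }
    }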
@@ -56,7 +56,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotation {

    // We refuse to parse SnpEff output files generated by unsupported versions, or
    // lacking a SnpEff version number in the VCF header:
-   public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.4" };
+   public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.5" };
    public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion";
    public static final String SNPEFF_VCF_HEADER_COMMAND_LINE_KEY = "SnpEffCmd";

@@ -204,11 +204,6 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotation {
    }

    public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ) {
-       throw new UserException("SnpEff support is currently disabled in the GATK until SnpEff 2.0.4 is officially released " +
-           "due to a serious issue with SnpEff versions prior to 2.0.4. Please see this page for more details: " +
-           "http://www.broadinstitute.org/gsa/wiki/index.php/Adding_Genomic_Annotations_Using_SnpEff_and_VariantAnnotator");
-
-       /*
        // Make sure that we actually have a valid SnpEff rod binding (just in case the user specified -A SnpEff
        // without providing a SnpEff rod via --snpEffFile):
        validateRodBinding(walker.getSnpEffRodBinding());

@@ -228,7 +223,6 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotation {
        // mistaken in the future for a SnpEff output file:
        headerLines.add(new VCFHeaderLine(OUTPUT_VCF_HEADER_VERSION_LINE_KEY, snpEffVersionLine.getValue()));
        headerLines.add(new VCFHeaderLine(OUTPUT_VCF_HEADER_COMMAND_LINE_KEY, snpEffCommandLine.getValue()));
-       */
    }

    public Map<String, Object> annotate ( RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc ) {
@@ -275,19 +275,22 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
    public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
        byte obsBase = elt.getBase();
        byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
+       if ( qual == 0 )
+           return 0;

        if ( elt.isReducedRead() ) {
            // reduced read representation
            if ( BaseUtils.isRegularBase( obsBase )) {
-               add(obsBase, qual, (byte)0, (byte)0, elt.getRepresentativeCount()); // fast calculation of n identical likelihoods
-               return elt.getRepresentativeCount(); // we added nObs bases here
+               int representativeCount = elt.getRepresentativeCount();
+               add(obsBase, qual, (byte)0, (byte)0, representativeCount); // fast calculation of n identical likelihoods
+               return representativeCount; // we added nObs bases here
            }

            // odd bases or deletions => don't use them
            return 0;
        }

-       return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;
+       return add(obsBase, qual, (byte)0, (byte)0, 1);
    }

    public int add(List<PileupElement> overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {

@@ -519,7 +522,7 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
        if ( qual > SAMUtils.MAX_PHRED_SCORE )
            throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
        if ( capBaseQualsAtMappingQual )
-           qual = (byte)Math.min((int)p.getQual(), p.getMappingQual());
+           qual = (byte)Math.min((int)qual, p.getMappingQual());
        if ( (int)qual < minBaseQual )
            qual = (byte)0;
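The second hunk is a one-word fix that is easy to miss: the cap now applies to the working qual, which the method may already have adjusted, instead of re-reading the raw stored quality. Invented numbers where the two behaviors differ:

    // raw stored qual p.getQual() = 40, working qual already lowered to 15,
    // mapping quality = 20
    // old: (byte) Math.min((int) p.getQual(), p.getMappingQual()) -> 20 (silently raises 15 back up)
    // new: (byte) Math.min((int) qual, p.getMappingQual())        -> 15 (prior adjustment preserved)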
@@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;

@@ -72,25 +73,28 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
        this.logger = logger;
    }

-   /**
-    * Must be overridden by concrete subclasses
-    *
-    * @param tracker              rod data
-    * @param ref                  reference context
-    * @param contexts             stratified alignment contexts
-    * @param contextType          stratified context type
-    * @param priors               priors to use for GLs
-    * @param alternateAlleleToUse the alternate allele to use, null if not set
-    * @param useBAQedPileup       should we use the BAQed pileup or the raw one?
-    * @return variant context where genotypes are no-called but with GLs
-    */
-   public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker,
-                                                 ReferenceContext ref,
-                                                 Map<String, AlignmentContext> contexts,
-                                                 AlignmentContextUtils.ReadOrientation contextType,
-                                                 GenotypePriors priors,
-                                                 Allele alternateAlleleToUse,
-                                                 boolean useBAQedPileup);
+   /**
+    * Can be overridden by concrete subclasses
+    *
+    * @param tracker              rod data
+    * @param ref                  reference context
+    * @param contexts             stratified alignment contexts
+    * @param contextType          stratified context type
+    * @param priors               priors to use for GLs
+    * @param alternateAlleleToUse the alternate allele to use, null if not set
+    * @param useBAQedPileup       should we use the BAQed pileup or the raw one?
+    * @param locParser            Genome Loc Parser
+    * @return variant context where genotypes are no-called but with GLs
+    */
+   public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker,
+                                                 ReferenceContext ref,
+                                                 Map<String, AlignmentContext> contexts,
+                                                 AlignmentContextUtils.ReadOrientation contextType,
+                                                 GenotypePriors priors,
+                                                 Allele alternateAlleleToUse,
+                                                 boolean useBAQedPileup,
+                                                 GenomeLocParser locParser);

    protected int getFilteredDepth(ReadBackedPileup pileup) {
        int count = 0;

@@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
@@ -54,17 +55,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
    private final boolean getAlleleListFromVCF;

    private boolean DEBUG = false;

    private final boolean doMultiAllelicCalls;
    private boolean ignoreSNPAllelesWhenGenotypingIndels = false;

    private final int maxAlternateAlleles;
    private PairHMMIndelErrorModel pairModel;

    private static ThreadLocal<HashMap<PileupElement,LinkedHashMap<Allele,Double>>> indelLikelihoodMap =
            new ThreadLocal<HashMap<PileupElement,LinkedHashMap<Allele,Double>>>() {
                protected synchronized HashMap<PileupElement,LinkedHashMap<Allele,Double>> initialValue() {
                    return new HashMap<PileupElement,LinkedHashMap<Allele,Double>>();
                }
            };

    private LinkedHashMap<Allele,Haplotype> haplotypeMap;
@@ -81,12 +82,14 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
    protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
        super(UAC, logger);
        pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,
-               UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.BANDED_INDEL_COMPUTATION);
+               UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
        alleleList = new ArrayList<Allele>();
        getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
        minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
        HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
        DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
+       maxAlternateAlleles = UAC.MAX_ALTERNATE_ALLELES;
+       doMultiAllelicCalls = UAC.MULTI_ALLELIC;

        haplotypeMap = new LinkedHashMap<Allele,Haplotype>();
        ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
@@ -95,7 +98,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {

    private ArrayList<Allele> computeConsensusAlleles(ReferenceContext ref,
                                                      Map<String, AlignmentContext> contexts,
-                                                     AlignmentContextUtils.ReadOrientation contextType) {
+                                                     AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser) {
        Allele refAllele=null, altAllele=null;
        GenomeLoc loc = ref.getLocus();
        ArrayList<Allele> aList = new ArrayList<Allele>();

@@ -114,7 +117,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {

        if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping)
            return aList;

        for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
            // todo -- warning, can be duplicating expensive partition here
            AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);

@@ -126,9 +129,9 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {

        for ( ExtendedEventPileupElement p : indelPileup.toExtendedIterable() ) {
            //SAMRecord read = p.getRead();
            GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
            if (read == null)
                continue;
            if(ReadUtils.is454Read(read)) {
                continue;
            }
@@ -208,63 +211,69 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
            }
        }

        /* if (DEBUG) {
            int icount = indelPileup.getNumberOfInsertions();
            int dcount = indelPileup.getNumberOfDeletions();
            if (icount + dcount > 0)
            {
                List<Pair<String,Integer>> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases());
                System.out.format("#ins: %d, #del:%d\n", insCount, delCount);

                for (int i=0 ; i < eventStrings.size() ; i++ ) {
                    System.out.format("%s:%d,",eventStrings.get(i).first,eventStrings.get(i).second);
                    // int k=0;
                }
                System.out.println();
            }
        } */
    }

+   Collection<VariantContext> vcs = new ArrayList<VariantContext>();
    int maxAlleleCnt = 0;
    String bestAltAllele = "";

    for (String s : consensusIndelStrings.keySet()) {
-       int curCnt = consensusIndelStrings.get(s);
-       if (curCnt > maxAlleleCnt) {
-           maxAlleleCnt = curCnt;
-           bestAltAllele = s;
+       int curCnt = consensusIndelStrings.get(s), stop = 0;
+       // if the observed count is above the minimum threshold, we will genotype this allele
+       if (curCnt < minIndelCountForGenotyping)
+           continue;
+
+       if (s.startsWith("D")) {
+           // get deletion length
+           int dLen = Integer.valueOf(s.substring(1));
+           // get ref bases of accurate deletion
+           int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart();
+           stop = loc.getStart() + dLen;
+           byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen);
+
+           if (Allele.acceptableAlleleBases(refBases)) {
+               refAllele = Allele.create(refBases,true);
+               altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
+           }
+       }
+       else {
+           // insertion case
+           if (Allele.acceptableAlleleBases(s)) {
+               refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
+               altAllele = Allele.create(s, false);
+               stop = loc.getStart();
+           }
        }
        // if (DEBUG)
        //     System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) );
    } //gdebug-

-   if (maxAlleleCnt < minIndelCountForGenotyping)
-       return aList;
-
-   if (bestAltAllele.startsWith("D")) {
-       // get deletion length
-       int dLen = Integer.valueOf(bestAltAllele.substring(1));
-       // get ref bases of accurate deletion
-       int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart();
+       ArrayList vcAlleles = new ArrayList<Allele>();
+       vcAlleles.add(refAllele);
+       vcAlleles.add(altAllele);

-       //System.out.println(new String(ref.getBases()));
-       byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen);
+       final VariantContextBuilder builder = new VariantContextBuilder().source("");
+       builder.loc(loc.getContig(), loc.getStart(), stop);
+       builder.alleles(vcAlleles);
+       builder.referenceBaseForIndel(ref.getBase());
+       builder.noGenotypes();
+       if (doMultiAllelicCalls)
+           vcs.add(builder.make());
+       else {
+           if (curCnt > maxAlleleCnt) {
+               maxAlleleCnt = curCnt;
+               vcs.clear();
+               vcs.add(builder.make());
+           }
+       }

-       if (Allele.acceptableAlleleBases(refBases)) {
-           refAllele = Allele.create(refBases,true);
-           altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
-       }
-   }
-   else {
-       // insertion case
-       if (Allele.acceptableAlleleBases(bestAltAllele)) {
-           refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
-           altAllele = Allele.create(bestAltAllele, false);
-       }
-   }
-   if (refAllele != null && altAllele != null) {
-       aList.add(0,refAllele);
-       aList.add(1,altAllele);
-   }

+   if (vcs.isEmpty())
+       return aList; // nothing else to do, no alleles passed the minimum count criterion
+
+   VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false);
+
+   aList = new ArrayList<Allele>(mergedVC.getAlleles());

    return aList;

    }
@@ -277,7 +286,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
                                        AlignmentContextUtils.ReadOrientation contextType,
                                        GenotypePriors priors,
                                        Allele alternateAlleleToUse,
-                                       boolean useBAQedPileup) {
+                                       boolean useBAQedPileup, GenomeLocParser locParser) {

        if ( tracker == null )
            return null;
@@ -294,17 +303,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
        haplotypeMap.clear();

        if (getAlleleListFromVCF) {
            for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) {
                if( vc_input != null &&
                        allowableTypes.contains(vc_input.getType()) &&
                        ref.getLocus().getStart() == vc_input.getStart()) {
                    vc = vc_input;
                    break;
                }
            }
            // ignore places where we don't have a variant
            if ( vc == null )
                return null;

            alleleList.clear();
            if (ignoreSNPAllelesWhenGenotypingIndels) {
@@ -323,7 +332,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {

        }
        else {
-           alleleList = computeConsensusAlleles(ref,contexts, contextType);
+           alleleList = computeConsensusAlleles(ref,contexts, contextType, locParser);
            if (alleleList.isEmpty())
                return null;
        }
@@ -340,7 +349,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {

        if (alleleList.isEmpty())
            return null;

        refAllele = alleleList.get(0);
        altAllele = alleleList.get(1);
@@ -30,10 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.utils.BaseUtils;
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.MathUtils;
-import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.StingException;
@@ -46,8 +43,6 @@ import java.util.*;

public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {

-   private static final int MIN_QUAL_SUM_FOR_ALT_ALLELE = 50;
-
    private boolean ALLOW_MULTIPLE_ALLELES;

    private final boolean useAlleleFromVCF;
@@ -56,15 +51,20 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
        super(UAC, logger);
        ALLOW_MULTIPLE_ALLELES = UAC.MULTI_ALLELIC;
        useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
+
+       // make sure the PL cache has been initialized with enough alleles
+       if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null || UnifiedGenotyperEngine.PLIndexToAlleleIndex.length < 4 ) // +1 for 0 alt alleles
+           UnifiedGenotyperEngine.calculatePLcache(3);
    }

-   public VariantContext getLikelihoods(RefMetaDataTracker tracker,
-                                        ReferenceContext ref,
-                                        Map<String, AlignmentContext> contexts,
-                                        AlignmentContextUtils.ReadOrientation contextType,
-                                        GenotypePriors priors,
-                                        Allele alternateAlleleToUse,
-                                        boolean useBAQedPileup) {
+   public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
+                                        final ReferenceContext ref,
+                                        final Map<String, AlignmentContext> contexts,
+                                        final AlignmentContextUtils.ReadOrientation contextType,
+                                        final GenotypePriors priors,
+                                        final Allele alternateAlleleToUse,
+                                        final boolean useBAQedPileup,
+                                        final GenomeLocParser locParser) {

        if ( !(priors instanceof DiploidSNPGenotypePriors) )
            throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model");
@@ -79,6 +79,20 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
        alleles.add(Allele.create(refBase, true));
        final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles);

+       // calculate the GLs
+       ArrayList<SampleGenotypeData> GLs = new ArrayList<SampleGenotypeData>(contexts.size());
+       for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
+           ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
+           if ( useBAQedPileup )
+               pileup = createBAQedPileup( pileup );
+
+           // create the GenotypeLikelihoods object
+           final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error);
+           final int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE);
+           if ( nGoodBases > 0 )
+               GLs.add(new SampleGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup)));
+       }
+
        // find the alternate allele(s) that we should be using
        if ( alternateAlleleToUse != null ) {
            basesToUse[BaseUtils.simpleBaseToBaseIndex(alternateAlleleToUse.getBases()[0])] = true;
@@ -93,7 +107,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
            basesToUse[BaseUtils.simpleBaseToBaseIndex(allele.getBases()[0])] = true;
        } else {
-           determineAlternateAlleles(basesToUse, refBase, contexts, useBAQedPileup);
+           determineAlternateAlleles(basesToUse, refBase, GLs);

            // how many alternate alleles are we using?
            int alleleCounter = Utils.countSetBits(basesToUse);
@@ -125,22 +139,12 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
        builder.alleles(alleles);

        // create the genotypes; no-call everyone for now
-       GenotypesContext genotypes = GenotypesContext.create();
+       final GenotypesContext genotypes = GenotypesContext.create();
        final List<Allele> noCall = new ArrayList<Allele>();
        noCall.add(Allele.NO_CALL);

-       for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
-           ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
-           if ( useBAQedPileup )
-               pileup = createBAQedPileup( pileup );
-
-           // create the GenotypeLikelihoods object
-           final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error);
-           final int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE);
-           if ( nGoodBases == 0 )
-               continue;
-
-           final double[] allLikelihoods = GL.getLikelihoods();
+       for ( SampleGenotypeData sampleData : GLs ) {
+           final double[] allLikelihoods = sampleData.GL.getLikelihoods();
            final double[] myLikelihoods = new double[numLikelihoods];

            int myLikelihoodsIndex = 0;
@@ -151,60 +155,48 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
            }

            // normalize in log space so that max element is zero.
-           GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true));
+           final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true));

-           HashMap<String, Object> attributes = new HashMap<String, Object>();
-           attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup));
+           final HashMap<String, Object> attributes = new HashMap<String, Object>();
+           attributes.put(VCFConstants.DEPTH_KEY, sampleData.depth);
            attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
-           genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
+           genotypes.add(new Genotype(sampleData.name, noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
        }

        return builder.genotypes(genotypes).make();
    }

    // fills in the allelesToUse array
-   protected void determineAlternateAlleles(boolean[] allelesToUse, byte ref, Map<String, AlignmentContext> contexts, boolean useBAQedPileup) {
-       int[] qualCounts = new int[4];
+   protected void determineAlternateAlleles(final boolean[] allelesToUse, final byte ref, final List<SampleGenotypeData> sampleDataList) {

-       for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
-           // calculate the sum of quality scores for each base
-           ReadBackedPileup pileup = useBAQedPileup ? createBAQedPileup( sample.getValue().getBasePileup() ) : sample.getValue().getBasePileup();
-           for ( PileupElement p : pileup ) {
-               // ignore deletions
-               if ( p.isDeletion() || (!p.isReducedRead() && p.getQual() < UAC.MIN_BASE_QUALTY_SCORE) )
-                   continue;
+       final int baseIndexOfRef = BaseUtils.simpleBaseToBaseIndex(ref);
+       final int PLindexOfRef = DiploidGenotype.createDiploidGenotype(ref, ref).ordinal();
+       final double[] likelihoodCounts = new double[4];

-               final int index = BaseUtils.simpleBaseToBaseIndex(p.getBase());
-               if ( index >= 0 ) {
-                   qualCounts[index] += p.getQual();
-               }
+       // based on the GLs, find the alternate alleles with the most probability
+       for ( SampleGenotypeData sampleData : sampleDataList ) {
+           final double[] likelihoods = sampleData.GL.getLikelihoods();
+           final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
+           if ( PLindexOfBestGL != PLindexOfRef ) {
+               int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[3][PLindexOfBestGL];
+               if ( alleles[0] != baseIndexOfRef )
+                   likelihoodCounts[alleles[0]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
+               // don't double-count it
+               if ( alleles[1] != baseIndexOfRef && alleles[1] != alleles[0] )
+                   likelihoodCounts[alleles[1]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
+           }
        }

        if ( ALLOW_MULTIPLE_ALLELES ) {
-           for ( byte altAllele : BaseUtils.BASES ) {
-               if ( altAllele == ref )
-                   continue;
-               int index = BaseUtils.simpleBaseToBaseIndex(altAllele);
-               if ( qualCounts[index] >= MIN_QUAL_SUM_FOR_ALT_ALLELE ) {
-                   allelesToUse[index] = true;
+           for ( int i = 0; i < 4; i++ ) {
+               if ( likelihoodCounts[i] > 0.0 ) {
+                   allelesToUse[i] = true;
                }
            }
        } else {
-           // set the non-ref base which has the maximum quality score sum
-           int maxCount = 0;
-           int indexOfMax = 0;
-           for ( byte altAllele : BaseUtils.BASES ) {
-               if ( altAllele == ref )
-                   continue;
-               int index = BaseUtils.simpleBaseToBaseIndex(altAllele);
-               if ( qualCounts[index] > maxCount ) {
-                   maxCount = qualCounts[index];
-                   indexOfMax = index;
-               }
-           }
-
-           if ( maxCount > 0 )
+           // set the non-ref base which has the maximum sum of non-ref GLs
+           final int indexOfMax = MathUtils.maxElementIndex(likelihoodCounts);
+           if ( likelihoodCounts[indexOfMax] > 0.0 )
                allelesToUse[indexOfMax] = true;
        }
    }
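The rewritten selection scores alleles by genotype-likelihood differences instead of base-quality sums. A worked illustration with invented log10 values:

    // One sample, ref = A, best genotype by GL is A/C:
    //   likelihoods[PLindexOfBestGL] = -1.0, likelihoods[PLindexOfRef] = -4.0
    //   likelihoodCounts[C] += (-1.0) - (-4.0) = 3.0
    // In multi-allelic mode every allele with a positive accumulated score is
    // kept; otherwise only the argmax survives, and only when its score is
    // positive.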
@@ -227,4 +219,16 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
        public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); }
    }

+   private static class SampleGenotypeData {
+
+       public final String name;
+       public final DiploidSNPGenotypeLikelihoods GL;
+       public final int depth;
+
+       public SampleGenotypeData(final String name, final DiploidSNPGenotypeLikelihoods GL, final int depth) {
+           this.name = name;
+           this.GL = GL;
+           this.depth = depth;
+       }
+   }
}
@@ -109,7 +109,7 @@ public class UnifiedArgumentCollection {
     * For advanced users only.
     */
    @Advanced
-   @Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles (SNPs only)", required = false)
+   @Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow the discovery of multiple alleles", required = false)
    public boolean MULTI_ALLELIC = false;

    /**
@@ -146,8 +146,8 @@ public class UnifiedArgumentCollection {
    public int INDEL_HAPLOTYPE_SIZE = 80;

    @Hidden
-   @Argument(fullName = "bandedIndel", shortName = "bandedIndel", doc = "Banded Indel likelihood computation", required = false)
-   public boolean BANDED_INDEL_COMPUTATION = false;
+   @Argument(fullName = "noBandedIndel", shortName = "noBandedIndel", doc = "Don't do Banded Indel likelihood computation", required = false)
+   public boolean DONT_DO_BANDED_INDEL_COMPUTATION = false;

    @Hidden
    @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false)
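Note the polarity flip in this hunk: the hidden bandedIndel opt-in becomes a noBandedIndel opt-out, and the constructor hunk earlier negates the new flag, so banded computation is now the default:

    // as wired into PairHMMIndelErrorModel in the constructor change above
    final boolean banded = !UAC.DONT_DO_BANDED_INDEL_COMPUTATION; // true unless -noBandedIndel is given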
@@ -184,7 +184,7 @@ public class UnifiedArgumentCollection {

        // todo- arguments to remove
        uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
-       uac.BANDED_INDEL_COMPUTATION = BANDED_INDEL_COMPUTATION;
+       uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION;
        uac.MULTI_ALLELIC = MULTI_ALLELIC;
        return uac;
    }
@@ -243,7 +243,7 @@ public class UnifiedGenotyperEngine {
            glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
        }

-       return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine);
+       return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser);
    }

    private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, AlignmentContext rawContext) {
@@ -858,4 +858,171 @@ public class UnifiedGenotyperEngine {

        return calls;
    }

    /**
     * @param vc           variant context with genotype likelihoods
     * @param allelesToUse bit vector describing which alternate alleles from the vc are okay to use
     * @param exactAC      integer array describing the AC from the exact model for the corresponding alleles
     * @return genotypes
     */
    public static GenotypesContext constrainedAssignGenotypes(VariantContext vc, boolean[] allelesToUse, int[] exactAC ) {

        final GenotypesContext GLs = vc.getGenotypes();

        // samples
        final List<String> sampleIndices = GLs.getSampleNamesOrderedByName();

        // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward
        final int numOriginalAltAlleles = allelesToUse.length;
        final List<Allele> newAlleles = new ArrayList<Allele>(numOriginalAltAlleles+1);
        newAlleles.add(vc.getReference());
        final HashMap<Allele,Integer> alleleIndexMap = new HashMap<Allele,Integer>(); // need this for skipping dimensions
        int[] alleleCount = new int[exactAC.length];
        for ( int i = 0; i < numOriginalAltAlleles; i++ ) {
            if ( allelesToUse[i] ) {
                newAlleles.add(vc.getAlternateAllele(i));
                alleleIndexMap.put(vc.getAlternateAllele(i),i);
                alleleCount[i] = exactAC[i];
            } else {
                alleleCount[i] = 0;
            }
        }
        final List<Allele> newAltAlleles = newAlleles.subList(1,newAlleles.size());
        final int numNewAltAlleles = newAltAlleles.size();
        ArrayList<Integer> likelihoodIndexesToUse = null;

        // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles,
        // then we can keep the PLs as is; otherwise, we determine which ones to keep
        final int[][] PLcache;
        if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) {
            likelihoodIndexesToUse = new ArrayList<Integer>(30);
            PLcache = PLIndexToAlleleIndex[numOriginalAltAlleles];

            for ( int PLindex = 0; PLindex < PLcache.length; PLindex++ ) {
                int[] alleles = PLcache[PLindex];
                // consider this entry only if both of the alleles are good
                if ( (alleles[0] == 0 || allelesToUse[alleles[0] - 1]) && (alleles[1] == 0 || allelesToUse[alleles[1] - 1]) )
                    likelihoodIndexesToUse.add(PLindex);
            }
        } else {
            PLcache = PLIndexToAlleleIndex[numOriginalAltAlleles];
        }

        // set up the trellis dimensions
        // SAMPLE x alt 1 x alt 2 x alt 3
        // todo -- check that exactAC has alt counts at [1],[2],[3] (and not [0],[1],[2])
        double[][][][] transitionTrellis = new double[sampleIndices.size()+1][exactAC[1]][exactAC[2]][exactAC[3]];
        // N x AC1 x AC2 x AC3; worst performance in the multi-allelic case where all alleles are at moderate frequency,
        // capped at the MLE ACs*
        // todo -- there's an optimization: not all states in the rectangular matrix will be reached, in fact
        // todo -- for tT[0] we only care about tT[0][0][0][0], and for tT[1], only combinations of 0,1,2.
        int idx = 1; // index of which sample we're on
        int prevMaxState = 0; // the maximum state (e.g. AC) reached by the previous sample. Symmetric. (AC capping handled by logic in loop)
        // iterate over each sample
        for ( String sample : sampleIndices ) {
            // push the likelihoods into the next possible states, that is to say
            // L[state] = L[prev state] + L[genotype getting into state]
            // iterate over each previous state, by dimension,
            // and contribute the likelihoods for transitions to this state
            double[][][] prevState = transitionTrellis[idx-1];
            double[][][] thisState = transitionTrellis[idx];
            Genotype genotype = GLs.get(sample);
            if ( genotype.isNoCall() || genotype.isFiltered() ) {
                thisState = prevState.clone();
            } else {
                double[] likelihoods = genotype.getLikelihoods().getAsVector();
                int dim1min = Math.max(0, alleleCount[0]-2*(sampleIndices.size()-idx+1));
                int dim1max = Math.min(prevMaxState,alleleCount[0]);
                int dim2min = Math.max(0,alleleCount[1]-2*(sampleIndices.size()-idx+1));
                int dim2max = Math.min(prevMaxState,alleleCount[1]);
                int dim3min = Math.max(0,alleleCount[2]-2*(sampleIndices.size()-idx+1));
                int dim3max = Math.min(prevMaxState,alleleCount[2]);
                // cue annoying nested for loop
                for ( int a1 = dim1min ; a1 <= dim1max; a1++ ) {
                    for ( int a2 = dim2min; a2 <= dim2max; a2++ ) {
                        for ( int a3 = dim3min; a3 <= dim3max; a3++ ) {
                            double base = prevState[a1][a2][a3];
                            for ( int likIdx : likelihoodIndexesToUse ) {
                                int[] offsets = calculateOffsets(PLcache[likIdx]);
                                thisState[a1+offsets[1]][a2+offsets[2]][a3+offsets[3]] = base + likelihoods[likIdx];
                            }
                        }
                    }
                }
                prevMaxState += 2;
            }
            idx++;
        }

        // after all that pain, we have a fully calculated trellis. Now just march backwards from the EAC state and
        // assign genotypes along the greedy path

        GenotypesContext calls = GenotypesContext.create(sampleIndices.size());
        int[] state = alleleCount;
        for ( String sample : Utils.reverse(sampleIndices) ) {
            --idx;
            // the next state will be the maximum achievable state
            Genotype g = GLs.get(sample);
            if ( g.isNoCall() || ! g.hasLikelihoods() ) {
                calls.add(g);
                continue;
            }

            // subset to the new likelihoods. These are not used except for subsetting in the context itself,
            // i.e. they are not a part of the calculation.
            final double[] originalLikelihoods = GLs.get(sample).getLikelihoods().getAsVector();
            double[] newLikelihoods;
            if ( likelihoodIndexesToUse == null ) {
                newLikelihoods = originalLikelihoods;
            } else {
                newLikelihoods = new double[likelihoodIndexesToUse.size()];
                int newIndex = 0;
                for ( int oldIndex : likelihoodIndexesToUse )
                    newLikelihoods[newIndex++] = originalLikelihoods[oldIndex];

                // might need to re-normalize
                newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true);
            }

            // todo -- alter this. For ease of programming, likelihood indices are
            // todo -- used to iterate over achievable states.
            double max = Double.NEGATIVE_INFINITY;
            int[] bestState = null;
            int[] bestAlleles = null;
            int bestLikIdx = -1;
            for ( int likIdx : likelihoodIndexesToUse ) {
                int[] offsets = calculateOffsets(PLcache[likIdx]);
                double val = transitionTrellis[idx-1][state[0]-offsets[0]][state[1]-offsets[1]][state[2]-offsets[2]];
                if ( val > max ) {
                    max = val;
                    bestState = new int[] { state[0]-offsets[0],state[1]-offsets[1],state[2]-offsets[2]};
                    bestAlleles = PLcache[likIdx];
                    bestLikIdx = likIdx;
                }
            }
            state = bestState;
            List<Allele> gtAlleles = new ArrayList<Allele>(2);
            gtAlleles.add(newAlleles.get(bestAlleles[0]));
            gtAlleles.add(newAlleles.get(bestAlleles[1]));

            final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(bestLikIdx, newLikelihoods);
            Map<String, Object> attrs = new HashMap<String, Object>(g.getAttributes());
            if ( numNewAltAlleles == 0 )
                attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY);
            else
                attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(newLikelihoods));
            calls.add(new Genotype(sample, gtAlleles, qual, null, attrs, false));

        }
        return calls;
    }

    private static int[] calculateOffsets(int[] alleleIndeces) {
        int[] offsets = new int[4];
        for ( int i = 0; i < alleleIndeces.length; i++ ) {
            offsets[alleleIndeces[i]]++;
        }

        return offsets;
    }
}
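calculateOffsets converts a PL-cache allele pair into per-allele count increments, which is what lets the trellis step between AC states. A quick trace (indices invented):

    // genotype (alt1, alt3) is cached as {1, 3}:
    //   calculateOffsets(new int[]{1, 3}) -> [0, 1, 0, 1]
    // a homozygous pair {2, 2} -> [0, 0, 2, 0]
    // i.e. each genotype raises the count of every allele it carries.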
@@ -28,13 +28,13 @@ package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
@@ -409,9 +409,9 @@ public class PairHMMIndelErrorModel {
                }
            }
            else {
-               //System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName());
-               SAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
-               if (read == null)
+               // System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName());
+               GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
+               if (read.isEmpty())
                    continue;

                if(ReadUtils.is454Read(read)) {

@@ -106,37 +106,70 @@ public class ValidationSiteSelectorWalker extends RodWalker<Integer, Integer> {
POLY_BASED_ON_GL
}

/**
* The input VCF file
*/
@Input(fullName="variant", shortName = "V", doc="Input VCF file, can be specified multiple times", required=true)
public List<RodBinding<VariantContext>> variants;

/**
* The output VCF file
*/
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null;

/**
* Sample name(s) to subset the input VCF to, prior to selecting variants. -sn A -sn B subsets to samples A and B.
*/
@Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false)
public Set<String> sampleNames = new HashSet<String>(0);

/**
* Sample regexps to subset the input VCF to, prior to selecting variants. -sn NA12* subsets to all samples with prefix NA12
*/
@Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false)
public Set<String> sampleExpressions ;

/**
* File containing a list of sample names to subset the input vcf to. Equivalent to specifying the contents of the file separately with -sn
*/
@Input(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false)
public Set<File> sampleFiles;

/**
* A mode for selecting sites based on sample-level data. See the wiki documentation for more information.
*/
@Argument(fullName="sampleMode", shortName="sampleMode", doc="Sample selection mode", required=false)
private SAMPLE_SELECTION_MODE sampleMode = SAMPLE_SELECTION_MODE.NONE;

/**
* A P[nonref] threshold for SAMPLE_SELECTION_MODE=POLY_BASED_ON_GL. See the wiki documentation for more information.
*/
@Argument(shortName="samplePNonref",fullName="samplePNonref", doc="GL-based selection mode only: the probability" +
" that a site is non-reference in the samples for which to include the site",required=false)
private double samplePNonref = 0.99;

/**
* The number of sites in your validation set
*/
@Argument(fullName="numValidationSites", shortName="numSites", doc="Number of output validation sites", required=true)
private int numValidationSites;

/**
* Do not exclude filtered sites (e.g. not PASS or .) from consideration for validation
*/
@Argument(fullName="includeFilteredSites", shortName="ifs", doc="If true, will include filtered sites in set to choose variants from", required=false)
private boolean INCLUDE_FILTERED_SITES = false;

/**
* Argument for the frequency selection mode. (AC/AF/AN) are taken from VCF info field, not recalculated. Typically specified for sites-only VCFs that still have AC/AF/AN information.
*/
@Argument(fullName="ignoreGenotypes", shortName="ignoreGenotypes", doc="If true, will ignore genotypes in VCF, will take AC,AF from annotations and will make no sample selection", required=false)
private boolean IGNORE_GENOTYPES = false;

/**
* Argument for the frequency selection mode. Allows reference (non-polymorphic) sites to be included in the validation set.
*/
@Argument(fullName="ignorePolymorphicStatus", shortName="ignorePolymorphicStatus", doc="If true, will ignore polymorphic status in VCF, and will take VCF record directly without pre-selection", required=false)
private boolean IGNORE_POLYMORPHIC = false;

@@ -145,19 +178,14 @@ public class ValidationSiteSelectorWalker extends RodWalker<Integer, Integer> {
private int numFrequencyBins = 20;

/**
* This argument selects allele frequency selection mode:
* KEEP_AF_SPECTRUM will choose variants so that the resulting allele frequency spectrum matches as closely as possible the input set
* UNIFORM will choose variants uniformly without regard to their allele frequency.
*
*/
* This argument selects allele frequency selection mode. See the wiki for more information.
*/
@Argument(fullName="frequencySelectionMode", shortName="freqMode", doc="Allele Frequency selection mode", required=false)
private AF_COMPUTATION_MODE freqMode = AF_COMPUTATION_MODE.KEEP_AF_SPECTRUM;

/**
* This argument selects particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria.
* When specified one or more times, a particular type of variant is selected.
*
*/
* This argument selects particular kinds of variants (i.e. SNP, INDEL) out of a list. If left unspecified, all types are considered.
*/
@Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false)
private List<VariantContext.Type> TYPES_TO_INCLUDE = new ArrayList<VariantContext.Type>();

@@ -46,6 +46,7 @@ import java.util.*;
public class VariantSummary extends VariantEvaluator implements StandardEval {
final protected static Logger logger = Logger.getLogger(VariantSummary.class);

/** Indels with size greater than this value are tallied in the CNV column */
private final static int MAX_INDEL_LENGTH = 50;
private final static double MIN_CNV_OVERLAP = 0.5;
private VariantEvalWalker walker;

@@ -196,14 +197,16 @@ public class VariantSummary extends VariantEvaluator implements StandardEval {
}

private final boolean overlapsKnownCNV(VariantContext cnv) {
final GenomeLoc loc = walker.getGenomeLocParser().createGenomeLoc(cnv, true);
IntervalTree<GenomeLoc> intervalTree = knownCNVs.get(loc.getContig());
if ( knownCNVs != null ) {
final GenomeLoc loc = walker.getGenomeLocParser().createGenomeLoc(cnv, true);
IntervalTree<GenomeLoc> intervalTree = knownCNVs.get(loc.getContig());

final Iterator<IntervalTree.Node<GenomeLoc>> nodeIt = intervalTree.overlappers(loc.getStart(), loc.getStop());
while ( nodeIt.hasNext() ) {
final double overlapP = loc.reciprocialOverlapFraction(nodeIt.next().getValue());
if ( overlapP > MIN_CNV_OVERLAP )
return true;
final Iterator<IntervalTree.Node<GenomeLoc>> nodeIt = intervalTree.overlappers(loc.getStart(), loc.getStop());
while ( nodeIt.hasNext() ) {
final double overlapP = loc.reciprocialOverlapFraction(nodeIt.next().getValue());
if ( overlapP > MIN_CNV_OVERLAP )
return true;
}
}

return false;
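Editor's note: the check above accepts a known CNV only when the overlap is large relative to both intervals. A self-contained sketch of that computation on plain closed intervals (a hypothetical stand-in for GenomeLoc.reciprocialOverlapFraction, not the GATK code itself):

public class ReciprocalOverlapSketch {
    // Reciprocal overlap: the smaller of the two per-interval overlap fractions.
    static double reciprocalOverlapFraction(final int start1, final int stop1, final int start2, final int stop2) {
        final int overlap = Math.max(0, Math.min(stop1, stop2) - Math.max(start1, start2) + 1);
        final double frac1 = overlap / (double) (stop1 - start1 + 1);
        final double frac2 = overlap / (double) (stop2 - start2 + 1);
        return Math.min(frac1, frac2);
    }

    public static void main(final String[] args) {
        // [100,199] vs [150,249]: 50 overlapping bases out of 100 each -> 0.5,
        // which sits exactly at MIN_CNV_OVERLAP and would NOT pass the strict > test above.
        System.out.println(reciprocalOverlapFraction(100, 199, 150, 249));
    }
}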

@@ -224,7 +227,7 @@ public class VariantSummary extends VariantEvaluator implements StandardEval {
allVariantCounts.inc(type, ALL);

// type specific calculations
if ( type == Type.SNP ) {
if ( type == Type.SNP && eval.isBiallelic() ) {
titvTable = VariantContextUtils.isTransition(eval) ? transitionsPerSample : transversionsPerSample;
titvTable.inc(type, ALL);
}

@@ -145,7 +145,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
}

return new GenomeLoc(getContig(), this.contigIndex,
Math.min(getStart(), that.getStart()),
Math.min( getStart(), that.getStart() ),
Math.max( getStop(), that.getStop()) );
}

@@ -465,4 +465,8 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
private final static double overlapPercent(final GenomeLoc gl1, final GenomeLoc gl2) {
return (1.0 * gl1.intersect(gl2).size()) / gl1.size();
}

public long sizeOfOverlap( final GenomeLoc that ) {
return ( this.overlapsP(that) ? Math.min( getStop(), that.getStop() ) - Math.max( getStart(), that.getStart() ) : 0L );
}
}
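Editor's note: as a worked example of the two new helpers (GenomeLoc coordinates are 1-based and closed): for gl1 = [10, 19] and gl2 = [15, 30], the intersection is [15, 19], so overlapPercent(gl1, gl2) = 5 / 10 = 0.5; sizeOfOverlap, as written, returns min(19, 30) - max(10, 15) = 4, i.e. without the +1 that a closed-interval size would use.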
File diff suppressed because it is too large

@@ -0,0 +1,19 @@
package org.broadinstitute.sting.utils.activeregion;

import org.broadinstitute.sting.utils.sam.GATKSAMRecord;

/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 1/4/12
*/

public class ActiveRead {
final public GATKSAMRecord read;
final public boolean isPrimaryRegion;

ActiveRead( final GATKSAMRecord read, final boolean isPrimaryRegion ) {
this.read = read;
this.isPrimaryRegion = isPrimaryRegion;
}
}

@@ -0,0 +1,55 @@
package org.broadinstitute.sting.utils.activeregion;

import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;

import java.util.ArrayList;

/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 1/4/12
*/

public class ActiveRegion implements HasGenomeLocation {

private final ArrayList<ActiveRead> reads = new ArrayList<ActiveRead>();
private byte[] reference = null;
private final GenomeLoc loc;
private GenomeLoc referenceLoc = null;
private final GenomeLocParser genomeLocParser;
public final boolean isActive;

public ActiveRegion( final GenomeLoc loc, final boolean isActive, final GenomeLocParser genomeLocParser ) {
this.loc = loc;
this.isActive = isActive;
this.genomeLocParser = genomeLocParser;
referenceLoc = loc;
}

// add each read to the bin and extend the reference genome loc if needed
public void add( final GATKSAMRecord read, final boolean isPrimaryRegion ) {
referenceLoc = referenceLoc.union( genomeLocParser.createGenomeLoc( read ) );
reads.add( new ActiveRead(read, isPrimaryRegion) );
}

public ArrayList<ActiveRead> getReads() { return reads; }

public byte[] getReference( final IndexedFastaSequenceFile referenceReader ) {
// set up the reference if we haven't done so yet
if ( reference == null ) {
reference = referenceReader.getSubsequenceAt(referenceLoc.getContig(), referenceLoc.getStart(), referenceLoc.getStop()).getBases();
}

return reference;
}

public GenomeLoc getLocation() { return loc; }

public GenomeLoc getReferenceLocation() { return referenceLoc; }

public int size() { return reads.size(); }
}
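Editor's note: a minimal usage sketch for the two new activeregion classes. This is illustrative only; the parser, reference reader, and reads would come from the surrounding engine, and the three-argument createGenomeLoc call is an assumption:

// Sketch: bin two reads into an ActiveRegion and fetch the spanning reference.
// 'parser' (GenomeLocParser), 'ref' (IndexedFastaSequenceFile), and 'r1'/'r2'
// (GATKSAMRecord) are assumed to be provided by the engine.
final ActiveRegion region = new ActiveRegion(parser.createGenomeLoc("20", 10000000, 10000200), true, parser);
region.add(r1, true);    // primary read: assigned to this region
region.add(r2, false);   // non-primary: overlaps from a neighboring region
final byte[] bases = region.getReference(ref);          // fetched lazily, spans all added reads
final GenomeLoc span = region.getReferenceLocation();   // union of the region loc and read locs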

@@ -70,27 +70,27 @@ public class ClippingOp {
break;

case SOFTCLIP_BASES:
if ( read.getReadUnmappedFlag() ) {
if (read.getReadUnmappedFlag()) {
// we can't process unmapped reads
throw new UserException("Read Clipper cannot soft clip unmapped reads");
}

//System.out.printf("%d %d %d%n", stop, start, read.getReadLength());
int myStop = stop;
if ( (stop + 1 - start) == read.getReadLength() ) {
if ((stop + 1 - start) == read.getReadLength()) {
// BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone
//Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", read.getReadName()));
//break;
myStop--; // just decrement stop
}

if ( start > 0 && myStop != read.getReadLength() - 1 )
if (start > 0 && myStop != read.getReadLength() - 1)
throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", read.getReadName(), start, myStop));

Cigar oldCigar = read.getCigar();

int scLeft = 0, scRight = read.getReadLength();
if ( start == 0 )
if (start == 0)
scLeft = myStop + 1;
else
scRight = start;

@@ -134,8 +134,7 @@ public class ClippingOp {
unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH));
matchesCount = 0;
unclippedCigar.add(element);
}
else
} else
unclippedCigar.add(element);
}
if (matchesCount > 0)

@@ -284,10 +283,9 @@ public class ClippingOp {
}

@Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1"})
private GATKSAMRecord hardClip (GATKSAMRecord read, int start, int stop) {
private GATKSAMRecord hardClip(GATKSAMRecord read, int start, int stop) {
if (start == 0 && stop == read.getReadLength() - 1)
return GATKSAMRecord.emptyRead(read);
// return new GATKSAMRecord(read.getHeader());


// If the read is unmapped there is no Cigar string and neither should we create a new cigar string

@@ -296,8 +294,8 @@ public class ClippingOp {
// the cigar may force a shift left or right (or both) in case we are left with insertions
// starting or ending the read after applying the hard clip on start/stop.
int newLength = read.getReadLength() - (stop - start + 1) - cigarShift.shiftFromStart - cigarShift.shiftFromEnd;
byte [] newBases = new byte[newLength];
byte [] newQuals = new byte[newLength];
byte[] newBases = new byte[newLength];
byte[] newQuals = new byte[newLength];
int copyStart = (start == 0) ? stop + 1 + cigarShift.shiftFromStart : cigarShift.shiftFromStart;

System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength);

@@ -321,11 +319,11 @@ public class ClippingOp {
}

@Requires({"!cigar.isEmpty()"})
private CigarShift hardClipCigar (Cigar cigar, int start, int stop) {
private CigarShift hardClipCigar(Cigar cigar, int start, int stop) {
Cigar newCigar = new Cigar();
int index = 0;
int totalHardClipCount = stop - start + 1;
int alignmentShift = 0; // caused by hard clipping insertions or deletions
int alignmentShift = 0; // caused by hard clipping deletions

// hard clip the beginning of the cigar string
if (start == 0) {

@@ -353,7 +351,7 @@ public class ClippingOp {
// element goes beyond what we need to clip
else if (index + shift > stop + 1) {
int elementLengthAfterChopping = cigarElement.getLength() - (stop - index + 1);
alignmentShift += calculateHardClippingAlignmentShift(cigarElement, stop-index+1);
alignmentShift += calculateHardClippingAlignmentShift(cigarElement, stop - index + 1);
newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP));
newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator()));
}

@@ -388,7 +386,7 @@ public class ClippingOp {
if (index + shift < start)
newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator()));

// element goes beyond our clip starting position
// element goes beyond our clip starting position
else {
int elementLengthAfterChopping = start - index;
alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength() - (start - index));

@@ -396,7 +394,7 @@ public class ClippingOp {
// if this last element is a HARD CLIP operator, just merge it with our hard clip operator to be added later
if (cigarElement.getOperator() == CigarOperator.HARD_CLIP)
totalHardClipCount += elementLengthAfterChopping;
// otherwise, maintain what's left of this last operator
// otherwise, maintain what's left of this last operator
else
newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator()));
}

@@ -408,7 +406,7 @@ public class ClippingOp {
}

// check if we are hard clipping indels
while(cigarElementIterator.hasNext()) {
while (cigarElementIterator.hasNext()) {
cigarElement = cigarElementIterator.next();
alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength());


@@ -444,34 +442,30 @@ public class ClippingOp {
boolean readHasStarted = false;
boolean addedHardClips = false;

while(!cigarStack.empty()) {
while (!cigarStack.empty()) {
CigarElement cigarElement = cigarStack.pop();

if ( !readHasStarted &&
cigarElement.getOperator() != CigarOperator.INSERTION &&
if (!readHasStarted &&
// cigarElement.getOperator() != CigarOperator.INSERTION &&
cigarElement.getOperator() != CigarOperator.DELETION &&
cigarElement.getOperator() != CigarOperator.HARD_CLIP)
readHasStarted = true;

else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.HARD_CLIP)
else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.HARD_CLIP)
totalHardClip += cigarElement.getLength();

else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.INSERTION)
shift += cigarElement.getLength();

else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION)
else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION)
totalHardClip += cigarElement.getLength();

if (readHasStarted) {
if (i==1) {
if (i == 1) {
if (!addedHardClips) {
if (totalHardClip > 0)
inverseCigarStack.push(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP));
addedHardClips = true;
}
inverseCigarStack.push(cigarElement);
}
else {
} else {
if (!addedHardClips) {
if (totalHardClip > 0)
cleanCigar.add(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP));

@@ -498,7 +492,7 @@ public class ClippingOp {
int newShift = 0;
int oldShift = 0;

boolean readHasStarted = false; // if the new cigar is composed of S and H only, we have to traverse the entire old cigar to calculate the shift
boolean readHasStarted = false; // if the new cigar is composed of S and H only, we have to traverse the entire old cigar to calculate the shift
for (CigarElement cigarElement : newCigar.getCigarElements()) {
if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP)
newShift += cigarElement.getLength();

@@ -509,7 +503,7 @@ public class ClippingOp {
}

for (CigarElement cigarElement : oldCigar.getCigarElements()) {
if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP )
if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP)
oldShift += cigarElement.getLength();
else if (readHasStarted)
break;

@@ -522,7 +516,7 @@ public class ClippingOp {
if (cigarElement.getOperator() == CigarOperator.INSERTION)
return -clippedLength;

// Deletions should be added to the total hard clip count
// Deletions should be added to the total hard clip count
else if (cigarElement.getOperator() == CigarOperator.DELETION)
return cigarElement.getLength();


@@ -374,24 +374,43 @@ public class ReadClipper {
* Generic functionality to hard clip a read, used internally by hardClipByReferenceCoordinatesLeftTail
* and hardClipByReferenceCoordinatesRightTail. Should not be used directly.
*
* Note, it REQUIRES you to give the directionality of your hard clip (i.e. whether you're clipping the
* left or right tail) by specifying either refStart < 0 or refStop < 0.
*
* @param refStart first base to clip (inclusive)
* @param refStop last base to clip (inclusive)
* @return a new read, without the clipped bases
*/
@Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip
@Requires({"!read.getReadUnmappedFlag()", "refStart < 0 || refStop < 0"}) // can't handle unmapped reads, as we're using reference coordinates to clip
protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL);
int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL);
if (read.isEmpty())
return read;

if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1))
return GATKSAMRecord.emptyRead(read);
// return new GATKSAMRecord(read.getHeader());
int start;
int stop;

// Determine the read coordinate to start and stop hard clipping
if (refStart < 0) {
if (refStop < 0)
throw new ReviewedStingException("Only one of refStart or refStop must be < 0, not both (" + refStart + ", " + refStop + ")");
start = 0;
stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL);
}
else {
if (refStop >= 0)
throw new ReviewedStingException("Either refStart or refStop must be < 0 (" + refStart + ", " + refStop + ")");
start = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL);
stop = read.getReadLength() - 1;
}

// if ((start == 0 && stop == read.getReadLength() - 1))
// return GATKSAMRecord.emptyRead(read);

if (start < 0 || stop > read.getReadLength() - 1)
throw new ReviewedStingException("Trying to clip before the start or after the end of a read");

if ( start > stop )
throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!");
throw new ReviewedStingException(String.format("START (%d) > (%d) STOP -- this should never happen -- call Mauricio!", start, stop));

if ( start > 0 && stop < read.getReadLength() - 1)
throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString()));
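Editor's note: a short sketch of the directionality contract described above. Illustrative only; the method is protected and is normally reached through hardClipByReferenceCoordinatesLeftTail / hardClipByReferenceCoordinatesRightTail, and the coordinates here are made up:

// refStart < 0: clip the left tail up to and including refStop.
GATKSAMRecord leftClipped = clipper.hardClipByReferenceCoordinates(-1, 10005);
// refStop < 0: clip the right tail from refStart on.
GATKSAMRecord rightClipped = clipper.hardClipByReferenceCoordinates(10100, -1);
// Passing both coordinates >= 0 (or both < 0) now throws a ReviewedStingException.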

@@ -526,6 +526,42 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
}
}

/**
* Gets the pileup for a set of read groups. Horrendously inefficient at this point.
* @param rgSet Set of identifiers for the read groups.
* @return A read-backed pileup containing only the reads in the given read groups.
*/
@Override
public RBP getPileupForReadGroups(final HashSet<String> rgSet) {
if(pileupElementTracker instanceof PerSamplePileupElementTracker) {
PerSamplePileupElementTracker<PE> tracker = (PerSamplePileupElementTracker<PE>)pileupElementTracker;
PerSamplePileupElementTracker<PE> filteredTracker = new PerSamplePileupElementTracker<PE>();

for(final String sample: tracker.getSamples()) {
PileupElementTracker<PE> perSampleElements = tracker.getElements(sample);
AbstractReadBackedPileup<RBP,PE> pileup = createNewPileup(loc,perSampleElements).getPileupForReadGroups(rgSet);
if(pileup != null)
filteredTracker.addElements(sample,pileup.pileupElementTracker);
}
return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null;
}
else {
UnifiedPileupElementTracker<PE> filteredTracker = new UnifiedPileupElementTracker<PE>();
for(PE p: pileupElementTracker) {
GATKSAMRecord read = p.getRead();
if(rgSet != null && !rgSet.isEmpty()) {
if(read.getReadGroup() != null && rgSet.contains(read.getReadGroup().getReadGroupId()))
filteredTracker.add(p);
}
else {
if(read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null)
filteredTracker.add(p);
}
}
return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null;
}
}
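Editor's note: a minimal sketch of the new call. The read-group ids are hypothetical, 'pileup' stands for a ReadBackedPileup handed to a walker, and processBase is a made-up callback; iteration over PileupElement is grounded in the interface shown below:

final HashSet<String> groups = new HashSet<String>(Arrays.asList("20FUK.1", "20FUK.2"));
final ReadBackedPileup subset = pileup.getPileupForReadGroups(groups);
if (subset != null) {                 // the method returns null when no element matches
    for (final PileupElement p : subset)
        processBase(p);               // hypothetical per-base work
}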

@Override
public RBP getPileupForLane(String laneID) {
if(pileupElementTracker instanceof PerSamplePileupElementTracker) {

@@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.fragments.FragmentCollection;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;

import java.util.Collection;
import java.util.HashSet;
import java.util.List;

/**

@@ -129,6 +130,13 @@ public interface ReadBackedPileup extends Iterable<PileupElement>, HasGenomeLoca
*/
public ReadBackedPileup getPileupForReadGroup(String readGroupId);

/**
* Gets all the reads associated with a given set of read groups.
* @param rgSet Set of identifiers for the read groups.
* @return A pileup containing only the reads in the given read groups.
*/
public ReadBackedPileup getPileupForReadGroups(final HashSet<String> rgSet);

/**
* Gets all reads in a given lane id. (Lane ID is the read group
* id stripped of the last .XX sample identifier added by the GATK).

@@ -238,7 +238,7 @@ public class ArtificialSAMUtils {
*/
public static GATKSAMRecord createArtificialRead( byte[] bases, byte[] qual, String cigar ) {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 1, bases, qual, cigar);
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar);
}


@@ -24,7 +24,6 @@

package org.broadinstitute.sting.utils.sam;

import com.google.java.contract.Ensures;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.NGSPlatform;


@@ -184,6 +183,12 @@ public class GATKSAMRecord extends BAMRecord {
return getReducedReadCounts() != null;
}

/**
* The number of bases corresponding to the i'th base of the reduced read.
*
* @param i the read based coordinate inside the read
* @return the number of bases corresponding to the i'th base of the reduced read
*/
public final byte getReducedCount(final int i) {
byte firstCount = getReducedReadCounts()[0];
byte offsetCount = getReducedReadCounts()[i];

@@ -277,7 +282,6 @@ public class GATKSAMRecord extends BAMRecord {
*
* @return the unclipped start of the read taking soft clips (but not hard clips) into account
*/
@Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"})
public int getSoftStart() {
int start = this.getUnclippedStart();
for (CigarElement cigarElement : this.getCigar().getCigarElements()) {

@@ -286,17 +290,17 @@ public class GATKSAMRecord extends BAMRecord {
else
break;
}

return start;
}

/**
* Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips.
*
* Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips.
* Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips.
*
* @return the unclipped end of the read taking soft clips (but not hard clips) into account
*/
@Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"})
public int getSoftEnd() {
int stop = this.getUnclippedStart();

@@ -313,6 +317,7 @@ public class GATKSAMRecord extends BAMRecord {
else
shift = 0;
}

return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ;
}


@@ -29,6 +29,7 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;


@@ -58,7 +59,7 @@ public class ReadUtils {

/**
* A HashMap of the SAM spec read flag names
* <p/>
*
* Note: This is not being used right now, but can be useful in the future
*/
private static final Map<Integer, String> readFlagNames = new HashMap<Integer, String>();

@@ -79,49 +80,47 @@ public class ReadUtils {

/**
* This enum represents all the different ways in which a read can overlap an interval.
* <p/>
*
* NO_OVERLAP_CONTIG:
* read and interval are in different contigs.
* <p/>
*
* NO_OVERLAP_LEFT:
* the read does not overlap the interval.
* <p/>
* |----------------| (interval)
* <----------------> (read)
* <p/>
*
* |----------------| (interval)
* <----------------> (read)
*
* NO_OVERLAP_RIGHT:
* the read does not overlap the interval.
* <p/>
* |----------------| (interval)
* <----------------> (read)
* <p/>
*
* |----------------| (interval)
* <----------------> (read)
*
* OVERLAP_LEFT:
* the read starts before the beginning of the interval but ends inside of it
* <p/>
* |----------------| (interval)
* <----------------> (read)
* <p/>
*
* |----------------| (interval)
* <----------------> (read)
*
* OVERLAP_RIGHT:
* the read starts inside the interval but ends outside of it
* <p/>
* |----------------| (interval)
* <----------------> (read)
* <p/>
*
* |----------------| (interval)
* <----------------> (read)
*
* OVERLAP_LEFT_AND_RIGHT:
* the read starts before the interval and ends after the interval
* <p/>
* |-----------| (interval)
* <-------------------> (read)
* <p/>
*
* |-----------| (interval)
* <-------------------> (read)
*
* OVERLAP_CONTAINED:
* the read starts and ends inside the interval
* <p/>
* |----------------| (interval)
* <--------> (read)
*
* |----------------| (interval)
* <--------> (read)
*/
public enum ReadAndIntervalOverlap {
NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED
}
public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED}
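Editor's note: as a usage sketch, callers typically dispatch on this classification when deciding how to trim a read to an interval. Here overlapType would come from the ReadUtils helper documented further down in this diff, and the clip/discard handlers are hypothetical:

switch (overlapType) {
    case NO_OVERLAP_CONTIG:
    case NO_OVERLAP_LEFT:
    case NO_OVERLAP_RIGHT:
        discardRead();       // hypothetical: the read contributes nothing to the interval
        break;
    case OVERLAP_LEFT:
        clipLeftTail();      // hypothetical: trim bases hanging off the interval start
        break;
    case OVERLAP_RIGHT:
        clipRightTail();
        break;
    case OVERLAP_LEFT_AND_RIGHT:
        clipBothTails();
        break;
    default:                 // OVERLAP_CONTAINED and the hard-clipped variants: keep as-is
        break;
}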

/**
* Creates a SAMFileWriter with the given compression level if you request a bam file. Creates a regular

@@ -141,15 +140,15 @@ public class ReadUtils {

/**
* is this base inside the adaptor of the read?
* <p/>
*
* There are two cases to treat here:
* <p/>
*
* 1) Read is in the negative strand => Adaptor boundary is on the left tail
* 2) Read is in the positive strand => Adaptor boundary is on the right tail
* <p/>
*
* Note: We return false to all reads that are UNMAPPED or have a weird big insert size (probably due to mismapping or bigger event)
*
* @param read the read to test
* @param read the read to test
* @param basePos base position in REFERENCE coordinates (not read coordinates)
* @return whether or not the base is in the adaptor
*/

@@ -166,22 +165,22 @@ public class ReadUtils {
* the read boundary. If the read is in the positive strand, this is the first base after the end of the
* fragment (Picard calls it 'insert'), if the read is in the negative strand, this is the first base before the
* beginning of the fragment.
* <p/>
*
* There are two cases we need to treat here:
* <p/>
*
* 1) Our read is in the reverse strand :
* <p/>
* <----------------------| *
* |--------------------->
* <p/>
* in these cases, the adaptor boundary is at the mate start (minus one)
* <p/>
*
* <----------------------| *
* |--------------------->
*
* in these cases, the adaptor boundary is at the mate start (minus one)
*
* 2) Our read is in the forward strand :
* <p/>
* |----------------------> *
* <----------------------|
* <p/>
* in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one)
*
* |----------------------> *
* <----------------------|
*
* in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one)
*
* @param read the read being tested for the adaptor boundary
* @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. NULL if the read is unmapped or the mate is mapped to another contig.
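Editor's note: a minimal sketch of the two documented cases. This is just the arithmetic from the comment above, not the GATK implementation; it assumes a mapped, properly paired read on the same contig with a sane inferred insert size:

final Integer adaptorBoundary;
if (read.getReadNegativeStrandFlag())
    adaptorBoundary = read.getMateAlignmentStart() - 1;                              // case 1: one base before the mate start
else
    adaptorBoundary = read.getAlignmentStart() + read.getInferredInsertSize() + 1;   // case 2: one base past the fragment end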

@@ -264,7 +263,7 @@ public class ReadUtils {

/**
* If a read starts in INSERTION, returns the first element length.
* <p/>
*
* Warning: If the read has Hard or Soft clips before the insertion this function will return 0.
*
* @param read

@@ -272,7 +271,7 @@ public class ReadUtils {
*/
public final static int getFirstInsertionOffset(SAMRecord read) {
CigarElement e = read.getCigar().getCigarElement(0);
if (e.getOperator() == CigarOperator.I)
if ( e.getOperator() == CigarOperator.I )
return e.getLength();
else
return 0;

@@ -280,7 +279,7 @@ public class ReadUtils {

/**
* If a read ends in INSERTION, returns the last element length.
* <p/>
*
* Warning: If the read has Hard or Soft clips after the insertion this function will return 0.
*
* @param read

@@ -288,7 +287,7 @@ public class ReadUtils {
*/
public final static int getLastInsertionOffset(SAMRecord read) {
CigarElement e = read.getCigar().getCigarElement(read.getCigarLength() - 1);
if (e.getOperator() == CigarOperator.I)
if ( e.getOperator() == CigarOperator.I )
return e.getLength();
else
return 0;

@@ -297,8 +296,7 @@ public class ReadUtils {
/**
* Determines the position of the read in relation to the interval.
* Note: This function uses the UNCLIPPED ENDS of the reads for the comparison.
*
* @param read the read
* @param read the read
* @param interval the interval
* @return the overlap type as described by ReadAndIntervalOverlap enum (see above)
*/

@@ -309,30 +307,30 @@ public class ReadUtils {
int uStart = read.getUnclippedStart();
int uStop = read.getUnclippedEnd();

if (!read.getReferenceName().equals(interval.getContig()))
if ( !read.getReferenceName().equals(interval.getContig()) )
return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG;

else if (uStop < interval.getStart())
else if ( uStop < interval.getStart() )
return ReadAndIntervalOverlap.NO_OVERLAP_LEFT;

else if (uStart > interval.getStop())
else if ( uStart > interval.getStop() )
return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT;

else if (sStop < interval.getStart())
else if ( sStop < interval.getStart() )
return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT;

else if (sStart > interval.getStop())
else if ( sStart > interval.getStop() )
return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT;

else if ((sStart >= interval.getStart()) &&
(sStop <= interval.getStop()))
else if ( (sStart >= interval.getStart()) &&
(sStop <= interval.getStop()) )
return ReadAndIntervalOverlap.OVERLAP_CONTAINED;

else if ((sStart < interval.getStart()) &&
(sStop > interval.getStop()))
else if ( (sStart < interval.getStart()) &&
(sStop > interval.getStop()) )
return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT;

else if ((sStart < interval.getStart()))
else if ( (sStart < interval.getStart()) )
return ReadAndIntervalOverlap.OVERLAP_LEFT;

else

@@ -340,36 +338,52 @@ public class ReadUtils {
}

/**
* Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) in case it falls in
* a deletion following the typical clipping needs. If clipping the left tail (beginning of the read) returns
* the base prior to the deletion. If clipping the right tail (end of the read) returns the base after the
* deletion.
* Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) to take care of
* two corner cases:
*
* 1. If clipping the right tail (end of the read) and getReadCoordinateForReferenceCoordinate falls inside
* a deletion, return the base after the deletion. If clipping the left tail (beginning of the read) it
* doesn't matter because it already returns the previous base by default.
*
* 2. If clipping the left tail (beginning of the read) and the read starts with an insertion, and you're
* requesting the first read-based coordinate, getReadCoordinateForReferenceCoordinate will skip
* the leading insertion (because it has the same reference coordinate as the following base).
*
* @param read
* @param refCoord
* @param tail
* @return the read coordinate corresponding to the requested reference coordinate for clipping.
*/
@Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"})
@Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd() || (read.getUnclippedEnd() < read.getUnclippedStart())"})
@Ensures({"result >= 0", "result < read.getReadLength()"})
public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord, ClippingTail tail) {
Pair<Integer, Boolean> result = getReadCoordinateForReferenceCoordinate(read, refCoord);
int readCoord = result.getFirst();

// Corner case one: clipping the right tail and falls on deletion, move to the next
// read coordinate. It is not a problem for the left tail because the default answer
// from getReadCoordinateForReferenceCoordinate is to give the previous read coordinate.
if (result.getSecond() && tail == ClippingTail.RIGHT_TAIL)
readCoord++;

// clipping the left tail and first base is insertion, go to the next read coordinate
// with the same reference coordinate. Advance to the next cigar element, or to the
// end of the read if there is no next element.
Pair<Boolean, CigarElement> firstElementIsInsertion = readStartsWithInsertion(read);
if (readCoord == 0 && tail == ClippingTail.LEFT_TAIL && firstElementIsInsertion.getFirst())
readCoord = Math.min(firstElementIsInsertion.getSecond().getLength(), read.getReadLength() - 1);

return readCoord;
}

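Editor's note: in practice the two tails pair up with the two clipping directions exactly as hardClipByReferenceCoordinates uses them earlier in this diff (refStart and refStop are assumed to be in-range reference coordinates on the read):

// Clipping the left tail: the last read base to remove comes from the LEFT_TAIL lookup.
final int stopForLeftClip = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL);
// Clipping the right tail: the first read base to remove comes from the RIGHT_TAIL lookup.
final int startForRightClip = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL);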
/**
* Returns the read coordinate corresponding to the requested reference coordinate.
* <p/>
*
* WARNING: if the requested reference coordinate happens to fall inside a deletion in the read, this function
* will return the last read base before the deletion. This function returns a
* Pair(int readCoord, boolean fallsInsideDeletion) so you can choose which readCoordinate to use when faced with
* a deletion.
* <p/>
*
* SUGGESTION: Use getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int, ClippingTail) instead to get a
* pre-processed result according to normal clipping needs. Or you can use this function and tailor the
* behavior to your needs.

@@ -421,7 +435,7 @@ public class ReadUtils {
if (endsWithinCigar)
fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION;

// if we end outside the current cigar element, we need to check if the next element is an insertion or deletion.
// if we end outside the current cigar element, we need to check if the next element is an insertion or deletion.
else {
nextCigarElement = cigarElementIterator.next();


@@ -442,13 +456,13 @@ public class ReadUtils {
if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases())
readBases += shift;

// If we reached our goal inside a deletion, but the deletion is the next cigar element then we need
// to add the shift of the current cigar element but go back to its last element to return the last
// base before the deletion (see warning in function contracts)
// If we reached our goal inside a deletion, but the deletion is the next cigar element then we need
// to add the shift of the current cigar element but go back to its last element to return the last
// base before the deletion (see warning in function contracts)
else if (fallsInsideDeletion && !endsWithinCigar)
readBases += shift - 1;

// If we reached our goal inside a deletion then we must backtrack to the last base before the deletion
// If we reached our goal inside a deletion then we must backtrack to the last base before the deletion
else if (fallsInsideDeletion && endsWithinCigar)
readBases--;
}

@@ -457,7 +471,6 @@ public class ReadUtils {
if (!goalReached)
throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?");


return new Pair<Integer, Boolean>(readBases, fallsInsideDeletion);
}


@@ -465,12 +478,11 @@ public class ReadUtils {
* Compares two SAMRecords only on the basis of alignment start. Note that
* comparisons are performed ONLY on the basis of alignment start; any
* two SAM records with the same alignment start will be considered equal.
* <p/>
*
* Unmapped alignments will all be considered equal.
*/

@Requires({"read1 != null", "read2 != null"})
@Ensures("result == 0 || result == 1 || result == -1")
public static int compareSAMRecords(GATKSAMRecord read1, GATKSAMRecord read2) {
AlignmentStartComparator comp = new AlignmentStartComparator();
return comp.compare(read1, read2);

@@ -479,7 +491,7 @@ public class ReadUtils {
/**
* Is a base inside a read?
*
* @param read the read to evaluate
* @param read the read to evaluate
* @param referenceCoordinate the reference coordinate of the base to test
* @return true if it is inside the read, false otherwise.
*/

@@ -502,4 +514,134 @@ public class ReadUtils {
}


/**
* Checks if a read starts with an insertion. It looks beyond Hard and Soft clips
* if there are any.
*
* @param read
* @return A pair with the answer (true/false) and the element or null if it doesn't exist
*/
public static Pair<Boolean, CigarElement> readStartsWithInsertion(GATKSAMRecord read) {
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
if (cigarElement.getOperator() == CigarOperator.INSERTION)
return new Pair<Boolean, CigarElement>(true, cigarElement);

else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP && cigarElement.getOperator() != CigarOperator.SOFT_CLIP)
break;
}
return new Pair<Boolean, CigarElement>(false, null);
}

/**
* Returns the coverage distribution of a list of reads within the desired region.
*
* See getCoverageDistributionOfRead for information on how the coverage is calculated.
*
* @param list the list of reads covering the region
* @param startLocation the first reference coordinate of the region (inclusive)
* @param stopLocation the last reference coordinate of the region (inclusive)
* @return an array with the coverage of each position from startLocation to stopLocation
*/
public static int [] getCoverageDistributionOfReads(List<GATKSAMRecord> list, int startLocation, int stopLocation) {
int [] totalCoverage = new int[stopLocation - startLocation + 1];

for (GATKSAMRecord read : list) {
int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation);
totalCoverage = MathUtils.addArrays(totalCoverage, readCoverage);
}

return totalCoverage;
}

/**
* Returns the coverage distribution of a single read within the desired region.
*
* Note: This function counts DELETIONS as coverage (since the main purpose is to downsample
* reads for variant regions, and deletions count as variants)
*
* @param read the read to get the coverage distribution of
* @param startLocation the first reference coordinate of the region (inclusive)
* @param stopLocation the last reference coordinate of the region (inclusive)
* @return an array with the coverage of each position from startLocation to stopLocation
*/
public static int [] getCoverageDistributionOfRead(GATKSAMRecord read, int startLocation, int stopLocation) {
int [] coverage = new int[stopLocation - startLocation + 1];
int refLocation = read.getSoftStart();
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
switch (cigarElement.getOperator()) {
case S:
case M:
case EQ:
case N:
case X:
case D:
for (int i = 0; i < cigarElement.getLength(); i++) {
if (refLocation >= startLocation && refLocation <= stopLocation) {
int baseCount = read.isReducedRead() ? read.getReducedCount(refLocation - read.getSoftStart()) : 1;
coverage[refLocation - startLocation] += baseCount; // this may be a reduced read, so add the proper number of bases
}
refLocation++;
}
break;

case P:
case I:
case H:
break;
}

if (refLocation > stopLocation)
break;
}
return coverage;
}

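Editor's note: a small usage sketch for the two coverage helpers above. It assumes 'reads' is a List<GATKSAMRecord> overlapping the window; the window itself is made up:

final int start = 10000000, stop = 10000999;                 // 1kb window, inclusive
final int[] coverage = ReadUtils.getCoverageDistributionOfReads(reads, start, stop);
int peak = 0;
for (int i = 1; i < coverage.length; i++)
    if (coverage[i] > coverage[peak])
        peak = i;                                            // index of the deepest locus
System.out.println("deepest locus " + (start + peak) + " has depth " + coverage[peak]);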
/**
* Makes association maps for the reads and loci coverage as described below:
*
* - First: locusToReadMap -- a HashMap that describes for each locus, which reads contribute to its coverage.
* Note: Locus is in reference coordinates.
* Example: Locus => {read1, read2, ..., readN}
*
* - Second: readToLocusMap -- a HashMap that describes for each read what loci it contributes to the coverage.
* Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with true meaning it contributes to the coverage.
* Example: Read => {true, true, false, ... false}
*
* @param readList the list of reads to generate the association mappings
* @param startLocation the first reference coordinate of the region (inclusive)
* @param stopLocation the last reference coordinate of the region (inclusive)
* @return the two hashmaps described above
*/
public static Pair<HashMap<Integer, HashSet<GATKSAMRecord>> , HashMap<GATKSAMRecord, Boolean[]>> getBothReadToLociMappings (List<GATKSAMRecord> readList, int startLocation, int stopLocation) {
int arraySize = stopLocation - startLocation + 1;

HashMap<Integer, HashSet<GATKSAMRecord>> locusToReadMap = new HashMap<Integer, HashSet<GATKSAMRecord>>(2*(stopLocation - startLocation + 1), 0.5f);
HashMap<GATKSAMRecord, Boolean[]> readToLocusMap = new HashMap<GATKSAMRecord, Boolean[]>(2*readList.size(), 0.5f);


for (int i = startLocation; i <= stopLocation; i++)
locusToReadMap.put(i, new HashSet<GATKSAMRecord>()); // Initialize the locusToRead map with empty sets

for (GATKSAMRecord read : readList) {
readToLocusMap.put(read, new Boolean[arraySize]); // Initialize the readToLocus map with empty arrays

int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation);

for (int i=0; i<readCoverage.length; i++) {
int refLocation = i + startLocation;
if (readCoverage[i] > 0) {
// Update the hash for this locus
HashSet<GATKSAMRecord> readSet = locusToReadMap.get(refLocation);
readSet.add(read);

// Add this locus to the read hash
readToLocusMap.get(read)[refLocation - startLocation] = true;
}
else
// Update the boolean array with a 'no coverage' from this read to this locus
readToLocusMap.get(read)[refLocation-startLocation] = false;
}
}
return new Pair<HashMap<Integer, HashSet<GATKSAMRecord>>, HashMap<GATKSAMRecord, Boolean[]>>(locusToReadMap, readToLocusMap);
}
}
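Editor's note: and a sketch of consuming the two association maps (same assumed 'reads', 'start', and 'stop' as in the coverage sketch above):

final Pair<HashMap<Integer, HashSet<GATKSAMRecord>>, HashMap<GATKSAMRecord, Boolean[]>> maps =
    ReadUtils.getBothReadToLociMappings(reads, start, stop);
final HashSet<GATKSAMRecord> readsAtLocus = maps.getFirst().get(start + 50);   // which reads cover this locus
final Boolean[] lociOfRead = maps.getSecond().get(reads.get(0));               // which loci this read covers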

@@ -145,19 +145,19 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
}
}

@Test(enabled = false)
@Test
public void testSnpEffAnnotations() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " +
validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation +
"snpEff2.0.4.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429",
"snpEff2.0.5.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429",
1,
Arrays.asList("51258f5c880bd1ca3eb45a1711335c66")
Arrays.asList("ffbda45b3682c9b83cb541d83f6c15d6")
);
executeTest("Testing SnpEff annotations", spec);
}

@Test(enabled = false)
@Test
public void testSnpEffAnnotationsUnsupportedVersion() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " +

@@ -129,8 +129,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testOutputParameter() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" );
e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "42e4ea7878ef8d96215accb3ba4e97b7" );
e.put( "--output_mode EMIT_ALL_SITES", "e0443c720149647469f2a2f3fb73942f" );
e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "553f6b4cbf380885bec9dd634cf68742" );
e.put( "--output_mode EMIT_ALL_SITES", "6d8624e45ad9dae5803ac705b39e4ffa" );

for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(

@@ -265,7 +265,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("fa4f3ee67d98b64102a8a3ec81a3bc81"));
Arrays.asList("c60a44ba94a80a0cb1fba8b6f90a13cd"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1);
}

@@ -275,7 +275,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("df90890e43d735573a3b3e4f289ca46b"));
Arrays.asList("36ce53ae4319718ad9c8ae391deebc8c"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2);
}

@@ -285,7 +285,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1,
Arrays.asList("cff6dd0f4eb1ef0b6fc476da6ffead19"));
Arrays.asList("d356cbaf240d7025d1aecdabaff3a3e0"));
executeTest("test MultiSample Pilot2 indels with complicated records", spec3);
}

@@ -294,11 +294,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
Arrays.asList("1e2a4aab26e9ab0dae709d33a669e036"));
Arrays.asList("947c12ef2a8c29ae787cd11be8c565c8"));
executeTest("test MultiSample Phase1 indels with complicated records", spec4);
}

@Test(enabled = false)
@Test
public void testSnpEffAnnotationRequestedWithoutRodBinding() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000 " +

@@ -118,6 +118,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
Arrays.asList(md5));
executeTest("testTableRecalibrator1", spec);
}
else {
throw new IllegalStateException("testTableRecalibrator1: paramsFile was null");
}
}

@Test

@@ -144,7 +147,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
}
}

@Test
@Test(dependsOnMethods = "testCountCovariates1")
public void testTableRecalibratorMaxQ70() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "0b7123ae9f4155484b68e4a4f96c5504" );

@@ -170,6 +173,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
Arrays.asList(md5));
executeTest("testTableRecalibratorMaxQ70", spec);
}
else {
throw new IllegalStateException("testTableRecalibratorMaxQ70: paramsFile was null");
}
}
}

@@ -199,7 +205,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
}
}

@Test
@Test(dependsOnMethods = "testCountCovariatesSolidIndelsRemoveRefBias")
public void testTableRecalibratorSolidIndelsRemoveRefBias() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "2ad4c17ac3ed380071137e4e53a398a5" );

@@ -224,6 +230,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
Arrays.asList(md5));
executeTest("testTableRecalibratorSolidIndelsRemoveRefBias", spec);
}
else {
throw new IllegalStateException("testTableRecalibratorSolidIndelsRemoveRefBias: paramsFile was null");
}
}
}

@@ -305,7 +314,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
}
}

@Test
@Test(dependsOnMethods = "testCountCovariatesNoIndex")
public void testTableRecalibratorNoIndex() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "991f093a0e610df235d28ada418ebf33" );

@@ -329,6 +338,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
Arrays.asList(md5));
executeTest("testTableRecalibratorNoIndex", spec);
}
else {
throw new IllegalStateException("testTableRecalibratorNoIndex: paramsFile was null");
}
}
}

@@ -14,19 +14,19 @@ public class VariantEvalIntegrationTest extends WalkerTest {
private static String cmdRoot = "-T VariantEval" +
" -R " + b36KGReference;

@Test(enabled = false)
@Test
public void testFunctionClassWithSnpeff() {
WalkerTestSpec spec = new WalkerTestSpec(
buildCommandLine(
"-T VariantEval",
"-R " + b37KGReference,
"--dbsnp " + b37dbSNP132,
"--eval " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf",
"--eval " + validationDataLocation + "snpEff2.0.5.AFR.unfiltered.VariantAnnotator.output.vcf",
"-noEV",
"-EV TiTvVariantEvaluator",
"-noST",
"-ST FunctionalClass",
"-L " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf",
"-L " + validationDataLocation + "snpEff2.0.5.AFR.unfiltered.VariantAnnotator.output.vcf",
"-o %s"
),
1,

@@ -450,4 +450,21 @@ public class VariantEvalIntegrationTest extends WalkerTest {
);
executeTest("testIntervalStrat", spec);
}

@Test
public void testModernVCFWithLargeIndels() {
WalkerTestSpec spec = new WalkerTestSpec(
buildCommandLine(
"-T VariantEval",
"-R " + b37KGReference,
"-eval " + validationDataLocation + "/NA12878.HiSeq.WGS.b37_decoy.indel.recalibrated.vcf",
"-L 20",
"-D " + b37dbSNP132,
"-o %s"
),
1,
Arrays.asList("a6f8b32fa732632da13dfe3ddcc73cef")
);
executeTest("testModernVCFWithLargeIndels", spec);
}
}
@@ -26,22 +26,20 @@
package org.broadinstitute.sting.utils;

import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import org.broadinstitute.sting.BaseTest;

import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
import java.util.*;

/**
 * Basic unit test for MathUtils
 */
public class MathUtilsUnitTest extends BaseTest {
@BeforeClass
public void init() { }
public void init() {
}

/**
 * Tests that we get the right values from the binomial distribution
@@ -66,20 +64,20 @@ public class MathUtilsUnitTest extends BaseTest {
public void testMultinomialProbability() {
logger.warn("Executing testMultinomialProbability");

int[] counts0 = { 2, 0, 1 };
double[] probs0 = { 0.33, 0.33, 0.34 };
int[] counts0 = {2, 0, 1};
double[] probs0 = {0.33, 0.33, 0.34};
Assert.assertEquals(MathUtils.multinomialProbability(counts0, probs0), 0.111078, 1e-6);

int[] counts1 = { 10, 20, 30 };
double[] probs1 = { 0.25, 0.25, 0.50 };
int[] counts1 = {10, 20, 30};
double[] probs1 = {0.25, 0.25, 0.50};
Assert.assertEquals(MathUtils.multinomialProbability(counts1, probs1), 0.002870301, 1e-9);

int[] counts2 = { 38, 82, 50, 36 };
double[] probs2 = { 0.25, 0.25, 0.25, 0.25 };
int[] counts2 = {38, 82, 50, 36};
double[] probs2 = {0.25, 0.25, 0.25, 0.25};
Assert.assertEquals(MathUtils.multinomialProbability(counts2, probs2), 1.88221e-09, 1e-10);

int[] counts3 = { 1, 600, 1 };
double[] probs3 = { 0.33, 0.33, 0.34 };
int[] counts3 = {1, 600, 1};
double[] probs3 = {0.33, 0.33, 0.34};
Assert.assertEquals(MathUtils.multinomialProbability(counts3, probs3), 5.20988e-285, 1e-286);
}
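The expected values in this hunk follow directly from the multinomial formula n!/(k1!...km!) * prod(p_i^k_i), and the extreme case counts3 = {1, 600, 1} only stays finite if the computation is carried out in log space. A minimal sketch of such a computation, assuming log10 bookkeeping; this is illustrative, not the MathUtils source:

    public class MultinomialSketch {

        /** Plain log10(n!) by summation; adequate for the sizes used in the test. */
        private static double log10Factorial(int n) {
            double sum = 0.0;
            for (int i = 2; i <= n; i++)
                sum += Math.log10(i);
            return sum;
        }

        /** P(counts | probs) = n!/(k1!...km!) * prod(p_i^k_i), accumulated in log10 space. */
        public static double multinomialProbability(int[] counts, double[] probs) {
            int n = 0;
            for (int k : counts) n += k;
            double log10p = log10Factorial(n);
            for (int i = 0; i < counts.length; i++)
                log10p += counts[i] * Math.log10(probs[i]) - log10Factorial(counts[i]);
            return Math.pow(10.0, log10p);
        }

        public static void main(String[] args) {
            // First case from the test: 3!/(2! 0! 1!) * 0.33^2 * 0.34 = 0.111078
            System.out.println(multinomialProbability(new int[]{2, 0, 1},
                                                      new double[]{0.33, 0.33, 0.34}));
        }
    }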
@@ -123,19 +121,21 @@ public class MathUtilsUnitTest extends BaseTest {
Assert.assertTrue(FiveAlpha.containsAll(BigFiveAlpha));
}

/** Tests that we correctly compute mean and standard deviation from a stream of numbers */
/**
 * Tests that we correctly compute mean and standard deviation from a stream of numbers
 */
@Test
public void testRunningAverage() {
logger.warn("Executing testRunningAverage");

int [] numbers = {1,2,4,5,3,128,25678,-24};
int[] numbers = {1, 2, 4, 5, 3, 128, 25678, -24};
MathUtils.RunningAverage r = new MathUtils.RunningAverage();

for ( int i = 0 ; i < numbers.length ; i++ ) r.add((double)numbers[i]);
for (int i = 0; i < numbers.length; i++) r.add((double) numbers[i]);

Assert.assertEquals((long)numbers.length, r.observationCount());
Assert.assertTrue(r.mean()- 3224.625 < 2e-10 );
Assert.assertTrue(r.stddev()-9072.6515881128 < 2e-10);
Assert.assertEquals((long) numbers.length, r.observationCount());
Assert.assertTrue(r.mean() - 3224.625 < 2e-10);
Assert.assertTrue(r.stddev() - 9072.6515881128 < 2e-10);
}

@Test
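MathUtils.RunningAverage accumulates mean and standard deviation in one pass over the stream. A minimal sketch of the usual way to build such an accumulator, Welford's online algorithm, assuming the sample (n-1) normalization that reproduces the expected stddev of 9072.6515881128; the class and method names below are illustrative, not the GATK API:

    public class RunningAverageSketch {
        private long n = 0;
        private double mean = 0.0;
        private double m2 = 0.0;  // running sum of squared deviations from the mean

        public void add(double x) {
            n++;
            double delta = x - mean;
            mean += delta / n;
            m2 += delta * (x - mean);  // note: uses the already-updated mean
        }

        public long observationCount() { return n; }
        public double mean() { return mean; }
        public double stddev() { return Math.sqrt(m2 / (n - 1)); }  // sample stddev; needs n >= 2

        public static void main(String[] args) {
            RunningAverageSketch r = new RunningAverageSketch();
            for (int x : new int[]{1, 2, 4, 5, 3, 128, 25678, -24}) r.add(x);
            System.out.println(r.mean());    // 3224.625
            System.out.println(r.stddev());  // ~9072.6515881128
        }
    }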
@@ -174,4 +174,56 @@ public class MathUtilsUnitTest extends BaseTest {
Assert.assertEquals(MathUtils.log10Factorial(12342), 45138.26, 1e-1);
}

@Test(enabled = true)
public void testRandomSubset() {
Integer[] x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
Assert.assertEquals(MathUtils.randomSubset(x, 0).length, 0);
Assert.assertEquals(MathUtils.randomSubset(x, 1).length, 1);
Assert.assertEquals(MathUtils.randomSubset(x, 2).length, 2);
Assert.assertEquals(MathUtils.randomSubset(x, 3).length, 3);
Assert.assertEquals(MathUtils.randomSubset(x, 4).length, 4);
Assert.assertEquals(MathUtils.randomSubset(x, 5).length, 5);
Assert.assertEquals(MathUtils.randomSubset(x, 6).length, 6);
Assert.assertEquals(MathUtils.randomSubset(x, 7).length, 7);
Assert.assertEquals(MathUtils.randomSubset(x, 8).length, 8);
Assert.assertEquals(MathUtils.randomSubset(x, 9).length, 9);
Assert.assertEquals(MathUtils.randomSubset(x, 10).length, 10);
Assert.assertEquals(MathUtils.randomSubset(x, 11).length, 10);

for (int i = 0; i < 25; i++)
Assert.assertTrue(hasUniqueElements(MathUtils.randomSubset(x, 5)));

}

@Test(enabled = true)
public void testArrayShuffle() {
Integer[] x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
for (int i = 0; i < 25; i++) {
Object[] t = MathUtils.arrayShuffle(x);
Assert.assertTrue(hasUniqueElements(t));
Assert.assertTrue(hasAllElements(x, t));
}
}

private boolean hasUniqueElements(Object[] x) {
for (int i = 0; i < x.length; i++)
for (int j = i + 1; j < x.length; j++)
if (x[i].equals(x[j]) || x[i] == x[j])
return false;
return true;
}

private boolean hasAllElements(final Object[] expected, final Object[] actual) {
HashSet<Object> set = new HashSet<Object>();
set.addAll(Arrays.asList(expected));
set.removeAll(Arrays.asList(actual));
return set.isEmpty();
}


private void p (Object []x) {
for (Object v: x)
System.out.print((Integer) v + " ");
System.out.println();
}
}
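The properties these tests assert (a shuffle is a permutation with exactly the input's elements; randomSubset never repeats an element and caps its size at the array length) are what a Fisher-Yates shuffle provides. One plausible implementation, sketched; this is not the MathUtils source:

    import java.util.Random;

    public class ShuffleSketch {
        private static final Random RNG = new Random();

        /** Fisher-Yates shuffle of a copy of the input; the original is untouched. */
        public static Object[] arrayShuffle(Object[] in) {
            Object[] out = in.clone();
            for (int i = out.length - 1; i > 0; i--) {
                int j = RNG.nextInt(i + 1);  // uniform in [0, i]
                Object tmp = out[i];
                out[i] = out[j];
                out[j] = tmp;
            }
            return out;
        }

        /** First n elements of a shuffle: a random subset without replacement, capped at the array length. */
        public static Object[] randomSubset(Object[] in, int n) {
            Object[] shuffled = arrayShuffle(in);
            Object[] subset = new Object[Math.min(n, in.length)];
            System.arraycopy(shuffled, 0, subset, 0, subset.length);
            return subset;
        }
    }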
@@ -112,8 +112,9 @@ public class ReadClipperTestUtils {
}
}

if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.INSERTION && endingOp != CigarOperator.INSERTION)
return true; // we don't accept reads starting or ending in deletions (add any other constraint here)
// if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.INSERTION && endingOp != CigarOperator.INSERTION)
if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION)
return true; // we don't accept reads starting or ending in deletions (add any other constraint here)
}

return false;

@@ -190,4 +191,18 @@ public class ReadClipperTestUtils {
return invertedCigar;
}

/**
 * Checks whether or not the read has any cigar element that is not H or S
 *
 * @param read
 * @return true if it has any M, I or D, false otherwise
 */
public static boolean readHasNonClippedBases(GATKSAMRecord read) {
for (CigarElement cigarElement : read.getCigar().getCigarElements())
if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP)
return true;
return false;
}

}
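readHasNonClippedBases is used later in this commit to guard assertUnclippedLimits against reads that were clipped down to nothing. A hypothetical demo of its behavior, assuming makeReadFromCigar accepts any parsed cigar and using net.sf.samtools.TextCigarCodec for parsing; the cigars are made up for illustration:

    import net.sf.samtools.TextCigarCodec;
    import org.broadinstitute.sting.utils.sam.GATKSAMRecord;

    public class ReadHasNonClippedBasesDemo {
        public static void main(String[] args) {
            // Soft clips around aligned bases: the read still has "real" bases.
            GATKSAMRecord aligned = ReadClipperTestUtils.makeReadFromCigar(
                    TextCigarCodec.getSingleton().decode("2S6M2S"));
            // Nothing but clips: no aligned bases remain.
            GATKSAMRecord clipsOnly = ReadClipperTestUtils.makeReadFromCigar(
                    TextCigarCodec.getSingleton().decode("5S5H"));

            System.out.println(ReadClipperTestUtils.readHasNonClippedBases(aligned));   // true
            System.out.println(ReadClipperTestUtils.readHasNonClippedBases(clipsOnly)); // false
        }
    }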
@@ -30,12 +30,12 @@ import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;

import java.util.HashMap;
import java.util.List;

/**

@@ -59,10 +59,11 @@ public class ReadClipperUnitTest extends BaseTest {
int alnStart = read.getAlignmentStart();
int alnEnd = read.getAlignmentEnd();
int readLength = alnStart - alnEnd;
for (int i=0; i<readLength/2; i++) {
for (int i = 0; i < readLength / 2; i++) {
GATKSAMRecord clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, alnStart + i, alnEnd - i);
Assert.assertTrue(clippedRead.getAlignmentStart() >= alnStart + i, String.format("Clipped alignment start is less than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString()));
Assert.assertTrue(clippedRead.getAlignmentEnd() <= alnEnd + i, String.format("Clipped alignment end is greater than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString()));
assertUnclippedLimits(read, clippedRead);
}
}
}

@@ -72,12 +73,14 @@ public class ReadClipperUnitTest extends BaseTest {
for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
int readLength = read.getReadLength();
for (int i=0; i<readLength; i++) {
for (int i = 0; i < readLength; i++) {
GATKSAMRecord clipLeft = ReadClipper.hardClipByReadCoordinates(read, 0, i);
Assert.assertTrue(clipLeft.getReadLength() <= readLength - i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipLeft.getCigarString()));
Assert.assertTrue(clipLeft.getReadLength() <= readLength - i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipLeft.getCigarString()));
assertUnclippedLimits(read, clipLeft);

GATKSAMRecord clipRight = ReadClipper.hardClipByReadCoordinates(read, i, readLength-1);
GATKSAMRecord clipRight = ReadClipper.hardClipByReadCoordinates(read, i, readLength - 1);
Assert.assertTrue(clipRight.getReadLength() <= i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipRight.getCigarString()));
assertUnclippedLimits(read, clipRight);
}
}
}

@@ -86,19 +89,27 @@ public class ReadClipperUnitTest extends BaseTest {
public void testHardClipByReferenceCoordinates() {
for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
int alnStart = read.getAlignmentStart();
int alnEnd = read.getAlignmentEnd();
for (int i=alnStart; i<=alnEnd; i++) {
if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side
GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(alnStart, i);
if (!clipLeft.isEmpty())
Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString()));
int start = read.getSoftStart();
int stop = read.getSoftEnd();

// System.out.println(String.format("CIGAR: %s (%d, %d)", cigar.toString(), start, stop));

// if (ReadUtils.readIsEntirelyInsertion(read))
// System.out.println("debug");

for (int i = start; i <= stop; i++) {
GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, i);
if (!clipLeft.isEmpty()) {
// System.out.println(String.format("\t left [%d] %s -> %s ", i-start+1, cigar.toString(), clipLeft.getCigarString()));
Assert.assertTrue(clipLeft.getAlignmentStart() >= Math.min(read.getAlignmentEnd(), i + 1), String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString()));
assertUnclippedLimits(read, clipLeft);
}

if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side
GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, alnEnd);
if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those.
Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString()));
GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, -1);
if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those.
// System.out.println(String.format("\t right [%d] %s -> %s ", i-start+1, cigar.toString(), clipRight.getCigarString()));
Assert.assertTrue(clipRight.getAlignmentEnd() <= Math.max(read.getAlignmentStart(), i - 1), String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString()));
assertUnclippedLimits(read, clipRight);
}
}
}

@@ -111,10 +122,14 @@ public class ReadClipperUnitTest extends BaseTest {
int alnStart = read.getAlignmentStart();
int alnEnd = read.getAlignmentEnd();
if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side
for (int i=alnStart; i<=alnEnd; i++) {
GATKSAMRecord clipLeft = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, i);
if (!clipLeft.isEmpty())
Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString()));
for (int i = alnStart; i <= alnEnd; i++) {
GATKSAMRecord clipLeft = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, i);

if (!clipLeft.isEmpty()) {
// System.out.println(String.format("Left Tail [%d]: %s (%d,%d,%d : %d,%d,%d) -> %s (%d,%d,%d : %d,%d,%d)", i, cigar.toString(), read.getUnclippedStart(), read.getSoftStart(), read.getAlignmentStart(), read.getAlignmentEnd(), read.getSoftEnd(), read.getUnclippedEnd(), clipLeft.getCigarString(), clipLeft.getUnclippedStart(), clipLeft.getSoftStart(), clipLeft.getAlignmentStart(), clipLeft.getAlignmentEnd(), clipLeft.getSoftEnd(), clipLeft.getUnclippedEnd()));
Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString()));
assertUnclippedLimits(read, clipLeft);
}
}
}
}

@@ -127,10 +142,12 @@ public class ReadClipperUnitTest extends BaseTest {
int alnStart = read.getAlignmentStart();
int alnEnd = read.getAlignmentEnd();
if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side
for (int i=alnStart; i<=alnEnd; i++) {
for (int i = alnStart; i <= alnEnd; i++) {
GATKSAMRecord clipRight = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, i);
if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those.
if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those.
Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString()));
assertUnclippedLimits(read, clipRight);
}
}
}
}

@@ -145,43 +162,36 @@ public class ReadClipperUnitTest extends BaseTest {
for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
int readLength = read.getReadLength();
byte [] quals = new byte[readLength];
byte[] quals = new byte[readLength];

for (int nLowQualBases = 0; nLowQualBases < readLength; nLowQualBases++) {

// create a read with nLowQualBases in the left tail
Utils.fillArrayWithByte(quals, HIGH_QUAL);
Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases in the left tail
for (int addLeft = 0; addLeft < nLowQualBases; addLeft++)
quals[addLeft] = LOW_QUAL;
read.setBaseQualities(quals);
GATKSAMRecord clipLeft = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL);

// Tests
assertUnclippedLimits(read, clipLeft); // Make sure limits haven't changed
assertNoLowQualBases(clipLeft, LOW_QUAL); // Make sure the low qualities are gone
Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped
String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipLeft.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipLeft.getCigarString()));

// Make sure the low qualities are gone
assertNoLowQualBases(clipLeft, LOW_QUAL);

// Can't run this test with the current contract of no hanging insertions
// Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipLeft.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipLeft.getCigarString()));

// create a read with nLowQualBases in the right tail
Utils.fillArrayWithByte(quals, HIGH_QUAL);
Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases in the right tail
for (int addRight = 0; addRight < nLowQualBases; addRight++)
quals[readLength - addRight - 1] = LOW_QUAL;
read.setBaseQualities(quals);
GATKSAMRecord clipRight = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL);

// Tests
// System.out.println(String.format("Debug [%d]: %s -> %s / %s", nLowQualBases, cigar.toString(), clipLeft.getCigarString(), clipRight.getCigarString()));

// Make sure the low qualities are gone
assertNoLowQualBases(clipRight, LOW_QUAL);
assertUnclippedLimits(read, clipRight); // Make sure limits haven't changed
assertNoLowQualBases(clipRight, LOW_QUAL); // Make sure the low qualities are gone
Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped
String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipRight.getCigarString()));

// Make sure we haven't clipped any high quals -- Can't run this test with the current contract of no hanging insertions
//Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipRight.getCigarString()));

// create a read with nLowQualBases in the both tails
if (nLowQualBases <= readLength/2) {
Utils.fillArrayWithByte(quals, HIGH_QUAL);
if (nLowQualBases <= readLength / 2) {
Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases on both tails
for (int addBoth = 0; addBoth < nLowQualBases; addBoth++) {
quals[addBoth] = LOW_QUAL;
quals[readLength - addBoth - 1] = LOW_QUAL;
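All three cases here (left tail, right tail, both tails) exercise ReadClipper.hardClipLowQualEnds, whose effect assertNoLowQualBases pins down: after clipping, no base with quality <= LOW_QUAL survives. As a mental model only, not the GATK implementation, the operation amounts to scanning inward from each end until a base above the threshold is found:

    public class LowQualEndsSketch {
        /** Returns {firstKept, lastKept} indices, or null if every base is <= lowQual. */
        static int[] keptWindow(byte[] quals, byte lowQual) {
            int left = 0;
            while (left < quals.length && quals[left] <= lowQual) left++;  // eat the left tail
            if (left == quals.length) return null;                         // read fully clipped
            int right = quals.length - 1;
            while (quals[right] <= lowQual) right--;                       // eat the right tail
            return new int[]{left, right};
        }

        public static void main(String[] args) {
            byte[] quals = {2, 2, 20, 20, 20, 2};    // low-qual (2) bases on both tails
            int[] w = keptWindow(quals, (byte) 2);
            System.out.println(w[0] + ".." + w[1]);  // 2..4 -- only the high-qual core survives
        }
    }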
@@ -189,83 +199,25 @@ public class ReadClipperUnitTest extends BaseTest {
read.setBaseQualities(quals);
GATKSAMRecord clipBoth = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL);

// Tests

// Make sure the low qualities are gone
assertNoLowQualBases(clipBoth, LOW_QUAL);

// Can't run this test with the current contract of no hanging insertions
//Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - (2*nLowQualBases), read.getCigarString(), clipBoth.getCigarString()));
assertUnclippedLimits(read, clipBoth); // Make sure limits haven't changed
assertNoLowQualBases(clipBoth, LOW_QUAL); // Make sure the low qualities are gone
Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped
String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - (2 * nLowQualBases), read.getCigarString(), clipBoth.getCigarString()));
}
}
// logger.warn(String.format("Testing %s for all combinations of low/high qual... PASSED", read.getCigarString()));
}

// ONE OFF Testing clipping that ends inside an insertion ( Ryan's bug )
final byte[] BASES = {'A','C','G','T','A','C','G','T'};
final byte[] QUALS = {2, 2, 2, 2, 20, 20, 20, 2};
final String CIGAR = "1S1M5I1S";

final byte[] CLIPPED_BASES = {};
final byte[] CLIPPED_QUALS = {};
final String CLIPPED_CIGAR = "";


GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(BASES, QUALS, CIGAR);
GATKSAMRecord expected = ArtificialSAMUtils.createArtificialRead(CLIPPED_BASES, CLIPPED_QUALS, CLIPPED_CIGAR);

ReadClipperTestUtils.assertEqualReads(ReadClipper.hardClipLowQualEnds(read, (byte) 2), expected);
}

@Test(enabled = true)
public void testHardClipSoftClippedBases() {

// Generate a list of cigars to test
for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
GATKSAMRecord clippedRead = ReadClipper.hardClipSoftClippedBases(read);
CigarCounter original = new CigarCounter(read);
CigarCounter clipped = new CigarCounter(clippedRead);

int sumHardClips = 0;
int sumMatches = 0;

boolean tail = true;
for (CigarElement element : read.getCigar().getCigarElements()) {
// Assuming cigars are well formed, if we see S or H, it means we're on the tail (left or right)
if (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP)
tail = true;

// Adds all H, S and D's (next to hard/soft clips).
// All these should be hard clips after clipping.
if (tail && (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP || element.getOperator() == CigarOperator.DELETION))
sumHardClips += element.getLength();

// this means we're no longer on the tail (insertions can still potentially be the tail because
// of the current contract of clipping out hanging insertions
else if (element.getOperator() != CigarOperator.INSERTION)
tail = false;

// Adds all matches to verify that they remain the same after clipping
if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH)
sumMatches += element.getLength();
}

for (CigarElement element : clippedRead.getCigar().getCigarElements()) {
// Test if clipped read has Soft Clips (shouldn't have any!)
Assert.assertTrue( element.getOperator() != CigarOperator.SOFT_CLIP, String.format("Cigar %s -> %s -- FAILED (resulting cigar has soft clips)", read.getCigarString(), clippedRead.getCigarString()));

// Keep track of the total number of Hard Clips after clipping to make sure everything was accounted for
if (element.getOperator() == CigarOperator.HARD_CLIP)
sumHardClips -= element.getLength();

// Make sure all matches are still there
if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH)
sumMatches -= element.getLength();
}
Assert.assertTrue( sumHardClips == 0, String.format("Cigar %s -> %s -- FAILED (number of hard clips mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumHardClips));
Assert.assertTrue( sumMatches == 0, String.format("Cigar %s -> %s -- FAILED (number of matches mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumMatches));


// logger.warn(String.format("Cigar %s -> %s -- PASSED!", read.getCigarString(), clippedRead.getCigarString()));
assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed
original.assertHardClippingSoftClips(clipped); // Make sure we have only clipped SOFT_CLIPS
}
}

@@ -276,38 +228,39 @@ public class ReadClipperUnitTest extends BaseTest {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
GATKSAMRecord clippedRead = ReadClipper.hardClipLeadingInsertions(read);

assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed

int expectedLength = read.getReadLength() - leadingCigarElementLength(read.getCigar(), CigarOperator.INSERTION);
if (cigarHasElementsDifferentThanInsertionsAndHardClips(read.getCigar()))
expectedLength -= leadingCigarElementLength(ReadClipperTestUtils.invertCigar(read.getCigar()), CigarOperator.INSERTION);

if (! clippedRead.isEmpty()) {
if (!clippedRead.isEmpty()) {
Assert.assertEquals(expectedLength, clippedRead.getReadLength(), String.format("%s -> %s", read.getCigarString(), clippedRead.getCigarString())); // check that everything else is still there
Assert.assertFalse(startsWithInsertion(clippedRead.getCigar())); // check that the insertions are gone
}
else
} else
Assert.assertTrue(expectedLength == 0, String.format("expected length: %d", expectedLength)); // check that the read was expected to be fully clipped
}
}
}

@Test(enabled = true)
public void testRevertSoftClippedBases()
{
for (Cigar cigar: cigarList) {
public void testRevertSoftClippedBases() {
for (Cigar cigar : cigarList) {
final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP);
final int tailSoftClips = leadingCigarElementLength(ReadClipperTestUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP);

final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
final GATKSAMRecord unclipped = ReadClipper.revertSoftClippedBases(read);

if ( leadingSoftClips > 0 || tailSoftClips > 0) {
assertUnclippedLimits(read, unclipped); // Make sure limits haven't changed

if (leadingSoftClips > 0 || tailSoftClips > 0) {
final int expectedStart = read.getAlignmentStart() - leadingSoftClips;
final int expectedEnd = read.getAlignmentEnd() + tailSoftClips;

Assert.assertEquals(unclipped.getAlignmentStart(), expectedStart);
Assert.assertEquals(unclipped.getAlignmentEnd(), expectedEnd);
}
else
} else
Assert.assertEquals(read.getCigarString(), unclipped.getCigarString());
}
}

@@ -315,12 +268,25 @@ public class ReadClipperUnitTest extends BaseTest {

private void assertNoLowQualBases(GATKSAMRecord read, byte low_qual) {
if (!read.isEmpty()) {
byte [] quals = read.getBaseQualities();
for (int i=0; i<quals.length; i++)
byte[] quals = read.getBaseQualities();
for (int i = 0; i < quals.length; i++)
Assert.assertFalse(quals[i] <= low_qual, String.format("Found low qual (%d) base after hard clipping. Position: %d -- %s", low_qual, i, read.getCigarString()));
}
}

/**
 * Asserts that clipping doesn't change the getUnclippedStart / getUnclippedEnd
 *
 * @param original
 * @param clipped
 */
private void assertUnclippedLimits(GATKSAMRecord original, GATKSAMRecord clipped) {
if (ReadClipperTestUtils.readHasNonClippedBases(clipped)) {
Assert.assertEquals(original.getUnclippedStart(), clipped.getUnclippedStart());
Assert.assertEquals(original.getUnclippedEnd(), clipped.getUnclippedEnd());
}
}

private boolean startsWithInsertion(Cigar cigar) {
return leadingCigarElementLength(cigar, CigarOperator.INSERTION) > 0;
}

@@ -335,10 +301,46 @@ public class ReadClipperUnitTest extends BaseTest {
return 0;
}

private boolean cigarHasElementsDifferentThanInsertionsAndHardClips (Cigar cigar) {
private boolean cigarHasElementsDifferentThanInsertionsAndHardClips(Cigar cigar) {
for (CigarElement cigarElement : cigar.getCigarElements())
if (cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.HARD_CLIP)
return true;
return false;
}

private class CigarCounter {
private HashMap<CigarOperator, Integer> counter;

public Integer getCounterForOp(CigarOperator operator) {
return counter.get(operator);
}

public CigarCounter(GATKSAMRecord read) {
CigarOperator[] operators = CigarOperator.values();
counter = new HashMap<CigarOperator, Integer>(operators.length);

for (CigarOperator op : operators)
counter.put(op, 0);

for (CigarElement cigarElement : read.getCigar().getCigarElements())
counter.put(cigarElement.getOperator(), counter.get(cigarElement.getOperator()) + cigarElement.getLength());
}

public boolean assertHardClippingSoftClips(CigarCounter clipped) {
for (CigarOperator op : counter.keySet()) {
if (op == CigarOperator.HARD_CLIP || op == CigarOperator.SOFT_CLIP) {
int counterTotal = counter.get(CigarOperator.HARD_CLIP) + counter.get(CigarOperator.SOFT_CLIP);
int clippedHard = clipped.getCounterForOp(CigarOperator.HARD_CLIP);
int clippedSoft = clipped.getCounterForOp(CigarOperator.SOFT_CLIP);

Assert.assertEquals(counterTotal, clippedHard);
Assert.assertTrue(clippedSoft == 0);
} else
Assert.assertEquals(counter.get(op), clipped.getCounterForOp(op));
}
return true;
}

}

}
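The CigarCounter bookkeeping above reduces to one invariant: after hardClipSoftClippedBases, the clipped read's hard-clip total equals the original's H + S, no soft clips remain, and every other operator's total is unchanged. A self-contained illustration on plain cigar strings; the cigars and helpers are made up, this is not GATK code:

    import java.util.HashMap;
    import java.util.Map;

    public class CigarCounterDemo {
        /** Tallies per-operator lengths from a cigar string, e.g. "2S6M3S" -> {S=5, M=6}. */
        static Map<Character, Integer> count(String cigar) {
            Map<Character, Integer> c = new HashMap<Character, Integer>();
            int len = 0;
            for (char ch : cigar.toCharArray()) {
                if (Character.isDigit(ch)) {
                    len = len * 10 + (ch - '0');
                } else {
                    Integer cur = c.get(ch);
                    c.put(ch, (cur == null ? 0 : cur) + len);
                    len = 0;
                }
            }
            return c;
        }

        static int get(Map<Character, Integer> c, char op) {
            Integer v = c.get(op);
            return v == null ? 0 : v;
        }

        public static void main(String[] args) {
            Map<Character, Integer> before = count("2S6M3S"); // what hardClipSoftClippedBases sees
            Map<Character, Integer> after = count("2H6M3H");  // what it should produce

            System.out.println(get(after, 'H') == get(before, 'H') + get(before, 'S')); // true: H' = H + S
            System.out.println(get(after, 'S') == 0);                                   // true: no soft clips left
            System.out.println(get(after, 'M') == get(before, 'M'));                    // true: matches untouched
        }
    }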
Binary file not shown.
@@ -1,3 +1,3 @@
<ivy-module version="1.0">
<info organisation="net.sf.snpeff" module="snpeff" revision="2.0.4rc3" status="release" />
<info organisation="net.sf.snpeff" module="snpeff" revision="2.0.5" status="release" />
</ivy-module>