Engine-level BAQ calculation now available in the GATK [totally experimental right now]. -baq argument to disable (NONE), to only use the tags in the BAM (USE_TAG_ONLY), use the tag when present but calculate on the fly as necessary (CALCULATE_AS_NECESSARY), and to always recalculate (RECALCULATE_ALWAYS). BAQ.java contains the complete implementation, for those interested. ValidateBAQWalker is a useful QC tool for verifying the BAQ is correct. BAQSamIterator applies BAQ to reads, as needed, in the engine. Let me know if you encounter any problems. Before prime-time, needs a caching implementation of IndexedFastaReader to avoid loading *lots* of reference data all of the time
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4787 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
bdc7516168
commit
a5b3aac864
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk;
|
||||||
|
|
||||||
import net.sf.picard.filter.SamRecordFilter;
|
import net.sf.picard.filter.SamRecordFilter;
|
||||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||||
|
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
|
|
@ -318,7 +319,7 @@ public abstract class AbstractGenomeAnalysisEngine {
|
||||||
referenceDataSource = openReferenceSequenceFile(argCollection.referenceFile);
|
referenceDataSource = openReferenceSequenceFile(argCollection.referenceFile);
|
||||||
|
|
||||||
validateSuppliedReads();
|
validateSuppliedReads();
|
||||||
readsDataSource = createReadsDataSource(genomeLocParser);
|
readsDataSource = createReadsDataSource(genomeLocParser, referenceDataSource.getReference());
|
||||||
|
|
||||||
sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles);
|
sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles);
|
||||||
|
|
||||||
|
|
@ -540,7 +541,7 @@ public abstract class AbstractGenomeAnalysisEngine {
|
||||||
*
|
*
|
||||||
* @return A data source for the given set of reads.
|
* @return A data source for the given set of reads.
|
||||||
*/
|
*/
|
||||||
private SAMDataSource createReadsDataSource(GenomeLocParser genomeLocParser) {
|
private SAMDataSource createReadsDataSource(GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) {
|
||||||
DownsamplingMethod method = getDownsamplingMethod();
|
DownsamplingMethod method = getDownsamplingMethod();
|
||||||
|
|
||||||
return new SAMDataSource(
|
return new SAMDataSource(
|
||||||
|
|
@ -553,7 +554,8 @@ public abstract class AbstractGenomeAnalysisEngine {
|
||||||
new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
|
new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
|
||||||
filters,
|
filters,
|
||||||
includeReadsWithDeletionAtLoci(),
|
includeReadsWithDeletionAtLoci(),
|
||||||
generateExtendedEvents());
|
generateExtendedEvents(),
|
||||||
|
argCollection.BAQMode, refReader);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,12 @@
|
||||||
package org.broadinstitute.sting.gatk;
|
package org.broadinstitute.sting.gatk;
|
||||||
|
|
||||||
import net.sf.picard.filter.SamRecordFilter;
|
import net.sf.picard.filter.SamRecordFilter;
|
||||||
|
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||||
import net.sf.samtools.SAMFileHeader;
|
import net.sf.samtools.SAMFileHeader;
|
||||||
import net.sf.samtools.SAMFileReader;
|
import net.sf.samtools.SAMFileReader;
|
||||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
|
||||||
|
import org.broadinstitute.sting.utils.BAQ;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
|
@ -35,7 +37,11 @@ public class ReadProperties {
|
||||||
private Collection<SamRecordFilter> supplementalFilters = null;
|
private Collection<SamRecordFilter> supplementalFilters = null;
|
||||||
private boolean includeReadsWithDeletionAtLoci = false;
|
private boolean includeReadsWithDeletionAtLoci = false;
|
||||||
private boolean useOriginalBaseQualities = false;
|
private boolean useOriginalBaseQualities = false;
|
||||||
private boolean generateExtendedEvents = false; // do we want to generate additional piles of "extended" events (indels)
|
private boolean generateExtendedEvents = false;
|
||||||
|
private BAQ.Mode mode = BAQ.Mode.NONE;
|
||||||
|
IndexedFastaSequenceFile refReader = null; // read for BAQ, if desired
|
||||||
|
|
||||||
|
// do we want to generate additional piles of "extended" events (indels)
|
||||||
// immediately after the reference base such event is associated with?
|
// immediately after the reference base such event is associated with?
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -120,6 +126,15 @@ public class ReadProperties {
|
||||||
return useOriginalBaseQualities;
|
return useOriginalBaseQualities;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public BAQ.Mode getBAQMode() {
|
||||||
|
return mode;
|
||||||
|
}
|
||||||
|
|
||||||
|
public IndexedFastaSequenceFile getRefReader() {
|
||||||
|
return refReader;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract the command-line arguments having to do with reads input
|
* Extract the command-line arguments having to do with reads input
|
||||||
* files and store them in an easy-to-work-with package. Constructor
|
* files and store them in an easy-to-work-with package. Constructor
|
||||||
|
|
@ -138,6 +153,8 @@ public class ReadProperties {
|
||||||
* @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method
|
* @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method
|
||||||
* will explicitly list reads with deletion over the current reference base; otherwise, only observed
|
* will explicitly list reads with deletion over the current reference base; otherwise, only observed
|
||||||
* bases will be seen in the pileups, and the deletions will be skipped silently.
|
* bases will be seen in the pileups, and the deletions will be skipped silently.
|
||||||
|
* @param mode How should we apply the BAQ calculation to the reads?
|
||||||
|
* @param refReader if applyBAQ is true, must be a valid pointer to a indexed fasta file reads so we can get the ref bases for BAQ calculation
|
||||||
*/
|
*/
|
||||||
public ReadProperties( List<SAMReaderID> samFiles,
|
public ReadProperties( List<SAMReaderID> samFiles,
|
||||||
SAMFileHeader header,
|
SAMFileHeader header,
|
||||||
|
|
@ -148,7 +165,9 @@ public class ReadProperties {
|
||||||
ValidationExclusion exclusionList,
|
ValidationExclusion exclusionList,
|
||||||
Collection<SamRecordFilter> supplementalFilters,
|
Collection<SamRecordFilter> supplementalFilters,
|
||||||
boolean includeReadsWithDeletionAtLoci,
|
boolean includeReadsWithDeletionAtLoci,
|
||||||
boolean generateExtendedEvents) {
|
boolean generateExtendedEvents,
|
||||||
|
BAQ.Mode mode,
|
||||||
|
IndexedFastaSequenceFile refReader ) {
|
||||||
this.readers = samFiles;
|
this.readers = samFiles;
|
||||||
this.header = header;
|
this.header = header;
|
||||||
this.readBufferSize = readBufferSize;
|
this.readBufferSize = readBufferSize;
|
||||||
|
|
@ -159,5 +178,7 @@ public class ReadProperties {
|
||||||
this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci;
|
this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci;
|
||||||
this.generateExtendedEvents = generateExtendedEvents;
|
this.generateExtendedEvents = generateExtendedEvents;
|
||||||
this.useOriginalBaseQualities = useOriginalBaseQualities;
|
this.useOriginalBaseQualities = useOriginalBaseQualities;
|
||||||
|
this.mode = mode;
|
||||||
|
this.refReader = refReader;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -34,6 +34,7 @@ import org.broadinstitute.sting.commandline.Input;
|
||||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||||
import org.broadinstitute.sting.gatk.DownsamplingMethod;
|
import org.broadinstitute.sting.gatk.DownsamplingMethod;
|
||||||
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
|
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
|
||||||
|
import org.broadinstitute.sting.utils.BAQ;
|
||||||
import org.simpleframework.xml.*;
|
import org.simpleframework.xml.*;
|
||||||
import org.simpleframework.xml.core.Persister;
|
import org.simpleframework.xml.core.Persister;
|
||||||
import org.simpleframework.xml.stream.Format;
|
import org.simpleframework.xml.stream.Format;
|
||||||
|
|
@ -149,6 +150,10 @@ public class GATKArgumentCollection {
|
||||||
return new DownsamplingMethod(downsamplingType,downsampleCoverage,downsampleFraction);
|
return new DownsamplingMethod(downsamplingType,downsampleCoverage,downsampleFraction);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Element(required = false)
|
||||||
|
@Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false)
|
||||||
|
public BAQ.Mode BAQMode = BAQ.Mode.NONE;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the default downsampling method, returned if the user didn't specify any downsampling
|
* Gets the default downsampling method, returned if the user didn't specify any downsampling
|
||||||
* method.
|
* method.
|
||||||
|
|
@ -336,11 +341,9 @@ public class GATKArgumentCollection {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (BTIMergeRule != other.BTIMergeRule)
|
if (BTIMergeRule != other.BTIMergeRule)
|
||||||
return false;
|
return false;
|
||||||
|
if ( BAQMode != other.BAQMode)
|
||||||
// if (other.enableRodWalkers != this.enableRodWalkers) {
|
return false;
|
||||||
// return false;
|
|
||||||
// }
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -29,6 +29,7 @@ import net.sf.samtools.util.CloseableIterator;
|
||||||
import net.sf.picard.filter.SamRecordFilter;
|
import net.sf.picard.filter.SamRecordFilter;
|
||||||
import net.sf.picard.sam.SamFileHeaderMerger;
|
import net.sf.picard.sam.SamFileHeaderMerger;
|
||||||
import net.sf.picard.sam.MergingSamRecordIterator;
|
import net.sf.picard.sam.MergingSamRecordIterator;
|
||||||
|
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.DownsamplingMethod;
|
import org.broadinstitute.sting.gatk.DownsamplingMethod;
|
||||||
|
|
@ -43,6 +44,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||||
import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
|
import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.BAQ;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
|
||||||
|
|
@ -141,6 +143,34 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* See complete constructor. Does not enable BAQ by default.
|
||||||
|
*/
|
||||||
|
public SAMDataSource(
|
||||||
|
List<SAMReaderID> samFiles,
|
||||||
|
GenomeLocParser genomeLocParser,
|
||||||
|
boolean useOriginalBaseQualities,
|
||||||
|
SAMFileReader.ValidationStringency strictness,
|
||||||
|
Integer readBufferSize,
|
||||||
|
DownsamplingMethod downsamplingMethod,
|
||||||
|
ValidationExclusion exclusionList,
|
||||||
|
Collection<SamRecordFilter> supplementalFilters,
|
||||||
|
boolean includeReadsWithDeletionAtLoci,
|
||||||
|
boolean generateExtendedEvents ) {
|
||||||
|
this( samFiles,
|
||||||
|
genomeLocParser,
|
||||||
|
useOriginalBaseQualities,
|
||||||
|
strictness,
|
||||||
|
readBufferSize,
|
||||||
|
downsamplingMethod,
|
||||||
|
exclusionList,
|
||||||
|
supplementalFilters,
|
||||||
|
includeReadsWithDeletionAtLoci,
|
||||||
|
generateExtendedEvents,
|
||||||
|
BAQ.Mode.NONE, null // no BAQ
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new SAM data source given the supplied read metadata.
|
* Create a new SAM data source given the supplied read metadata.
|
||||||
* @param samFiles list of reads files.
|
* @param samFiles list of reads files.
|
||||||
|
|
@ -167,7 +197,9 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
ValidationExclusion exclusionList,
|
ValidationExclusion exclusionList,
|
||||||
Collection<SamRecordFilter> supplementalFilters,
|
Collection<SamRecordFilter> supplementalFilters,
|
||||||
boolean includeReadsWithDeletionAtLoci,
|
boolean includeReadsWithDeletionAtLoci,
|
||||||
boolean generateExtendedEvents
|
boolean generateExtendedEvents,
|
||||||
|
BAQ.Mode Mode,
|
||||||
|
IndexedFastaSequenceFile refReader
|
||||||
) {
|
) {
|
||||||
this.readMetrics = new ReadMetrics();
|
this.readMetrics = new ReadMetrics();
|
||||||
this.genomeLocParser = genomeLocParser;
|
this.genomeLocParser = genomeLocParser;
|
||||||
|
|
@ -213,8 +245,8 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
exclusionList,
|
exclusionList,
|
||||||
supplementalFilters,
|
supplementalFilters,
|
||||||
includeReadsWithDeletionAtLoci,
|
includeReadsWithDeletionAtLoci,
|
||||||
generateExtendedEvents
|
generateExtendedEvents,
|
||||||
);
|
Mode, refReader );
|
||||||
|
|
||||||
// cache the read group id (original) -> read group id (merged)
|
// cache the read group id (original) -> read group id (merged)
|
||||||
// and read group id (merged) -> read group id (original) mappings.
|
// and read group id (merged) -> read group id (original) mappings.
|
||||||
|
|
@ -483,7 +515,8 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)),
|
new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)),
|
||||||
readProperties.getDownsamplingMethod().toFraction,
|
readProperties.getDownsamplingMethod().toFraction,
|
||||||
readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
||||||
readProperties.getSupplementalFilters());
|
readProperties.getSupplementalFilters(),
|
||||||
|
readProperties.getBAQMode(), readProperties.getRefReader());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -506,7 +539,8 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)),
|
new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)),
|
||||||
readProperties.getDownsamplingMethod().toFraction,
|
readProperties.getDownsamplingMethod().toFraction,
|
||||||
readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
||||||
readProperties.getSupplementalFilters());
|
readProperties.getSupplementalFilters(),
|
||||||
|
readProperties.getBAQMode(), readProperties.getRefReader());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -539,7 +573,8 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
StingSAMIterator wrappedIterator,
|
StingSAMIterator wrappedIterator,
|
||||||
Double downsamplingFraction,
|
Double downsamplingFraction,
|
||||||
Boolean noValidationOfReadOrder,
|
Boolean noValidationOfReadOrder,
|
||||||
Collection<SamRecordFilter> supplementalFilters) {
|
Collection<SamRecordFilter> supplementalFilters,
|
||||||
|
BAQ.Mode mode, IndexedFastaSequenceFile refReader ) {
|
||||||
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities);
|
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities);
|
||||||
|
|
||||||
// NOTE: this (and other filtering) should be done before on-the-fly sorting
|
// NOTE: this (and other filtering) should be done before on-the-fly sorting
|
||||||
|
|
@ -552,6 +587,9 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
if (!noValidationOfReadOrder && enableVerification)
|
if (!noValidationOfReadOrder && enableVerification)
|
||||||
wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator);
|
wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator);
|
||||||
|
|
||||||
|
if (mode != BAQ.Mode.NONE)
|
||||||
|
wrappedIterator = new BAQSamIterator(refReader, wrappedIterator, mode);
|
||||||
|
|
||||||
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
|
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
|
||||||
|
|
||||||
return wrappedIterator;
|
return wrappedIterator;
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,37 @@
|
||||||
|
package org.broadinstitute.sting.gatk.iterators;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||||
|
import org.broadinstitute.sting.utils.BAQ;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple iterator that applies Heng's BAQ calculation to a stream of incoming reads
|
||||||
|
*/
|
||||||
|
public class BAQSamIterator implements StingSAMIterator {
|
||||||
|
StingSAMIterator it;
|
||||||
|
BAQ baqHMM = new BAQ(); // creates a BAQ creator with default parameters
|
||||||
|
IndexedFastaSequenceFile refReader = null;
|
||||||
|
BAQ.Mode mode;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new BAMSamIterator using the reference getter refReader and applies the BAM to the reads coming
|
||||||
|
* in from it. See BAQ docs for baqType information.
|
||||||
|
*
|
||||||
|
* @param refReader
|
||||||
|
* @param it
|
||||||
|
* @param mode
|
||||||
|
*/
|
||||||
|
public BAQSamIterator(IndexedFastaSequenceFile refReader, StingSAMIterator it, BAQ.Mode mode) {
|
||||||
|
this.refReader =refReader;
|
||||||
|
this.it = it;
|
||||||
|
this.mode = mode;
|
||||||
|
}
|
||||||
|
|
||||||
|
public SAMRecord next() { return baqHMM.baqRead(it.next(), refReader, mode); }
|
||||||
|
public boolean hasNext() { return this.it.hasNext(); }
|
||||||
|
public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); }
|
||||||
|
public void close() { it.close(); }
|
||||||
|
public Iterator<SAMRecord> iterator() { return this; }
|
||||||
|
}
|
||||||
|
|
@ -4,16 +4,11 @@ import net.sf.samtools.SAMRecord;
|
||||||
import net.sf.samtools.util.RuntimeIOException;
|
import net.sf.samtools.util.RuntimeIOException;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
|
||||||
|
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by IntelliJ IDEA.
|
* Verifies that the incoming stream of reads is correctly sorted
|
||||||
* User: mdepristo
|
|
||||||
* Date: Mar 15, 2009
|
|
||||||
* Time: 6:02:31 PM
|
|
||||||
* To change this template use File | Settings | File Templates.
|
|
||||||
*/
|
*/
|
||||||
public class VerifyingSamIterator implements StingSAMIterator {
|
public class VerifyingSamIterator implements StingSAMIterator {
|
||||||
private GenomeLocParser genomeLocParser;
|
private GenomeLocParser genomeLocParser;
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||||
public class CountReadsWalker extends ReadWalker<Integer, Integer> {
|
public class CountReadsWalker extends ReadWalker<Integer, Integer> {
|
||||||
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) {
|
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) {
|
||||||
//System.out.println(read.format());
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,142 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.CigarElement;
|
||||||
|
import net.sf.samtools.CigarOperator;
|
||||||
|
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.BAQ;
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
|
||||||
|
import java.io.PrintStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Walks over the input data set, calculating the number of reads seen for diagnostic purposes.
|
||||||
|
* Can also count the number of reads matching a given criterion using read filters (see the
|
||||||
|
* --read-filter command line argument). Simplest example of a read-backed analysis.
|
||||||
|
*/
|
||||||
|
@Reference(window=@Window(start=-5,stop=5))
|
||||||
|
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
|
||||||
|
public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
|
||||||
|
@Output(doc="File to which results should be written",required=true)
|
||||||
|
protected PrintStream out;
|
||||||
|
|
||||||
|
@Argument(doc="maximum read length to apply the BAQ calculation too",required=false)
|
||||||
|
protected int maxReadLen = 1000;
|
||||||
|
|
||||||
|
@Argument(doc="only operates on reads with this name",required=false)
|
||||||
|
protected String readName = null;
|
||||||
|
|
||||||
|
@Argument(doc="only prints out detailed information on the impact of BAQ when our implementation differences from the samtools BAQ tag", required=false)
|
||||||
|
protected boolean onlyPrintOnFailures = false;
|
||||||
|
|
||||||
|
@Argument(doc="Also prints out detailed comparison information when for known calculation differences", required=false)
|
||||||
|
protected boolean alsoPrintWarnings = false;
|
||||||
|
|
||||||
|
int counter = 0;
|
||||||
|
|
||||||
|
BAQ baqHMM = new BAQ(1e-3, 0.1, 7, 0); // matches current samtools parameters
|
||||||
|
|
||||||
|
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) {
|
||||||
|
IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference();
|
||||||
|
|
||||||
|
if ( (readName == null || readName.equals(read.getReadName())) && read.getReadLength() <= maxReadLen && BAQ.hasBAQTag(read) ) {
|
||||||
|
byte[] baqFromTag = BAQ.calcBAQFromTag(read, false, false);
|
||||||
|
if (counter++ % 1000 == 0) out.printf("Checking read %s (%d)%n", read.getReadName(), counter);
|
||||||
|
BAQ.BAQCalculationResult baq = baqHMM.calcBAQFromHMM(read, refReader);
|
||||||
|
|
||||||
|
boolean fail = false;
|
||||||
|
boolean print = false;
|
||||||
|
int badi = 0;
|
||||||
|
for ( badi = 0; badi < baqFromTag.length; badi++ ) {
|
||||||
|
if ( baqFromTag[badi] != baq.bq[badi] ) {
|
||||||
|
if (MathUtils.arrayMin(read.getBaseQualities()) == 0) {
|
||||||
|
print = true;
|
||||||
|
out.printf(" different, but Q0 base detected%n");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else if (readHasSoftClip(read)) {
|
||||||
|
print = true;
|
||||||
|
out.printf(" different, but soft clip detected%n");
|
||||||
|
break;
|
||||||
|
} else if (readHasDeletion(read)) {
|
||||||
|
print = true;
|
||||||
|
out.printf(" different, but deletion detected%n");
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
fail = true; print = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( ! onlyPrintOnFailures || ( print && ( alsoPrintWarnings || fail ) ) ) {
|
||||||
|
out.printf(" read length : %d%n", read.getReadLength());
|
||||||
|
out.printf(" read start : %d%n", read.getAlignmentStart());
|
||||||
|
out.printf(" cigar : %s%n", read.getCigarString());
|
||||||
|
out.printf(" read bases : %s%n", new String(read.getReadBases()));
|
||||||
|
out.printf(" ref length : %d%n", baq.refBases.length);
|
||||||
|
out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG));
|
||||||
|
printQuals(" BQ deltas : ", BAQ.getBAQDeltas(read), true);
|
||||||
|
printQuals(" original quals: ", read.getBaseQualities(), true);
|
||||||
|
printQuals(" original quals: ", read.getBaseQualities());
|
||||||
|
printQuals(" tag quals: ", baqFromTag);
|
||||||
|
printQuals(" hmm quals: ", baq.bq);
|
||||||
|
out.printf(" read bases : %s%n", new String(read.getReadBases()));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if ( fail )
|
||||||
|
throw new StingException(String.format("BAQ from read and from HMM differ in read %s at position %d: tag qual = %d, hmm qual = %d",
|
||||||
|
read.getReadName(), badi, baqFromTag[badi], baq.bq[badi]));
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
private final static boolean readHasSoftClip(SAMRecord read) {
|
||||||
|
for (CigarElement e : read.getCigar().getCigarElements()) {
|
||||||
|
if ( e.getOperator() == CigarOperator.SOFT_CLIP )
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private final static boolean readHasDeletion(SAMRecord read) {
|
||||||
|
for (CigarElement e : read.getCigar().getCigarElements()) {
|
||||||
|
if ( e.getOperator() == CigarOperator.DELETION )
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private final void printQuals( String prefix, byte[] quals ) {
|
||||||
|
printQuals(prefix, quals, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
private final void printQuals( String prefix, byte[] quals, boolean asInt ) {
|
||||||
|
out.print(prefix);
|
||||||
|
for ( int i = 0; i < quals.length; i++) {
|
||||||
|
if ( asInt ) {
|
||||||
|
out.print((int)quals[i]);
|
||||||
|
if ( i+1 != quals.length ) out.print(",");
|
||||||
|
} else
|
||||||
|
out.print((char)(quals[i]+33));
|
||||||
|
}
|
||||||
|
out.println();
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer reduceInit() { return 0; }
|
||||||
|
|
||||||
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
|
return value + sum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -0,0 +1,490 @@
|
||||||
|
package org.broadinstitute.sting.utils;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.CigarElement;
|
||||||
|
import net.sf.samtools.CigarOperator;
|
||||||
|
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||||
|
import net.sf.picard.reference.ReferenceSequence;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
/*
|
||||||
|
The topology of the profile HMM:
|
||||||
|
|
||||||
|
/\ /\ /\ /\
|
||||||
|
I[1] I[k-1] I[k] I[L]
|
||||||
|
^ \ \ ^ \ ^ \ \ ^
|
||||||
|
| \ \ | \ | \ \ |
|
||||||
|
M[0] M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L] M[L+1]
|
||||||
|
\ \/ \/ \/ /
|
||||||
|
\ /\ /\ /\ /
|
||||||
|
-> D[k-1] -> D[k] ->
|
||||||
|
|
||||||
|
M[0] points to every {M,I}[k] and every {M,I}[k] points M[L+1].
|
||||||
|
|
||||||
|
On input, _ref is the reference sequence and _query is the query
|
||||||
|
sequence. Both are sequences of 0/1/2/3/4 where 4 stands for an
|
||||||
|
ambiguous residue. iqual is the base quality. c sets the gap open
|
||||||
|
probability, gap extension probability and band width.
|
||||||
|
|
||||||
|
On output, state and q are arrays of length l_query. The higher 30
|
||||||
|
bits give the reference position the query base is matched to and the
|
||||||
|
lower two bits can be 0 (an alignment match) or 1 (an
|
||||||
|
insertion). q[i] gives the phred scaled posterior probability of
|
||||||
|
state[i] being wrong.
|
||||||
|
*/
|
||||||
|
public class BAQ {
|
||||||
|
private final static boolean DEBUG = false;
|
||||||
|
|
||||||
|
public enum Mode {
|
||||||
|
NONE, // don't apply a BAQ at all, the default
|
||||||
|
USE_TAG_ONLY, // only use the TAG, if present, in the reads
|
||||||
|
CALCULATE_AS_NECESSARY, // do HMM BAQ calculation on the fly, as necessary, if there's no tag
|
||||||
|
RECALCULATE_ALWAYS // do HMM BAQ calculation on the fly, regardless of whether there's a tag present
|
||||||
|
}
|
||||||
|
|
||||||
|
public static final String BAQ_TAG = "BQ";
|
||||||
|
|
||||||
|
private static double[] qual2prob = new double[256];
|
||||||
|
static {
|
||||||
|
for (int i = 0; i < 256; ++i)
|
||||||
|
qual2prob[i] = Math.pow(10, -i/10.);
|
||||||
|
}
|
||||||
|
|
||||||
|
private double cd = 1e-3; // gap open probility [1e-3]
|
||||||
|
private double ce = 0.1; // gap extension probability [0.1]
|
||||||
|
private int cb = 7; // band width [7]
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Any bases with Q < MIN_BASE_QUAL are raised up to this base quality
|
||||||
|
*/
|
||||||
|
private int minBaseQual = 4;
|
||||||
|
|
||||||
|
public double getGapOpenProb() {
|
||||||
|
return cd;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getGapExtensionProb() {
|
||||||
|
return ce;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getBandWidth() {
|
||||||
|
return cb;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Use defaults for everything
|
||||||
|
*/
|
||||||
|
public BAQ() { }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new HmmGlocal object with specified parameters
|
||||||
|
*
|
||||||
|
* @param d gap open prob.
|
||||||
|
* @param e gap extension prob.
|
||||||
|
* @param b band width
|
||||||
|
* @param minBaseQual All bases with Q < minBaseQual are up'd to this value
|
||||||
|
*/
|
||||||
|
public BAQ(final double d, final double e, final int b, final int minBaseQual) {
|
||||||
|
cd = d; ce = e; cb = b; this.minBaseQual = minBaseQual;
|
||||||
|
}
|
||||||
|
|
||||||
|
private final static double EM = 0.33333333333;
|
||||||
|
private final static double EI = 0.25;
|
||||||
|
|
||||||
|
// ####################################################################################################
|
||||||
|
//
|
||||||
|
// NOTE -- THIS CODE IS SYNCHRONIZED WITH CODE IN THE SAMTOOLS REPOSITORY. CHANGES TO THIS CODE SHOULD BE
|
||||||
|
// NOTE -- PUSHED BACK TO HENG LI
|
||||||
|
//
|
||||||
|
// Note that _ref and _query are in the special 0-4 encoding [see above for docs]
|
||||||
|
//
|
||||||
|
// ####################################################################################################
|
||||||
|
public int hmm_glocal(final byte[] _ref, final byte[] _query, final byte[] _iqual, int[] state, byte[] q) {
|
||||||
|
if ( _ref == null ) throw new ReviewedStingException("BUG: ref sequence is null");
|
||||||
|
if ( _query == null ) throw new ReviewedStingException("BUG: query sequence is null");
|
||||||
|
if ( _iqual == null ) throw new ReviewedStingException("BUG: query quality vector is null");
|
||||||
|
if ( _query.length != _iqual.length ) throw new ReviewedStingException("BUG: read sequence length != qual length");
|
||||||
|
if ( q != null && q.length != _query.length ) throw new ReviewedStingException("BUG: BAQ quality length != read sequence length");
|
||||||
|
if ( state != null && state.length != _query.length ) throw new ReviewedStingException("BUG: state length != read sequence length");
|
||||||
|
|
||||||
|
int i, k;
|
||||||
|
|
||||||
|
/*** initialization ***/
|
||||||
|
// change coordinates
|
||||||
|
int l_ref = _ref.length;
|
||||||
|
byte[] ref = new byte[l_ref+1];
|
||||||
|
for (i = 0; i < l_ref; ++i) ref[i+1] = _ref[i]; // FIXME: this is silly...
|
||||||
|
int l_query = _query.length;
|
||||||
|
byte[] query = new byte[l_query+1];
|
||||||
|
double[] qual = new double[l_query+1];
|
||||||
|
for (i = 0; i < l_query; ++i) {
|
||||||
|
query[i+1] = _query[i];
|
||||||
|
qual[i+1] = qual2prob[_iqual[i] < minBaseQual ? minBaseQual : _iqual[i]];
|
||||||
|
}
|
||||||
|
|
||||||
|
// set band width
|
||||||
|
int bw2, bw = l_ref > l_query? l_ref : l_query;
|
||||||
|
if (bw > cb) bw = cb;
|
||||||
|
if (bw < Math.abs(l_ref - l_query)) bw = Math.abs(l_ref - l_query);
|
||||||
|
bw2 = bw * 2 + 1;
|
||||||
|
|
||||||
|
// allocate the forward and backward matrices f[][] and b[][] and the scaling array s[]
|
||||||
|
double[][] f = new double[l_query+1][bw2*3 + 6];
|
||||||
|
double[][] b = new double[l_query+1][bw2*3 + 6];
|
||||||
|
double[] s = new double[l_query+2];
|
||||||
|
|
||||||
|
// initialize transition probabilities
|
||||||
|
double sM, sI, bM, bI;
|
||||||
|
sM = sI = 1. / (2 * l_query + 2);
|
||||||
|
bM = (1 - cd) / l_query; bI = cd / l_query; // (bM+bI)*l_query==1
|
||||||
|
double[] m = new double[9];
|
||||||
|
m[0*3+0] = (1 - cd - cd) * (1 - sM); m[0*3+1] = m[0*3+2] = cd * (1 - sM);
|
||||||
|
m[1*3+0] = (1 - ce) * (1 - sI); m[1*3+1] = ce * (1 - sI); m[1*3+2] = 0.;
|
||||||
|
m[2*3+0] = 1 - ce; m[2*3+1] = 0.; m[2*3+2] = ce;
|
||||||
|
|
||||||
|
/*** forward ***/
|
||||||
|
// f[0]
|
||||||
|
f[0][set_u(bw, 0, 0)] = s[0] = 1.;
|
||||||
|
{ // f[1]
|
||||||
|
double[] fi = f[1];
|
||||||
|
double sum;
|
||||||
|
int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end;
|
||||||
|
for (k = beg, sum = 0.; k <= end; ++k) {
|
||||||
|
int u;
|
||||||
|
double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
|
||||||
|
u = set_u(bw, 1, k);
|
||||||
|
fi[u+0] = e * bM; fi[u+1] = EI * bI;
|
||||||
|
sum += fi[u] + fi[u+1];
|
||||||
|
}
|
||||||
|
// rescale
|
||||||
|
s[1] = sum;
|
||||||
|
_beg = set_u(bw, 1, beg); _end = set_u(bw, 1, end); _end += 2;
|
||||||
|
for (k = _beg; k <= _end; ++k) fi[k] /= sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
// f[2..l_query]
|
||||||
|
for (i = 2; i <= l_query; ++i) {
|
||||||
|
double[] fi = f[i], fi1 = f[i-1];
|
||||||
|
double sum, qli = qual[i];
|
||||||
|
int beg = 1, end = l_ref, x, _beg, _end;
|
||||||
|
byte qyi = query[i];
|
||||||
|
x = i - bw; beg = beg > x? beg : x; // band start
|
||||||
|
x = i + bw; end = end < x? end : x; // band end
|
||||||
|
for (k = beg, sum = 0.; k <= end; ++k) {
|
||||||
|
int u, v11, v01, v10;
|
||||||
|
double e;
|
||||||
|
e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli * EM;
|
||||||
|
u = set_u(bw, i, k); v11 = set_u(bw, i-1, k-1); v10 = set_u(bw, i-1, k); v01 = set_u(bw, i, k-1);
|
||||||
|
fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]);
|
||||||
|
fi[u+1] = EI * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]);
|
||||||
|
fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
|
||||||
|
sum += fi[u] + fi[u+1] + fi[u+2];
|
||||||
|
//System.out.println("("+i+","+k+";"+u+"): "+fi[u]+","+fi[u+1]+","+fi[u+2]);
|
||||||
|
}
|
||||||
|
// rescale
|
||||||
|
s[i] = sum;
|
||||||
|
_beg = set_u(bw, i, beg); _end = set_u(bw, i, end); _end += 2;
|
||||||
|
for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum;
|
||||||
|
}
|
||||||
|
{ // f[l_query+1]
|
||||||
|
double sum;
|
||||||
|
for (k = 1, sum = 0.; k <= l_ref; ++k) {
|
||||||
|
int u = set_u(bw, l_query, k);
|
||||||
|
if (u < 3 || u >= bw2*3+3) continue;
|
||||||
|
sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI;
|
||||||
|
}
|
||||||
|
s[l_query+1] = sum; // the last scaling factor
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*** backward ***/
|
||||||
|
// b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
|
||||||
|
for (k = 1; k <= l_ref; ++k) {
|
||||||
|
int u = set_u(bw, l_query, k);
|
||||||
|
double[] bi = b[l_query];
|
||||||
|
if (u < 3 || u >= bw2*3+3) continue;
|
||||||
|
bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1];
|
||||||
|
}
|
||||||
|
// b[l_query-1..1]
|
||||||
|
for (i = l_query - 1; i >= 1; --i) {
|
||||||
|
int beg = 1, end = l_ref, x, _beg, _end;
|
||||||
|
double[] bi = b[i], bi1 = b[i+1];
|
||||||
|
double y = (i > 1)? 1. : 0., qli1 = qual[i+1];
|
||||||
|
byte qyi1 = query[i+1];
|
||||||
|
x = i - bw; beg = beg > x? beg : x;
|
||||||
|
x = i + bw; end = end < x? end : x;
|
||||||
|
for (k = end; k >= beg; --k) {
|
||||||
|
int u, v11, v01, v10;
|
||||||
|
double e;
|
||||||
|
u = set_u(bw, i, k); v11 = set_u(bw, i+1, k+1); v10 = set_u(bw, i+1, k); v01 = set_u(bw, i, k+1);
|
||||||
|
e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 * EM) * bi1[v11];
|
||||||
|
bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e.
|
||||||
|
bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1];
|
||||||
|
bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y;
|
||||||
|
}
|
||||||
|
// rescale
|
||||||
|
_beg = set_u(bw, i, beg); _end = set_u(bw, i, end); _end += 2;
|
||||||
|
for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO -- this appears to be a null operation overall. For debugging only?
|
||||||
|
double pb;
|
||||||
|
{ // b[0]
|
||||||
|
int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
|
||||||
|
double sum = 0.;
|
||||||
|
for (k = end; k >= beg; --k) {
|
||||||
|
int u = set_u(bw, 1, k);
|
||||||
|
double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
|
||||||
|
if (u < 3 || u >= bw2*3+3) continue;
|
||||||
|
sum += e * b[1][u+0] * bM + EI * b[1][u+1] * bI;
|
||||||
|
}
|
||||||
|
pb = b[0][set_u(bw, 0, 0)] = sum / s[0]; // if everything works as is expected, pb == 1.0
|
||||||
|
}
|
||||||
|
|
||||||
|
/*** MAP ***/
|
||||||
|
for (i = 1; i <= l_query; ++i) {
|
||||||
|
double sum = 0., max = 0.;
|
||||||
|
double[] fi = f[i], bi = b[i];
|
||||||
|
int beg = 1, end = l_ref, x, max_k = -1;
|
||||||
|
x = i - bw; beg = beg > x? beg : x;
|
||||||
|
x = i + bw; end = end < x? end : x;
|
||||||
|
for (k = beg; k <= end; ++k) {
|
||||||
|
int u = set_u(bw, i, k);
|
||||||
|
double z;
|
||||||
|
sum += (z = fi[u+0] * bi[u+0]); if (z > max) { max = z; max_k = (k-1)<<2 | 0; }
|
||||||
|
sum += (z = fi[u+1] * bi[u+1]); if (z > max) { max = z; max_k = (k-1)<<2 | 1; }
|
||||||
|
}
|
||||||
|
max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
|
||||||
|
if (state != null) state[i-1] = max_k;
|
||||||
|
if (q != null) {
|
||||||
|
k = (int)(-4.343 * Math.log(1. - max) + .499);
|
||||||
|
q[i-1] = (byte)(k > 100? 99 : k);
|
||||||
|
}
|
||||||
|
//System.out.println("("+pb+","+sum+")"+" ("+(i-1)+","+(max_k>>2)+","+(max_k&3)+","+max+")");
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* helper routine for hmm_glocal
|
||||||
|
*
|
||||||
|
* @param b
|
||||||
|
* @param i
|
||||||
|
* @param k
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
private static int set_u(final int b, final int i, final int k) {
|
||||||
|
int x = i - b;
|
||||||
|
x = x > 0 ? x : 0;
|
||||||
|
return (k + 1 - x) * 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static byte[] bases2indices(byte[] bases) {
|
||||||
|
byte[] out = new byte[bases.length];
|
||||||
|
|
||||||
|
for ( int i = 0; i < bases.length; i++ ) {
|
||||||
|
switch (bases[i]) {
|
||||||
|
case 'A': case 'a': out[i] = 0; break;
|
||||||
|
case 'C': case 'c': out[i] = 1; break;
|
||||||
|
case 'G': case 'g': out[i] = 2; break;
|
||||||
|
case 'T': case 't': out[i] = 3; break;
|
||||||
|
default: out[i] = 4; break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the BAQ attribute from the tag in read. Returns null if no BAQ tag is present.
|
||||||
|
* @param read
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public static byte[] getBAQTag(SAMRecord read) {
|
||||||
|
String s = read.getStringAttribute(BAQ_TAG);
|
||||||
|
return s != null ? s.getBytes() : null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present.
|
||||||
|
* @param read
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public static byte[] getBAQDeltas(SAMRecord read) {
|
||||||
|
byte[] baq = getBAQTag(read);
|
||||||
|
if ( baq != null ) {
|
||||||
|
byte[] deltas = new byte[baq.length];
|
||||||
|
for ( int i = 0; i < deltas.length; i++)
|
||||||
|
deltas[i] = (byte)(-1 * (baq[i] - 64));
|
||||||
|
return deltas;
|
||||||
|
} else
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns true if the read has a BAQ tag, or false otherwise
|
||||||
|
* @param read
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public static boolean hasBAQTag(SAMRecord read) {
|
||||||
|
return read.getStringAttribute(BAQ_TAG) != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a new qual array for read that includes the BAQ adjusted. Does not support on-the-fly BAQ calculation
|
||||||
|
*
|
||||||
|
* @param read the SAMRecord to operate on
|
||||||
|
* @param overwriteOriginalQuals If true, we replace the original qualities scores in the read with their BAQ'd version
|
||||||
|
* @param useRawQualsIfNoBAQTag If useRawQualsIfNoBAQTag is true, then if there's no BAQ annotation we just use the raw quality scores. Throws IllegalStateException is false and no BAQ tag is present
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public static byte[] calcBAQFromTag(SAMRecord read, boolean overwriteOriginalQuals, boolean useRawQualsIfNoBAQTag) {
|
||||||
|
byte[] rawQuals = read.getBaseQualities();
|
||||||
|
byte[] newQuals = rawQuals;
|
||||||
|
byte[] baq = getBAQTag(read);
|
||||||
|
|
||||||
|
if ( baq != null ) {
|
||||||
|
// Offset to base alignment quality (BAQ), of the same length as the read sequence.
|
||||||
|
// At the i-th read base, BAQi = Qi ? (BQi ? 64) where Qi is the i-th base quality.
|
||||||
|
newQuals = overwriteOriginalQuals ? rawQuals : new byte[rawQuals.length];
|
||||||
|
for ( int i = 0; i < rawQuals.length; i++) {
|
||||||
|
int val = rawQuals[i] - (baq[i] - 64);
|
||||||
|
newQuals[i] = val < 0 ? 0 : (byte)val;
|
||||||
|
}
|
||||||
|
} else if ( ! useRawQualsIfNoBAQTag ) {
|
||||||
|
throw new IllegalStateException("Required BAQ tag to be present, but none was on read " + read.getReadName());
|
||||||
|
}
|
||||||
|
|
||||||
|
return newQuals;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static class BAQCalculationResult {
|
||||||
|
public byte[] refBases, rawQuals, readBases, bq;
|
||||||
|
public int refOffset;
|
||||||
|
public int[] state;
|
||||||
|
|
||||||
|
// todo -- optimization: use static growing arrays here
|
||||||
|
public BAQCalculationResult(SAMRecord read, byte[] ref, int refOffset) {
|
||||||
|
// prepares data for calculation
|
||||||
|
rawQuals = read.getBaseQualities();
|
||||||
|
readBases = read.getReadBases();
|
||||||
|
|
||||||
|
// now actually prepare the data structures, and fire up the hmm
|
||||||
|
bq = new byte[rawQuals.length];
|
||||||
|
state = new int[rawQuals.length];
|
||||||
|
this.refBases = ref;
|
||||||
|
this.refOffset = refOffset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int getFirstInsertionOffset(SAMRecord read) {
|
||||||
|
CigarElement e = read.getCigar().getCigarElement(0);
|
||||||
|
if ( e.getOperator() == CigarOperator.I )
|
||||||
|
return e.getLength();
|
||||||
|
else
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int getLastInsertionOffset(SAMRecord read) {
|
||||||
|
CigarElement e = read.getCigar().getCigarElement(read.getCigarLength()-1);
|
||||||
|
if ( e.getOperator() == CigarOperator.I )
|
||||||
|
return e.getLength();
|
||||||
|
else
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) {
|
||||||
|
// start is alignment start - band width / 2 - size of first I element, if there is one. Stop is similar
|
||||||
|
int offset = getBandWidth() / 2;
|
||||||
|
int start = read.getAlignmentStart() - offset - getFirstInsertionOffset(read);
|
||||||
|
int stop = read.getAlignmentEnd() + offset + getLastInsertionOffset(read);
|
||||||
|
|
||||||
|
// now that we have the start and stop, get the reference sequence covering it
|
||||||
|
ReferenceSequence refSeq = refReader.getSubsequenceAt(read.getReferenceName(), start, stop);
|
||||||
|
|
||||||
|
return calcBAQFromHMM(read, refSeq.getBases(), - (offset + getFirstInsertionOffset(read)));
|
||||||
|
}
|
||||||
|
|
||||||
|
// we need to bad ref by at least the bandwidth / 2 on either side
|
||||||
|
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) {
|
||||||
|
// note -- assumes ref is offset from the *CLIPPED* start
|
||||||
|
BAQCalculationResult baqResult = new BAQCalculationResult(read, ref, refOffset);
|
||||||
|
byte[] convSeq = bases2indices(baqResult.readBases);
|
||||||
|
byte[] convRef = bases2indices(baqResult.refBases);
|
||||||
|
|
||||||
|
hmm_glocal(convRef, convSeq, baqResult.rawQuals, baqResult.state, baqResult.bq);
|
||||||
|
|
||||||
|
// cap quals
|
||||||
|
int readI = 0, refI = 0;
|
||||||
|
for ( CigarElement elt : read.getCigar().getCigarElements() ) {
|
||||||
|
int l = elt.getLength();
|
||||||
|
switch (elt.getOperator()) {
|
||||||
|
case N: // cannot handle these
|
||||||
|
return null;
|
||||||
|
case H : case P : // ignore pads and hard clips
|
||||||
|
break;
|
||||||
|
case I : case S :
|
||||||
|
// todo -- is it really the case that we want to treat I and S the same?
|
||||||
|
for ( int i = readI; i < readI + l; i++ ) baqResult.bq[i] = baqResult.rawQuals[i];
|
||||||
|
readI += l;
|
||||||
|
break;
|
||||||
|
case D : refI += l; break;
|
||||||
|
case M :
|
||||||
|
for (int i = readI; i < readI + l; i++) {
|
||||||
|
boolean isIndel = stateIsIndel(baqResult.state[i]);
|
||||||
|
int pos = stateAlignedPosition(baqResult.state[i]);
|
||||||
|
int expectedPos = refI - refOffset + (i - readI);
|
||||||
|
if ( isIndel || pos != expectedPos )
|
||||||
|
// we are an indel or we don't align to our best current position
|
||||||
|
baqResult.bq[i] = 0;
|
||||||
|
else
|
||||||
|
baqResult.bq[i] = baqResult.bq[i] < baqResult.rawQuals[i] ? baqResult.bq[i] : baqResult.rawQuals[i];
|
||||||
|
}
|
||||||
|
readI += l; refI += l;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return baqResult;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Modifies read in place so that the base quality scores are capped by the BAQ calculation. Uses the BAQ
|
||||||
|
* tag if present already and alwaysRecalculate is false, otherwise fires up the HMM and does the BAQ on the fly
|
||||||
|
* using the refReader to obtain the reference bases as needed.
|
||||||
|
*
|
||||||
|
* @param read
|
||||||
|
* @param refReader
|
||||||
|
* @param calculationType
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public SAMRecord baqRead(SAMRecord read, IndexedFastaSequenceFile refReader, Mode calculationType ) {
|
||||||
|
if ( DEBUG ) System.out.printf("BAQ %s read %s%n", calculationType, read.getReadName());
|
||||||
|
if ( calculationType == Mode.NONE ) { // we don't want to do anything
|
||||||
|
; // just fall though
|
||||||
|
} else if ( calculationType == Mode.USE_TAG_ONLY ) {
|
||||||
|
calcBAQFromTag(read, true, true);
|
||||||
|
} else {
|
||||||
|
if ( calculationType == Mode.RECALCULATE_ALWAYS || ! hasBAQTag(read) ) {
|
||||||
|
if ( DEBUG ) System.out.printf(" Calculating BAQ on the fly%n");
|
||||||
|
BAQCalculationResult hmmResult = calcBAQFromHMM(read, refReader);
|
||||||
|
System.arraycopy(hmmResult.bq, 0, read.getBaseQualities(), 0, hmmResult.bq.length);
|
||||||
|
} else {
|
||||||
|
if ( DEBUG ) System.out.printf(" Taking BAQ from tag%n");
|
||||||
|
// this overwrites the original qualities
|
||||||
|
calcBAQFromTag(read, true, false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return read;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean stateIsIndel(int state) {
|
||||||
|
return (state & 3) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int stateAlignedPosition(int state) {
|
||||||
|
return state >> 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -379,6 +379,10 @@ public class MathUtils {
|
||||||
return array[minElementIndex(array)];
|
return array[minElementIndex(array)];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static byte arrayMin(byte[] array) {
|
||||||
|
return array[minElementIndex(array)];
|
||||||
|
}
|
||||||
|
|
||||||
public static int minElementIndex(double[] array) {
|
public static int minElementIndex(double[] array) {
|
||||||
if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");
|
if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");
|
||||||
|
|
||||||
|
|
@ -391,6 +395,18 @@ public class MathUtils {
|
||||||
return minI;
|
return minI;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static int minElementIndex(byte[] array) {
|
||||||
|
if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");
|
||||||
|
|
||||||
|
int minI = -1;
|
||||||
|
for ( int i = 0; i < array.length; i++ ) {
|
||||||
|
if ( minI == -1 || array[i] < array[minI] )
|
||||||
|
minI = i;
|
||||||
|
}
|
||||||
|
|
||||||
|
return minI;
|
||||||
|
}
|
||||||
|
|
||||||
public static int arrayMax(List<Integer> array) {
|
public static int arrayMax(List<Integer> array) {
|
||||||
if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");
|
if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");
|
||||||
if ( array.size() == 0 ) throw new IllegalArgumentException("Array size cannot be 0!");
|
if ( array.size() == 0 ) throw new IllegalArgumentException("Array size cannot be 0!");
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@ import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource;
|
||||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.BAQ;
|
||||||
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
||||||
import org.broadinstitute.sting.utils.classloader.JVMUtils;
|
import org.broadinstitute.sting.utils.classloader.JVMUtils;
|
||||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||||
|
|
@ -146,7 +147,8 @@ public class LocusIteratorByStateUnitTest extends BaseTest {
|
||||||
new ValidationExclusion(),
|
new ValidationExclusion(),
|
||||||
new ArrayList<SamRecordFilter>(),
|
new ArrayList<SamRecordFilter>(),
|
||||||
false,
|
false,
|
||||||
false
|
false,
|
||||||
|
BAQ.Mode.NONE, null // no BAQ
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue