Merge remote-tracking branch 'unstable/master'
This commit is contained in:
commit
b0d68eb0e3
|
|
@ -847,7 +847,6 @@
|
|||
|
||||
|
||||
<path id="testng.default.classpath">
|
||||
<path refid="external.dependencies" />
|
||||
<pathelement location="${java.classes}" />
|
||||
<pathelement location="${scala.classes}" />
|
||||
<pathelement location="${java.contracts}" />
|
||||
|
|
@ -858,6 +857,7 @@
|
|||
<pathelement location="${R.tar.dir}" />
|
||||
<pathelement location="${R.public.scripts.dir}" />
|
||||
<pathelement location="${R.private.scripts.dir}" />
|
||||
<path refid="external.dependencies" />
|
||||
</path>
|
||||
|
||||
<path id="testng.gatk.releasetest.classpath">
|
||||
|
|
@ -1118,6 +1118,11 @@
|
|||
<patternset refid="dependency.mask" />
|
||||
</fileset>
|
||||
</unjar>
|
||||
|
||||
<!-- HACK: The GATK jar itself contains overrides for some core classes. Make sure the GATK.jar is unrolled last. -->
|
||||
<unjar dest="${staging.dir}" overwrite="true">
|
||||
<fileset dir="${dist.dir}" includes="**/GenomeAnalysisTK.jar"/>
|
||||
</unjar>
|
||||
</target>
|
||||
|
||||
<!-- Build a package consisting of all supporting files -->
|
||||
|
|
|
|||
2
ivy.xml
2
ivy.xml
|
|
@ -76,7 +76,7 @@
|
|||
<dependency org="org.apache.poi" name="poi-ooxml" rev="3.8-beta3" />
|
||||
|
||||
<!-- snpEff annotator for pipelines -->
|
||||
<dependency org="net.sf.snpeff" name="snpeff" rev="2.0.2" />
|
||||
<dependency org="net.sf.snpeff" name="snpeff" rev="2.0.4rc3" />
|
||||
|
||||
<!-- Exclude dependencies on sun libraries where the downloads aren't available but included in the jvm. -->
|
||||
<exclude org="javax.servlet" />
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ if ( onCMDLine ) {
|
|||
inputFileName = args[1]
|
||||
outputPDF = args[2]
|
||||
} else {
|
||||
inputFileName = "~/Desktop/broadLocal/GATK/unstable/wgs.jobreport.txt"
|
||||
inputFileName = "Q-26618@gsa4.jobreport.txt"
|
||||
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
|
||||
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
|
||||
outputPDF = NA
|
||||
|
|
@ -129,9 +129,11 @@ plotGroup <- function(groupTable) {
|
|||
# as above, but averaging over all iterations
|
||||
groupAnnotationsNoIteration = setdiff(groupAnnotations, "iteration")
|
||||
if ( dim(sub)[1] > 1 ) {
|
||||
sum = cast(melt(sub, id.vars=groupAnnotationsNoIteration, measure.vars=c("runtime")), ... ~ ., fun.aggregate=c(mean, sd))
|
||||
textplot(as.data.frame(sum), show.rownames=F)
|
||||
title(paste("Job summary for", name, "averaging over all iterations"), cex=3)
|
||||
try({ # need a try here because we will fail to reduce when there's just a single iteration
|
||||
sum = cast(melt(sub, id.vars=groupAnnotationsNoIteration, measure.vars=c("runtime")), ... ~ ., fun.aggregate=c(mean, sd))
|
||||
textplot(as.data.frame(sum), show.rownames=F)
|
||||
title(paste("Job summary for", name, "averaging over all iterations"), cex=3)
|
||||
}, silent=T)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -149,6 +151,35 @@ convertUnits <- function(gatkReportData) {
|
|||
lapply(gatkReportData, convertGroup)
|
||||
}
|
||||
|
||||
#
|
||||
# Plots runtimes by analysis name and exechosts
|
||||
#
|
||||
# Useful to understand the performance of analysis jobs by hosts,
|
||||
# and to debug problematic nodes
|
||||
#
|
||||
plotTimeByHost <- function(gatkReportData) {
|
||||
fields = c("analysisName", "exechosts", "runtime")
|
||||
|
||||
runtimes = data.frame()
|
||||
for ( report in gatkReportData ) {
|
||||
runtimes = rbind(runtimes, report[,fields])
|
||||
}
|
||||
|
||||
plotMe <- function(name, vis) {
|
||||
p = ggplot(data=runtimes, aes(x=exechosts, y=runtime, group=exechosts, color=exechosts))
|
||||
p = p + facet_grid(analysisName ~ ., scale="free")
|
||||
p = p + vis()
|
||||
p = p + xlab("Job execution host")
|
||||
p = p + opts(title = paste(name, "of job runtimes by analysis name and execution host"))
|
||||
p = p + ylab(paste("Distribution of runtimes", RUNTIME_UNITS))
|
||||
p = p + opts(axis.text.x=theme_text(angle=45, hjust=1, vjust=1))
|
||||
print(p)
|
||||
}
|
||||
|
||||
plotMe("Boxplot", geom_boxplot)
|
||||
plotMe("Jittered points", geom_jitter)
|
||||
}
|
||||
|
||||
|
||||
# read the table
|
||||
gatkReportData <- gsa.read.gatkreport(inputFileName)
|
||||
|
|
@ -162,7 +193,9 @@ if ( ! is.na(outputPDF) ) {
|
|||
plotJobsGantt(gatkReportData, T, F)
|
||||
plotJobsGantt(gatkReportData, F, F)
|
||||
plotProgressByTime(gatkReportData)
|
||||
plotTimeByHost(gatkReportData)
|
||||
for ( group in gatkReportData ) {
|
||||
print(group)
|
||||
plotGroup(group)
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,247 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package net.sf.picard.sam;
|
||||
|
||||
import net.sf.picard.PicardException;
|
||||
|
||||
import java.util.*;
|
||||
import java.lang.reflect.Constructor;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
|
||||
/**
|
||||
* Provides an iterator interface for merging multiple underlying iterators into a single
|
||||
* iterable stream. The underlying iterators/files must all have the same sort order unless
|
||||
* the requested output format is unsorted, in which case any combination is valid.
|
||||
*/
|
||||
public class MergingSamRecordIterator implements CloseableIterator<SAMRecord> {
|
||||
private final PriorityQueue<ComparableSamRecordIterator> pq;
|
||||
private final SamFileHeaderMerger samHeaderMerger;
|
||||
private final Collection<SAMFileReader> readers;
|
||||
private final SAMFileHeader.SortOrder sortOrder;
|
||||
private final SAMRecordComparator comparator;
|
||||
|
||||
private boolean initialized = false;
|
||||
private boolean iterationStarted = false;
|
||||
|
||||
/**
|
||||
* Constructs a new merging iterator with the same set of readers and sort order as
|
||||
* provided by the header merger parameter.
|
||||
* @param headerMerger The merged header and contents of readers.
|
||||
* @param forcePresorted True to ensure that the iterator checks the headers of the readers for appropriate sort order.
|
||||
* @deprecated replaced by (SamFileHeaderMerger, Collection<SAMFileReader>, boolean)
|
||||
*/
|
||||
public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, final boolean forcePresorted) {
|
||||
this(headerMerger, headerMerger.getReaders(), forcePresorted);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a new merging iterator with the same set of readers and sort order as
|
||||
* provided by the header merger parameter.
|
||||
* @param headerMerger The merged header and contents of readers.
|
||||
* @param assumeSorted false ensures that the iterator checks the headers of the readers for appropriate sort order.
|
||||
*/
|
||||
public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, Collection<SAMFileReader> readers, final boolean assumeSorted) {
|
||||
this.samHeaderMerger = headerMerger;
|
||||
this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
|
||||
this.comparator = getComparator();
|
||||
this.readers = readers;
|
||||
|
||||
this.pq = new PriorityQueue<ComparableSamRecordIterator>(readers.size());
|
||||
|
||||
for (final SAMFileReader reader : readers) {
|
||||
if (!assumeSorted && this.sortOrder != SAMFileHeader.SortOrder.unsorted &&
|
||||
reader.getFileHeader().getSortOrder() != this.sortOrder){
|
||||
throw new PicardException("Files are not compatible with sort order");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a given SAM file iterator to the merging iterator. Use this to restrict the merged iteration to a given genomic interval,
|
||||
* rather than iterating over every read in the backing file or stream.
|
||||
* @param reader Reader to add to the merging iterator.
|
||||
* @param iterator Iterator traversing over reader contents.
|
||||
*/
|
||||
public void addIterator(final SAMFileReader reader, final CloseableIterator<SAMRecord> iterator) {
|
||||
if(iterationStarted)
|
||||
throw new PicardException("Cannot add another iterator; iteration has already begun");
|
||||
if(!samHeaderMerger.containsHeader(reader.getFileHeader()))
|
||||
throw new PicardException("All iterators to be merged must be accounted for in the SAM header merger");
|
||||
final ComparableSamRecordIterator comparableIterator = new ComparableSamRecordIterator(reader,iterator,comparator);
|
||||
addIfNotEmpty(comparableIterator);
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
private void startIterationIfRequired() {
|
||||
if(initialized)
|
||||
return;
|
||||
for(SAMFileReader reader: readers)
|
||||
addIterator(reader,reader.iterator());
|
||||
iterationStarted = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Close down all open iterators.
|
||||
*/
|
||||
public void close() {
|
||||
// Iterators not in the priority queue have already been closed; only close down the iterators that are still in the priority queue.
|
||||
for(CloseableIterator<SAMRecord> iterator: pq)
|
||||
iterator.close();
|
||||
}
|
||||
|
||||
/** Returns true if any of the underlying iterators has more records, otherwise false. */
|
||||
public boolean hasNext() {
|
||||
startIterationIfRequired();
|
||||
return !this.pq.isEmpty();
|
||||
}
|
||||
|
||||
/** Returns the next record from the top most iterator during merging. */
|
||||
public SAMRecord next() {
|
||||
startIterationIfRequired();
|
||||
|
||||
final ComparableSamRecordIterator iterator = this.pq.poll();
|
||||
final SAMRecord record = iterator.next();
|
||||
addIfNotEmpty(iterator);
|
||||
record.setHeader(this.samHeaderMerger.getMergedHeader());
|
||||
|
||||
// Fix the read group if needs be
|
||||
if (this.samHeaderMerger.hasReadGroupCollisions()) {
|
||||
final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.READ_GROUP_ID);
|
||||
if (oldGroupId != null ) {
|
||||
final String newGroupId = this.samHeaderMerger.getReadGroupId(iterator.getReader().getFileHeader(),oldGroupId);
|
||||
record.setAttribute(ReservedTagConstants.READ_GROUP_ID, newGroupId);
|
||||
}
|
||||
}
|
||||
|
||||
// Fix the program group if needs be
|
||||
if (this.samHeaderMerger.hasProgramGroupCollisions()) {
|
||||
final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.PROGRAM_GROUP_ID);
|
||||
if (oldGroupId != null ) {
|
||||
final String newGroupId = this.samHeaderMerger.getProgramGroupId(iterator.getReader().getFileHeader(),oldGroupId);
|
||||
record.setAttribute(ReservedTagConstants.PROGRAM_GROUP_ID, newGroupId);
|
||||
}
|
||||
}
|
||||
|
||||
// Fix up the sequence indexes if needs be
|
||||
if (this.samHeaderMerger.hasMergedSequenceDictionary()) {
|
||||
if (record.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
record.setReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iterator.getReader().getFileHeader(),record.getReferenceIndex()));
|
||||
}
|
||||
|
||||
if (record.getReadPairedFlag() && record.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
record.setMateReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iterator.getReader().getFileHeader(),record.getMateReferenceIndex()));
|
||||
}
|
||||
}
|
||||
|
||||
return record;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds iterator to priority queue. If the iterator has more records it is added
|
||||
* otherwise it is closed and not added.
|
||||
*/
|
||||
private void addIfNotEmpty(final ComparableSamRecordIterator iterator) {
|
||||
if (iterator.hasNext()) {
|
||||
pq.offer(iterator);
|
||||
}
|
||||
else {
|
||||
iterator.close();
|
||||
}
|
||||
}
|
||||
|
||||
/** Unsupported operation. */
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("MergingSAMRecorderIterator.remove()");
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the right comparator for a given sort order (coordinate, alphabetic). In the
|
||||
* case of "unsorted" it will return a comparator that gives an arbitrary but reflexive
|
||||
* ordering.
|
||||
*/
|
||||
private SAMRecordComparator getComparator() {
|
||||
// For unsorted build a fake comparator that compares based on object ID
|
||||
if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) {
|
||||
return new SAMRecordComparator() {
|
||||
public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) {
|
||||
return System.identityHashCode(lhs) - System.identityHashCode(rhs);
|
||||
}
|
||||
|
||||
public int compare(final SAMRecord lhs, final SAMRecord rhs) {
|
||||
return fileOrderCompare(lhs, rhs);
|
||||
}
|
||||
};
|
||||
}
|
||||
if (samHeaderMerger.hasMergedSequenceDictionary() && sortOrder.equals(SAMFileHeader.SortOrder.coordinate)) {
|
||||
return new MergedSequenceDictionaryCoordinateOrderComparator();
|
||||
}
|
||||
|
||||
// Otherwise try and figure out what kind of comparator to return and build it
|
||||
return this.sortOrder.getComparatorInstance();
|
||||
}
|
||||
|
||||
/** Returns the merged header that the merging iterator is working from. */
|
||||
public SAMFileHeader getMergedHeader() {
|
||||
return this.samHeaderMerger.getMergedHeader();
|
||||
}
|
||||
|
||||
/**
|
||||
* Ugh. Basically does a regular coordinate compare, but looks up the sequence indices in the merged
|
||||
* sequence dictionary. I hate the fact that this extends SAMRecordCoordinateComparator, but it avoids
|
||||
* more copy & paste.
|
||||
*/
|
||||
private class MergedSequenceDictionaryCoordinateOrderComparator extends SAMRecordCoordinateComparator {
|
||||
|
||||
public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) {
|
||||
final int referenceIndex1 = getReferenceIndex(samRecord1);
|
||||
final int referenceIndex2 = getReferenceIndex(samRecord2);
|
||||
if (referenceIndex1 != referenceIndex2) {
|
||||
if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
return 1;
|
||||
} else if (referenceIndex2 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
return -1;
|
||||
} else {
|
||||
return referenceIndex1 - referenceIndex2;
|
||||
}
|
||||
}
|
||||
if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
// Both are unmapped.
|
||||
return 0;
|
||||
}
|
||||
return samRecord1.getAlignmentStart() - samRecord2.getAlignmentStart();
|
||||
}
|
||||
|
||||
private int getReferenceIndex(final SAMRecord samRecord) {
|
||||
if (samRecord.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getReferenceIndex());
|
||||
}
|
||||
if (samRecord.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getMateReferenceIndex());
|
||||
}
|
||||
return SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,744 @@
|
|||
/*
|
||||
* The MIT License
|
||||
*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
package net.sf.picard.sam;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.picard.PicardException;
|
||||
import net.sf.samtools.AbstractSAMHeaderRecord;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMProgramRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import net.sf.samtools.util.SequenceUtil;
|
||||
|
||||
/**
|
||||
* Merges SAMFileHeaders that have the same sequences into a single merged header
|
||||
* object while providing read group translation for cases where read groups
|
||||
* clash across input headers.
|
||||
*/
|
||||
public class SamFileHeaderMerger {
|
||||
//Super Header to construct
|
||||
private final SAMFileHeader mergedHeader;
|
||||
private Collection<SAMFileReader> readers;
|
||||
private final Collection<SAMFileHeader> headers;
|
||||
|
||||
//Translation of old group ids to new group ids
|
||||
private final Map<SAMFileHeader, Map<String, String>> samReadGroupIdTranslation =
|
||||
new IdentityHashMap<SAMFileHeader, Map<String, String>>();
|
||||
|
||||
//the read groups from different files use the same group ids
|
||||
private boolean hasReadGroupCollisions = false;
|
||||
|
||||
//the program records from different files use the same program record ids
|
||||
private boolean hasProgramGroupCollisions = false;
|
||||
|
||||
//Translation of old program group ids to new program group ids
|
||||
private Map<SAMFileHeader, Map<String, String>> samProgramGroupIdTranslation =
|
||||
new IdentityHashMap<SAMFileHeader, Map<String, String>>();
|
||||
|
||||
private boolean hasMergedSequenceDictionary = false;
|
||||
|
||||
// Translation of old sequence dictionary ids to new dictionary ids
|
||||
// This is an IdentityHashMap because it can be quite expensive to compute the hashCode for
|
||||
// large SAMFileHeaders. It is possible that two input files will have identical headers so that
|
||||
// the regular HashMap would fold them together, but the value stored in each of the two
|
||||
// Map entries will be the same, so it should not hurt anything.
|
||||
private final Map<SAMFileHeader, Map<Integer, Integer>> samSeqDictionaryIdTranslationViaHeader =
|
||||
new IdentityHashMap<SAMFileHeader, Map<Integer, Integer>>();
|
||||
|
||||
//HeaderRecordFactory that creates SAMReadGroupRecord instances.
|
||||
private static final HeaderRecordFactory<SAMReadGroupRecord> READ_GROUP_RECORD_FACTORY = new HeaderRecordFactory<SAMReadGroupRecord>() {
|
||||
public SAMReadGroupRecord createRecord(String id, SAMReadGroupRecord srcReadGroupRecord) {
|
||||
return new SAMReadGroupRecord(id, srcReadGroupRecord);
|
||||
}
|
||||
};
|
||||
|
||||
//HeaderRecordFactory that creates SAMProgramRecord instances.
|
||||
private static final HeaderRecordFactory<SAMProgramRecord> PROGRAM_RECORD_FACTORY = new HeaderRecordFactory<SAMProgramRecord>() {
|
||||
public SAMProgramRecord createRecord(String id, SAMProgramRecord srcProgramRecord) {
|
||||
return new SAMProgramRecord(id, srcProgramRecord);
|
||||
}
|
||||
};
|
||||
|
||||
//comparator used to sort lists of program group and read group records
|
||||
private static final Comparator<AbstractSAMHeaderRecord> RECORD_ID_COMPARATOR = new Comparator<AbstractSAMHeaderRecord>() {
|
||||
public int compare(AbstractSAMHeaderRecord o1, AbstractSAMHeaderRecord o2) {
|
||||
return o1.getId().compareTo(o2.getId());
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Create SAMFileHeader with additional information. Required that sequence dictionaries agree.
|
||||
*
|
||||
* @param readers sam file readers to combine
|
||||
* @param sortOrder sort order new header should have
|
||||
* @deprecated replaced by SamFileHeaderMerger(Collection<SAMFileHeader>, SAMFileHeader.SortOrder, boolean)
|
||||
*/
|
||||
public SamFileHeaderMerger(final Collection<SAMFileReader> readers, final SAMFileHeader.SortOrder sortOrder) {
|
||||
this(readers, sortOrder, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create SAMFileHeader with additional information.
|
||||
*
|
||||
* @param readers sam file readers to combine
|
||||
* @param sortOrder sort order new header should have
|
||||
* @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that
|
||||
* all input sequence dictionaries be identical.
|
||||
* @deprecated replaced by SamFileHeaderMerger(Collection<SAMFileHeader>, SAMFileHeader.SortOrder, boolean)
|
||||
*/
|
||||
public SamFileHeaderMerger(final Collection<SAMFileReader> readers, final SAMFileHeader.SortOrder sortOrder, final boolean mergeDictionaries) {
|
||||
this(sortOrder, getHeadersFromReaders(readers), mergeDictionaries);
|
||||
this.readers = readers;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create SAMFileHeader with additional information.. This is the preferred constructor.
|
||||
*
|
||||
* @param sortOrder sort order new header should have
|
||||
* @param headers sam file headers to combine
|
||||
* @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that
|
||||
* all input sequence dictionaries be identical.
|
||||
*/
|
||||
public SamFileHeaderMerger(final SAMFileHeader.SortOrder sortOrder, final Collection<SAMFileHeader> headers, final boolean mergeDictionaries) {
|
||||
this.headers = headers;
|
||||
this.mergedHeader = new SAMFileHeader();
|
||||
|
||||
SAMSequenceDictionary sequenceDictionary;
|
||||
try {
|
||||
sequenceDictionary = getSequenceDictionary(headers);
|
||||
this.hasMergedSequenceDictionary = false;
|
||||
}
|
||||
catch (SequenceUtil.SequenceListsDifferException pe) {
|
||||
if (mergeDictionaries) {
|
||||
sequenceDictionary = mergeSequenceDictionaries(headers);
|
||||
this.hasMergedSequenceDictionary = true;
|
||||
}
|
||||
else {
|
||||
throw pe;
|
||||
}
|
||||
}
|
||||
|
||||
this.mergedHeader.setSequenceDictionary(sequenceDictionary);
|
||||
|
||||
// Set program that creates input alignments
|
||||
for (final SAMProgramRecord program : mergeProgramGroups(headers)) {
|
||||
this.mergedHeader.addProgramRecord(program);
|
||||
}
|
||||
|
||||
// Set read groups for merged header
|
||||
final List<SAMReadGroupRecord> readGroups = mergeReadGroups(headers);
|
||||
this.mergedHeader.setReadGroups(readGroups);
|
||||
this.mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.none);
|
||||
|
||||
this.mergedHeader.setSortOrder(sortOrder);
|
||||
|
||||
for (final SAMFileHeader header : headers) {
|
||||
for (final String comment : header.getComments()) {
|
||||
this.mergedHeader.addComment(comment);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Utilility method to make use with old constructor
|
||||
private static List<SAMFileHeader> getHeadersFromReaders(Collection<SAMFileReader> readers) {
|
||||
List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>(readers.size());
|
||||
for (SAMFileReader reader : readers) {
|
||||
headers.add(reader.getFileHeader());
|
||||
}
|
||||
return headers;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks to see if there are clashes where different readers are using the same read
|
||||
* group IDs. If yes, then those IDs that collided are remapped.
|
||||
*
|
||||
* @param headers headers to combine
|
||||
* @return new list of read groups constructed from all the readers
|
||||
*/
|
||||
private List<SAMReadGroupRecord> mergeReadGroups(final Collection<SAMFileHeader> headers) {
|
||||
//prepare args for mergeHeaderRecords(..) call
|
||||
final HashSet<String> idsThatAreAlreadyTaken = new HashSet<String>();
|
||||
|
||||
final List<HeaderRecordAndFileHeader<SAMReadGroupRecord>> readGroupsToProcess = new LinkedList<HeaderRecordAndFileHeader<SAMReadGroupRecord>>();
|
||||
for (final SAMFileHeader header : headers) {
|
||||
for (final SAMReadGroupRecord readGroup : header.getReadGroups()) {
|
||||
//verify that there are no existing id collisions in this input file
|
||||
if(!idsThatAreAlreadyTaken.add(readGroup.getId()))
|
||||
throw new PicardException("Input file: " + header + " contains more than one RG with the same id (" + readGroup.getId() + ")");
|
||||
|
||||
readGroupsToProcess.add(new HeaderRecordAndFileHeader<SAMReadGroupRecord>(readGroup, header));
|
||||
}
|
||||
idsThatAreAlreadyTaken.clear();
|
||||
}
|
||||
|
||||
final List<SAMReadGroupRecord> result = new LinkedList<SAMReadGroupRecord>();
|
||||
|
||||
hasReadGroupCollisions = mergeHeaderRecords(readGroupsToProcess, READ_GROUP_RECORD_FACTORY, idsThatAreAlreadyTaken, samReadGroupIdTranslation, result);
|
||||
|
||||
//sort the result list by record id
|
||||
Collections.sort(result, RECORD_ID_COMPARATOR);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks to see if there are clashes where different readers are using the same program
|
||||
* group IDs. If yes, then those IDs that collided are remapped.
|
||||
*
|
||||
* @param headers headers to combine
|
||||
* @return new list of program groups constructed from all the readers
|
||||
*/
|
||||
private List<SAMProgramRecord> mergeProgramGroups(final Collection<SAMFileHeader> headers) {
|
||||
|
||||
final List<SAMProgramRecord> overallResult = new LinkedList<SAMProgramRecord>();
|
||||
|
||||
//this Set will accumulate all SAMProgramRecord ids that have been encountered so far.
|
||||
final HashSet<String> idsThatAreAlreadyTaken = new HashSet<String>();
|
||||
|
||||
//need to process all program groups
|
||||
List<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroupsLeftToProcess = new LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>>();
|
||||
for (final SAMFileHeader header : headers) {
|
||||
for (final SAMProgramRecord programGroup : header.getProgramRecords()) {
|
||||
//verify that there are no existing id collisions in this input file
|
||||
if(!idsThatAreAlreadyTaken.add(programGroup.getId()))
|
||||
throw new PicardException("Input file: " + header + " contains more than one PG with the same id (" + programGroup.getId() + ")");
|
||||
|
||||
programGroupsLeftToProcess.add(new HeaderRecordAndFileHeader<SAMProgramRecord>(programGroup, header));
|
||||
}
|
||||
idsThatAreAlreadyTaken.clear();
|
||||
}
|
||||
|
||||
//A program group header (lets say ID=2 PN=B PP=1) may have a PP (previous program) attribute which chains it to
|
||||
//another program group header (lets say ID=1 PN=A) to indicate that the given file was
|
||||
//processed by program A followed by program B. These PP attributes potentially
|
||||
//connect headers into one or more tree structures. Merging is done by
|
||||
//first merging all headers that don't have PP attributes (eg. tree roots),
|
||||
//then updating and merging all headers whose PPs point to the tree-root headers,
|
||||
//and so on until all program group headers are processed.
|
||||
|
||||
//currentProgramGroups is the list of records to merge next. Start by merging the programGroups that don't have a PP attribute (eg. the tree roots).
|
||||
List< HeaderRecordAndFileHeader<SAMProgramRecord> > currentProgramGroups = new LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>>();
|
||||
for(final Iterator<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) {
|
||||
final HeaderRecordAndFileHeader<SAMProgramRecord> pair = programGroupsLeftToProcessIterator.next();
|
||||
if(pair.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG) == null) {
|
||||
programGroupsLeftToProcessIterator.remove();
|
||||
currentProgramGroups.add(pair);
|
||||
}
|
||||
}
|
||||
|
||||
//merge currentProgramGroups
|
||||
while(!currentProgramGroups.isEmpty())
|
||||
{
|
||||
final List<SAMProgramRecord> currentResult = new LinkedList<SAMProgramRecord>();
|
||||
|
||||
hasProgramGroupCollisions |= mergeHeaderRecords(currentProgramGroups, PROGRAM_RECORD_FACTORY, idsThatAreAlreadyTaken, samProgramGroupIdTranslation, currentResult);
|
||||
|
||||
//add currentResults to overallResults
|
||||
overallResult.addAll(currentResult);
|
||||
|
||||
//apply the newly-computed id translations to currentProgramGroups and programGroupsLeftToProcess
|
||||
currentProgramGroups = translateIds(currentProgramGroups, samProgramGroupIdTranslation, false);
|
||||
programGroupsLeftToProcess = translateIds(programGroupsLeftToProcess, samProgramGroupIdTranslation, true);
|
||||
|
||||
//find all records in programGroupsLeftToProcess whose ppId points to a record that was just processed (eg. a record that's in currentProgramGroups),
|
||||
//and move them to the list of programGroupsToProcessNext.
|
||||
LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroupsToProcessNext = new LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>>();
|
||||
for(final Iterator<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) {
|
||||
final HeaderRecordAndFileHeader<SAMProgramRecord> pairLeftToProcess = programGroupsLeftToProcessIterator.next();
|
||||
final Object ppIdOfRecordLeftToProcess = pairLeftToProcess.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG);
|
||||
//find what currentProgramGroups this ppId points to (NOTE: they have to come from the same file)
|
||||
for(final HeaderRecordAndFileHeader<SAMProgramRecord> justProcessedPair : currentProgramGroups) {
|
||||
String idJustProcessed = justProcessedPair.getHeaderRecord().getId();
|
||||
if(pairLeftToProcess.getFileHeader() == justProcessedPair.getFileHeader() && ppIdOfRecordLeftToProcess.equals(idJustProcessed)) {
|
||||
programGroupsLeftToProcessIterator.remove();
|
||||
programGroupsToProcessNext.add(pairLeftToProcess);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
currentProgramGroups = programGroupsToProcessNext;
|
||||
}
|
||||
|
||||
//verify that all records were processed
|
||||
if(!programGroupsLeftToProcess.isEmpty()) {
|
||||
StringBuffer errorMsg = new StringBuffer(programGroupsLeftToProcess.size() + " program groups weren't processed. Do their PP ids point to existing PGs? \n");
|
||||
for( final HeaderRecordAndFileHeader<SAMProgramRecord> pair : programGroupsLeftToProcess ) {
|
||||
SAMProgramRecord record = pair.getHeaderRecord();
|
||||
errorMsg.append("@PG ID:"+record.getProgramGroupId()+" PN:"+record.getProgramName()+" PP:"+record.getPreviousProgramGroupId() +"\n");
|
||||
}
|
||||
throw new PicardException(errorMsg.toString());
|
||||
}
|
||||
|
||||
//sort the result list by record id
|
||||
Collections.sort(overallResult, RECORD_ID_COMPARATOR);
|
||||
|
||||
return overallResult;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Utility method that takes a list of program groups and remaps all their
|
||||
* ids (including ppIds if requested) using the given idTranslationTable.
|
||||
*
|
||||
* NOTE: when remapping, this method creates new SAMProgramRecords and
|
||||
* doesn't mutate any records in the programGroups list.
|
||||
*
|
||||
* @param programGroups The program groups to translate.
|
||||
* @param idTranslationTable The translation table.
|
||||
* @param translatePpIds Whether ppIds should be translated as well.
|
||||
*
|
||||
* @return The list of translated records.
|
||||
*/
|
||||
private List<HeaderRecordAndFileHeader<SAMProgramRecord>> translateIds(
|
||||
List<HeaderRecordAndFileHeader<SAMProgramRecord>> programGroups,
|
||||
Map<SAMFileHeader, Map<String, String>> idTranslationTable,
|
||||
boolean translatePpIds) {
|
||||
|
||||
//go through programGroups and translate any IDs and PPs based on the idTranslationTable.
|
||||
List<HeaderRecordAndFileHeader<SAMProgramRecord>> result = new LinkedList<HeaderRecordAndFileHeader<SAMProgramRecord>>();
|
||||
for(final HeaderRecordAndFileHeader<SAMProgramRecord> pair : programGroups ) {
|
||||
final SAMProgramRecord record = pair.getHeaderRecord();
|
||||
final String id = record.getProgramGroupId();
|
||||
final String ppId = (String) record.getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG);
|
||||
|
||||
final SAMFileHeader header = pair.getFileHeader();
|
||||
final Map<String, String> translations = idTranslationTable.get(header);
|
||||
|
||||
//see if one or both ids need to be translated
|
||||
SAMProgramRecord translatedRecord = null;
|
||||
if(translations != null)
|
||||
{
|
||||
String translatedId = translations.get( id );
|
||||
String translatedPpId = translatePpIds ? translations.get( ppId ) : null;
|
||||
|
||||
boolean needToTranslateId = translatedId != null && !translatedId.equals(id);
|
||||
boolean needToTranslatePpId = translatedPpId != null && !translatedPpId.equals(ppId);
|
||||
|
||||
if(needToTranslateId && needToTranslatePpId) {
|
||||
translatedRecord = new SAMProgramRecord(translatedId, record);
|
||||
translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId);
|
||||
} else if(needToTranslateId) {
|
||||
translatedRecord = new SAMProgramRecord(translatedId, record);
|
||||
} else if(needToTranslatePpId) {
|
||||
translatedRecord = new SAMProgramRecord(id, record);
|
||||
translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId);
|
||||
}
|
||||
}
|
||||
|
||||
if(translatedRecord != null) {
|
||||
result.add(new HeaderRecordAndFileHeader<SAMProgramRecord>(translatedRecord, header));
|
||||
} else {
|
||||
result.add(pair); //keep the original record
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Utility method for merging a List of AbstractSAMHeaderRecords. If it finds
 * records that have identical ids and attributes, it will collapse them
 * into one record. If it finds records that have identical ids but
 * non-identical attributes, this is treated as a collision. When collision happens,
 * the records' ids are remapped, and an old-id to new-id mapping is added to the idTranslationTable.
 *
 * NOTE: Non-collided records also get recorded in the idTranslationTable as
 * old-id to old-id. This way, an idTranslationTable lookup should never return null.
 *
 * @param headerRecords The header records to merge.
 * @param headerRecordFactory Constructs a specific subclass of AbstractSAMHeaderRecord.
 * @param idsThatAreAlreadyTaken If the id of a headerRecord matches an id in this set, it will be treated as a collision, and the headRecord's id will be remapped.
 * @param idTranslationTable When records collide, their ids are remapped, and an old-id to new-id
 *   mapping is added to the idTranslationTable. Non-collided records also get recorded in the idTranslationTable as
 *   old-id to old-id. This way, an idTranslationTable lookup should never return null.
 *
 * @param result The list of merged header records (output parameter; merged records are appended to it).
 *
 * @return True if there were collisions.
 */
private <RecordType extends AbstractSAMHeaderRecord> boolean mergeHeaderRecords(final List<HeaderRecordAndFileHeader<RecordType>> headerRecords, HeaderRecordFactory<RecordType> headerRecordFactory,
        final HashSet<String> idsThatAreAlreadyTaken, Map<SAMFileHeader, Map<String, String>> idTranslationTable, List<RecordType> result) {

    //The outer Map bins the header records by their ids. The nested Map further collapses
    //header records which, in addition to having the same id, also have identical attributes
    //(collapsing relies on the record type's equals()/hashCode()).
    //In other words, each key in the nested map represents one or more
    //header records which have both identical ids and identical attributes. The List of
    //SAMFileHeaders keeps track of which readers these header record(s) came from.
    //LinkedHashMap preserves insertion order so output is deterministic.
    final Map<String, Map<RecordType, List<SAMFileHeader>>> idToRecord =
            new HashMap<String, Map<RecordType, List<SAMFileHeader>>>();

    //Populate the idToRecord data structure.
    for (final HeaderRecordAndFileHeader<RecordType> pair : headerRecords) {
        final RecordType record = pair.getHeaderRecord();
        final SAMFileHeader header = pair.getFileHeader();
        final String recordId = record.getId();
        Map<RecordType, List<SAMFileHeader>> recordsWithSameId = idToRecord.get(recordId);
        if(recordsWithSameId == null) {
            recordsWithSameId = new LinkedHashMap<RecordType, List<SAMFileHeader>>();
            idToRecord.put(recordId, recordsWithSameId);
        }

        List<SAMFileHeader> fileHeaders = recordsWithSameId.get(record);
        if(fileHeaders == null) {
            fileHeaders = new LinkedList<SAMFileHeader>();
            recordsWithSameId.put(record, fileHeaders);
        }

        fileHeaders.add(header);
    }

    //Resolve any collisions between header records by remapping their ids.
    boolean hasCollisions = false;
    for (final Map.Entry<String, Map<RecordType, List<SAMFileHeader>>> entry : idToRecord.entrySet() )
    {
        final String recordId = entry.getKey();
        final Map<RecordType, List<SAMFileHeader>> recordsWithSameId = entry.getValue();

        for( Map.Entry<RecordType, List<SAMFileHeader>> recordWithUniqueAttr : recordsWithSameId.entrySet()) {
            final RecordType record = recordWithUniqueAttr.getKey();
            final List<SAMFileHeader> fileHeaders = recordWithUniqueAttr.getValue();

            String newId;
            if(!idsThatAreAlreadyTaken.contains(recordId)) {
                //don't remap 1st record. If there are more records
                //with this id, they will be remapped in the 'else'.
                newId = recordId;
                idsThatAreAlreadyTaken.add(recordId);
            } else {
                //there is more than one record with this id.
                hasCollisions = true;

                //find a unique newId for this record by appending ".1", ".2", ... until free
                int idx=1;
                while(idsThatAreAlreadyTaken.contains(newId = recordId + "." + Integer.toString(idx++)))
                    ;

                idsThatAreAlreadyTaken.add( newId );
            }

            //Record the old-id -> new-id mapping for every file this record came from
            //(for non-collided records this is an identity mapping, see javadoc).
            for(SAMFileHeader fileHeader : fileHeaders) {
                Map<String, String> readerTranslationTable = idTranslationTable.get(fileHeader);
                if(readerTranslationTable == null) {
                    readerTranslationTable = new HashMap<String, String>();
                    idTranslationTable.put(fileHeader, readerTranslationTable);
                }
                readerTranslationTable.put(recordId, newId);
            }

            result.add( headerRecordFactory.createRecord(newId, record) );
        }
    }

    return hasCollisions;
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the sequences off the SAMFileHeader. Throws runtime exception if the sequence
|
||||
* are different from one another.
|
||||
*
|
||||
* @param headers headers to pull sequences from
|
||||
* @return sequences from files. Each file should have the same sequence
|
||||
*/
|
||||
private SAMSequenceDictionary getSequenceDictionary(final Collection<SAMFileHeader> headers) {
|
||||
SAMSequenceDictionary sequences = null;
|
||||
for (final SAMFileHeader header : headers) {
|
||||
|
||||
if (sequences == null) {
|
||||
sequences = header.getSequenceDictionary();
|
||||
}
|
||||
else {
|
||||
final SAMSequenceDictionary currentSequences = header.getSequenceDictionary();
|
||||
SequenceUtil.assertSequenceDictionariesEqual(sequences, currentSequences);
|
||||
}
|
||||
}
|
||||
|
||||
return sequences;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the sequences from the SAMFileHeader, and merge the resulting sequence dictionaries.
|
||||
*
|
||||
* @param headers headers to pull sequences from
|
||||
* @return sequences from files. Each file should have the same sequence
|
||||
*/
|
||||
private SAMSequenceDictionary mergeSequenceDictionaries(final Collection<SAMFileHeader> headers) {
|
||||
SAMSequenceDictionary sequences = new SAMSequenceDictionary();
|
||||
for (final SAMFileHeader header : headers) {
|
||||
final SAMSequenceDictionary currentSequences = header.getSequenceDictionary();
|
||||
sequences = mergeSequences(sequences, currentSequences);
|
||||
}
|
||||
// second pass, make a map of the original seqeunce id -> new sequence id
|
||||
createSequenceMapping(headers, sequences);
|
||||
return sequences;
|
||||
}
|
||||
|
||||
/**
|
||||
* They've asked to merge the sequence headers. What we support right now is finding the sequence name superset.
|
||||
*
|
||||
* @param mergeIntoDict the result of merging so far. All SAMSequenceRecords in here have been cloned from the originals.
|
||||
* @param mergeFromDict A new sequence dictionary to merge into mergeIntoDict.
|
||||
* @return A new sequence dictionary that resulting from merging the two inputs.
|
||||
*/
|
||||
private SAMSequenceDictionary mergeSequences(SAMSequenceDictionary mergeIntoDict, SAMSequenceDictionary mergeFromDict) {
|
||||
|
||||
// a place to hold the sequences that we haven't found a home for, in the order the appear in mergeFromDict.
|
||||
LinkedList<SAMSequenceRecord> holder = new LinkedList<SAMSequenceRecord>();
|
||||
|
||||
// Return value will be created from this.
|
||||
LinkedList<SAMSequenceRecord> resultingDict = new LinkedList<SAMSequenceRecord>();
|
||||
for (final SAMSequenceRecord sequenceRecord : mergeIntoDict.getSequences()) {
|
||||
resultingDict.add(sequenceRecord);
|
||||
}
|
||||
|
||||
// Index into resultingDict of previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict.
|
||||
int prevloc = -1;
|
||||
// Previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict.
|
||||
SAMSequenceRecord previouslyMerged = null;
|
||||
|
||||
for (SAMSequenceRecord sequenceRecord : mergeFromDict.getSequences()) {
|
||||
// Does it already exist in resultingDict?
|
||||
int loc = getIndexOfSequenceName(resultingDict, sequenceRecord.getSequenceName());
|
||||
if (loc == -1) {
|
||||
// If doesn't already exist in resultingDict, save it an decide where to insert it later.
|
||||
holder.add(sequenceRecord.clone());
|
||||
} else if (prevloc > loc) {
|
||||
// If sequenceRecord already exists in resultingDict, but prior to the previous one
|
||||
// from mergeIntoDict that already existed, cannot merge.
|
||||
throw new PicardException("Cannot merge sequence dictionaries because sequence " +
|
||||
sequenceRecord.getSequenceName() + " and " + previouslyMerged.getSequenceName() +
|
||||
" are in different orders in two input sequence dictionaries.");
|
||||
} else {
|
||||
// Since sequenceRecord already exists in resultingDict, don't need to add it.
|
||||
// Add in all the sequences prior to it that have been held in holder.
|
||||
resultingDict.addAll(loc, holder);
|
||||
// Remember the index of sequenceRecord so can check for merge imcompatibility.
|
||||
prevloc = loc + holder.size();
|
||||
previouslyMerged = sequenceRecord;
|
||||
holder.clear();
|
||||
}
|
||||
}
|
||||
// Append anything left in holder.
|
||||
if (holder.size() != 0) {
|
||||
resultingDict.addAll(holder);
|
||||
}
|
||||
return new SAMSequenceDictionary(resultingDict);
|
||||
}
|
||||
|
||||
/**
|
||||
* Find sequence in list.
|
||||
* @param list List to search for the sequence name.
|
||||
* @param sequenceName Name to search for.
|
||||
* @return Index of SAMSequenceRecord with the given name in list, or -1 if not found.
|
||||
*/
|
||||
private static int getIndexOfSequenceName(final List<SAMSequenceRecord> list, final String sequenceName) {
|
||||
for (int i = 0; i < list.size(); ++i) {
|
||||
if (list.get(i).getSequenceName().equals(sequenceName)) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the sequence mapping. This map is used to convert the unmerged header sequence ID's to the merged
|
||||
* list of sequence id's.
|
||||
* @param headers the collections of headers.
|
||||
* @param masterDictionary the superset dictionary we've created.
|
||||
*/
|
||||
private void createSequenceMapping(final Collection<SAMFileHeader> headers, SAMSequenceDictionary masterDictionary) {
|
||||
LinkedList<String> resultingDictStr = new LinkedList<String>();
|
||||
for (SAMSequenceRecord r : masterDictionary.getSequences()) {
|
||||
resultingDictStr.add(r.getSequenceName());
|
||||
}
|
||||
for (final SAMFileHeader header : headers) {
|
||||
Map<Integer, Integer> seqMap = new HashMap<Integer, Integer>();
|
||||
SAMSequenceDictionary dict = header.getSequenceDictionary();
|
||||
for (SAMSequenceRecord rec : dict.getSequences()) {
|
||||
seqMap.put(rec.getSequenceIndex(), resultingDictStr.indexOf(rec.getSequenceName()));
|
||||
}
|
||||
this.samSeqDictionaryIdTranslationViaHeader.put(header, seqMap);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Returns the read group id that should be used for the input read and RG id.
|
||||
*
|
||||
* @deprecated replaced by getReadGroupId(SAMFileHeader, String)
|
||||
* */
|
||||
public String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId) {
|
||||
return getReadGroupId(reader.getFileHeader(), originalReadGroupId);
|
||||
}
|
||||
|
||||
/** Returns the read group id that should be used for the input read and RG id. */
|
||||
public String getReadGroupId(final SAMFileHeader header, final String originalReadGroupId) {
|
||||
return this.samReadGroupIdTranslation.get(header).get(originalReadGroupId);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param reader one of the input files
|
||||
* @param originalProgramGroupId a program group ID from the above input file
|
||||
* @return new ID from the merged list of program groups in the output file
|
||||
* @deprecated replaced by getProgramGroupId(SAMFileHeader, String)
|
||||
*/
|
||||
public String getProgramGroupId(final SAMFileReader reader, final String originalProgramGroupId) {
|
||||
return getProgramGroupId(reader.getFileHeader(), originalProgramGroupId);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param header one of the input headers
|
||||
* @param originalProgramGroupId a program group ID from the above input file
|
||||
* @return new ID from the merged list of program groups in the output file
|
||||
*/
|
||||
public String getProgramGroupId(final SAMFileHeader header, final String originalProgramGroupId) {
|
||||
return this.samProgramGroupIdTranslation.get(header).get(originalProgramGroupId);
|
||||
}
|
||||
|
||||
/** Returns true if there are read group duplicates within the merged headers. */
|
||||
public boolean hasReadGroupCollisions() {
|
||||
return this.hasReadGroupCollisions;
|
||||
}
|
||||
|
||||
/** Returns true if there are program group duplicates within the merged headers. */
|
||||
public boolean hasProgramGroupCollisions() {
|
||||
return hasProgramGroupCollisions;
|
||||
}
|
||||
|
||||
/** @return if we've merged the sequence dictionaries, return true */
|
||||
public boolean hasMergedSequenceDictionary() {
|
||||
return hasMergedSequenceDictionary;
|
||||
}
|
||||
|
||||
/** Returns the merged header that should be written to any output merged file. */
|
||||
public SAMFileHeader getMergedHeader() {
|
||||
return this.mergedHeader;
|
||||
}
|
||||
|
||||
/** Returns the collection of readers that this header merger is working with. May return null.
|
||||
* @deprecated replaced by getHeaders()
|
||||
*/
|
||||
public Collection<SAMFileReader> getReaders() {
|
||||
return this.readers;
|
||||
}
|
||||
|
||||
/** Returns the collection of readers that this header merger is working with.
|
||||
*/
|
||||
public Collection<SAMFileHeader> getHeaders() {
|
||||
return this.headers;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells whether this header merger contains a given SAM file header. Note that header presence
|
||||
* is confirmed / blocked by == equality, rather than actually testing SAMFileHeader.equals(), for
|
||||
* reasons of performance.
|
||||
* @param header header to check for.
|
||||
* @return True if the header exists in this HeaderMerger. False otherwise.
|
||||
*/
|
||||
boolean containsHeader(SAMFileHeader header) {
|
||||
for(SAMFileHeader headerMergerHeader: headers) {
|
||||
if(headerMergerHeader == header)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* returns the new mapping for a specified reader, given it's old sequence index
|
||||
* @param reader the reader
|
||||
* @param oldReferenceSequenceIndex the old sequence (also called reference) index
|
||||
* @return the new index value
|
||||
* @deprecated replaced by getMergedSequenceIndex(SAMFileHeader, Integer)
|
||||
*/
|
||||
public Integer getMergedSequenceIndex(SAMFileReader reader, Integer oldReferenceSequenceIndex) {
|
||||
return this.getMergedSequenceIndex(reader.getFileHeader(), oldReferenceSequenceIndex);
|
||||
}
|
||||
|
||||
/**
|
||||
* Another mechanism for getting the new sequence index, for situations in which the reader is not available.
|
||||
* Note that if the SAMRecord has already had its header replaced with the merged header, this won't work.
|
||||
* @param header The original header for the input record in question.
|
||||
* @param oldReferenceSequenceIndex The original sequence index.
|
||||
* @return the new index value that is compatible with the merged sequence index.
|
||||
*/
|
||||
public Integer getMergedSequenceIndex(final SAMFileHeader header, Integer oldReferenceSequenceIndex) {
|
||||
final Map<Integer, Integer> mapping = this.samSeqDictionaryIdTranslationViaHeader.get(header);
|
||||
if (mapping == null) {
|
||||
throw new PicardException("No sequence dictionary mapping available for header: " + header);
|
||||
}
|
||||
|
||||
final Integer newIndex = mapping.get(oldReferenceSequenceIndex);
|
||||
if (newIndex == null) {
|
||||
throw new PicardException("No mapping for reference index " + oldReferenceSequenceIndex + " from header: " + header);
|
||||
}
|
||||
|
||||
return newIndex;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Implementations of this interface are used by mergeHeaderRecords(..) to instantiate
 * specific subclasses of AbstractSAMHeaderRecord.
 */
private static interface HeaderRecordFactory<RecordType extends AbstractSAMHeaderRecord> {

    /**
     * Constructs a new instance of RecordType.
     * @param id The id of the new record.
     * @param srcRecord Except for the id, the new record will be a copy of this source record.
     * @return The newly constructed record.
     */
    public RecordType createRecord(final String id, RecordType srcRecord);
}
|
||||
|
||||
/**
|
||||
* Struct that groups together a subclass of AbstractSAMHeaderRecord with the
|
||||
* SAMFileHeader that it came from.
|
||||
*/
|
||||
private static class HeaderRecordAndFileHeader<RecordType extends AbstractSAMHeaderRecord> {
|
||||
private RecordType headerRecord;
|
||||
private SAMFileHeader samFileHeader;
|
||||
|
||||
public HeaderRecordAndFileHeader(RecordType headerRecord, SAMFileHeader samFileHeader) {
|
||||
this.headerRecord = headerRecord;
|
||||
this.samFileHeader = samFileHeader;
|
||||
}
|
||||
|
||||
public RecordType getHeaderRecord() {
|
||||
return headerRecord;
|
||||
}
|
||||
public SAMFileHeader getFileHeader() {
|
||||
return samFileHeader;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,762 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package net.sf.samtools;
|
||||
|
||||
|
||||
import net.sf.samtools.util.*;
|
||||
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* Internal class for reading and querying BAM files.
|
||||
*/
|
||||
class BAMFileReader extends SAMFileReader.ReaderImplementation {
|
||||
// True if reading from a File rather than an InputStream
|
||||
private boolean mIsSeekable = false;
|
||||
|
||||
// For converting bytes into other primitive types
|
||||
private BinaryCodec mStream = null;
|
||||
|
||||
// Underlying compressed data stream.
|
||||
private final BAMInputStream mInputStream;
|
||||
private SAMFileHeader mFileHeader = null;
|
||||
|
||||
// Populated if the file is seekable and an index exists
|
||||
private File mIndexFile;
|
||||
private BAMIndex mIndex = null;
|
||||
private long mFirstRecordPointer = 0;
|
||||
private CloseableIterator<SAMRecord> mCurrentIterator = null;
|
||||
|
||||
// If true, all SAMRecords are fully decoded as they are read.
|
||||
private final boolean eagerDecode;
|
||||
|
||||
// For error-checking.
|
||||
private ValidationStringency mValidationStringency;
|
||||
|
||||
// For creating BAMRecords
|
||||
private SAMRecordFactory samRecordFactory;
|
||||
|
||||
/**
|
||||
* Use the caching index reader implementation rather than the disk-hit-per-file model.
|
||||
*/
|
||||
private boolean mEnableIndexCaching = false;
|
||||
|
||||
/**
|
||||
* Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O.
|
||||
*/
|
||||
private boolean mEnableIndexMemoryMapping = true;
|
||||
|
||||
/**
|
||||
* Add information about the origin (reader and position) to SAM records.
|
||||
*/
|
||||
private SAMFileReader mFileReader = null;
|
||||
|
||||
/**
 * Prepare to read BAM from a stream (not seekable).
 * @param stream source of bytes.
 * @param indexFile BAM index file, or null if none is available.
 * @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
 * @param validationStringency Controls how to handle invalid reads or header lines.
 * @param factory factory used to allocate SAMRecords while reading.
 */
BAMFileReader(final InputStream stream,
              final File indexFile,
              final boolean eagerDecode,
              final ValidationStringency validationStringency,
              final SAMRecordFactory factory)
    throws IOException {
    mIndexFile = indexFile;
    mIsSeekable = false;
    // Wrap in a block-compressed (BGZF) stream unless the caller already supplied one.
    mInputStream = stream instanceof BAMInputStream ? (BAMInputStream)stream : new BlockCompressedInputStream(stream);
    mStream = new BinaryCodec(new DataInputStream((InputStream)mInputStream));
    this.eagerDecode = eagerDecode;
    this.mValidationStringency = validationStringency;
    this.samRecordFactory = factory;
    // No source name is available for a bare stream.
    readHeader(null);
}
|
||||
|
||||
/**
 * Prepare to read BAM from a file (seekable).
 * @param file source of bytes.
 * @param indexFile BAM index file; if null, a conventionally-named index next to the BAM is looked up.
 * @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
 * @param validationStringency Controls how to handle invalid reads or header lines.
 * @param factory factory used to allocate SAMRecords while reading.
 */
BAMFileReader(final File file,
              final File indexFile,
              final boolean eagerDecode,
              final ValidationStringency validationStringency,
              final SAMRecordFactory factory)
    throws IOException {
    this(new BlockCompressedInputStream(file), indexFile!=null ? indexFile : findIndexFile(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory);
    // Warn (but proceed) if the index predates the BAM - it may be stale.
    if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) {
        System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() +
                           " is older than BAM " + file.getAbsolutePath());
    }
}
|
||||
|
||||
/**
 * Prepare to read BAM from a seekable stream.
 * @param strm source of bytes; its getSource() is used as the source name.
 * @param indexFile BAM index file, or null if none is available.
 * @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
 * @param validationStringency Controls how to handle invalid reads or header lines.
 * @param factory factory used to allocate SAMRecords while reading.
 */
BAMFileReader(final SeekableStream strm,
              final File indexFile,
              final boolean eagerDecode,
              final ValidationStringency validationStringency,
              final SAMRecordFactory factory)
    throws IOException {
    // Wrap in a block-compressed (BGZF) stream unless the caller already supplied one.
    this(strm instanceof BAMInputStream ? (BAMInputStream)strm : new BlockCompressedInputStream(strm),
         indexFile,
         eagerDecode,
         strm.getSource(),
         validationStringency,
         factory);
}
|
||||
|
||||
/**
 * Common seekable-input constructor that the public constructors delegate to.
 * Reads the header immediately and remembers the position of the first record
 * so iteration can later rewind to it.
 * @param inputStream block-compressed input positioned at the start of the BAM.
 * @param indexFile BAM index file, or null if none is available.
 * @param eagerDecode if true, decode all BAM fields as reading rather than lazily.
 * @param source name of the data source (e.g. file path), used in error messages; may be null.
 * @param validationStringency Controls how to handle invalid reads or header lines.
 * @param factory factory used to allocate SAMRecords while reading.
 */
private BAMFileReader(final BAMInputStream inputStream,
                      final File indexFile,
                      final boolean eagerDecode,
                      final String source,
                      final ValidationStringency validationStringency,
                      final SAMRecordFactory factory)
    throws IOException {
    mIndexFile = indexFile;
    mIsSeekable = true;
    mInputStream = inputStream;
    mStream = new BinaryCodec(new DataInputStream((InputStream)inputStream));
    this.eagerDecode = eagerDecode;
    this.mValidationStringency = validationStringency;
    this.samRecordFactory = factory;
    readHeader(source);
    // Capture the virtual file pointer just past the header: the first record.
    mFirstRecordPointer = inputStream.getFilePointer();
}
|
||||
|
||||
/**
|
||||
* If true, writes the source of every read into the source SAMRecords.
|
||||
* @param enabled true to write source information into each SAMRecord.
|
||||
*/
|
||||
void enableFileSource(final SAMFileReader reader, final boolean enabled) {
|
||||
this.mFileReader = enabled ? reader : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* If true, uses the caching version of the index reader.
|
||||
* @param enabled true to write source information into each SAMRecord.
|
||||
*/
|
||||
public void enableIndexCaching(final boolean enabled) {
|
||||
if(mIndex != null)
|
||||
throw new SAMException("Unable to turn on index caching; index file has already been loaded.");
|
||||
this.mEnableIndexCaching = enabled;
|
||||
}
|
||||
|
||||
/**
|
||||
* If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping).
|
||||
* This is slower but more scalable when accessing large numbers of BAM files sequentially.
|
||||
* @param enabled True to use memory mapping, false to use regular I/O.
|
||||
*/
|
||||
public void enableIndexMemoryMapping(final boolean enabled) {
|
||||
if (mIndex != null) {
|
||||
throw new SAMException("Unable to change index memory mapping; index file has already been loaded.");
|
||||
}
|
||||
this.mEnableIndexMemoryMapping = enabled;
|
||||
}
|
||||
|
||||
@Override void enableCrcChecking(final boolean enabled) {
|
||||
this.mInputStream.setCheckCrcs(enabled);
|
||||
}
|
||||
|
||||
/** Replaces the factory used to allocate SAMRecords while reading. */
@Override
void setSAMRecordFactory(final SAMRecordFactory factory) {
    this.samRecordFactory = factory;
}
|
||||
|
||||
/**
|
||||
* @return true if ths is a BAM file, and has an index
|
||||
*/
|
||||
public boolean hasIndex() {
|
||||
return (mIndexFile != null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves the index for the given file type. Ensure that the index is of the specified type.
|
||||
* @return An index of the given type.
|
||||
*/
|
||||
public BAMIndex getIndex() {
|
||||
if(mIndexFile == null)
|
||||
throw new SAMException("No index is available for this BAM file.");
|
||||
if(mIndex == null)
|
||||
mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping)
|
||||
: new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping);
|
||||
return mIndex;
|
||||
}
|
||||
|
||||
void close() {
|
||||
if (mStream != null) {
|
||||
mStream.close();
|
||||
}
|
||||
if (mIndex != null) {
|
||||
mIndex.close();
|
||||
}
|
||||
mStream = null;
|
||||
mFileHeader = null;
|
||||
mIndex = null;
|
||||
}
|
||||
|
||||
SAMFileHeader getFileHeader() {
|
||||
return mFileHeader;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set error-checking level for subsequent SAMRecord reads.
|
||||
*/
|
||||
void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) {
|
||||
this.mValidationStringency = validationStringency;
|
||||
}
|
||||
|
||||
SAMFileReader.ValidationStringency getValidationStringency() {
|
||||
return this.mValidationStringency;
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to iterate through the SAMRecords in file order.
|
||||
* Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once,
|
||||
* that iterator must be closed before getIterator() can be called again.
|
||||
* A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to
|
||||
* getIterator() begins its iteration where the last one left off. That is the best that can be
|
||||
* done in that situation.
|
||||
*/
|
||||
CloseableIterator<SAMRecord> getIterator() {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (mIsSeekable) {
|
||||
try {
|
||||
mInputStream.seek(mFirstRecordPointer);
|
||||
} catch (IOException exc) {
|
||||
throw new RuntimeException(exc.getMessage(), exc);
|
||||
}
|
||||
}
|
||||
mCurrentIterator = new BAMFileIterator();
|
||||
return mCurrentIterator;
|
||||
}
|
||||
|
||||
@Override
|
||||
CloseableIterator<SAMRecord> getIterator(final SAMFileSpan chunks) {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (!(chunks instanceof BAMFileSpan)) {
|
||||
throw new IllegalStateException("BAMFileReader cannot handle this type of file span.");
|
||||
}
|
||||
|
||||
// Create an iterator over the given chunk boundaries.
|
||||
mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray());
|
||||
return mCurrentIterator;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets an unbounded pointer to the first record in the BAM file. Because the reader doesn't necessarily know
|
||||
* when the file ends, the rightmost bound of the file pointer will not end exactly where the file ends. However,
|
||||
* the rightmost bound is guaranteed to be after the last read in the file.
|
||||
* @return An unbounded pointer to the first record in the BAM file.
|
||||
*/
|
||||
@Override
|
||||
SAMFileSpan getFilePointerSpanningReads() {
|
||||
return new BAMFileSpan(new Chunk(mFirstRecordPointer,Long.MAX_VALUE));
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to iterate through the SAMRecords that match the given interval.
|
||||
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
|
||||
* before calling any of the methods that return an iterator.
|
||||
*
|
||||
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
|
||||
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
|
||||
* matches the specified interval.
|
||||
*
|
||||
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
|
||||
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
|
||||
*
|
||||
* @param sequence Reference sequence sought.
|
||||
* @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end.
|
||||
* A value of zero implies the start of the reference sequence.
|
||||
* @param end A value of zero implies the end of the reference sequence.
|
||||
* @param contained If true, the alignments for the SAMRecords must be completely contained in the interval
|
||||
* specified by start and end. If false, the SAMRecords need only overlap the interval.
|
||||
* @return Iterator for the matching SAMRecords
|
||||
*/
|
||||
CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (!mIsSeekable) {
|
||||
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
||||
}
|
||||
mCurrentIterator = createIndexIterator(sequence, start, end, contained? QueryType.CONTAINED: QueryType.OVERLAPPING);
|
||||
return mCurrentIterator;
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to iterate through the SAMRecords with the given alignment start.
|
||||
* Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed
|
||||
* before calling any of the methods that return an iterator.
|
||||
*
|
||||
* Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting
|
||||
* purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate
|
||||
* matches the specified interval.
|
||||
*
|
||||
* Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect
|
||||
* resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval.
|
||||
*
|
||||
* @param sequence Reference sequence sought.
|
||||
* @param start Alignment start sought.
|
||||
* @return Iterator for the matching SAMRecords.
|
||||
*/
|
||||
CloseableIterator<SAMRecord> queryAlignmentStart(final String sequence, final int start) {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (!mIsSeekable) {
|
||||
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
||||
}
|
||||
mCurrentIterator = createIndexIterator(sequence, start, -1, QueryType.STARTING_AT);
|
||||
return mCurrentIterator;
|
||||
}
|
||||
|
||||
public CloseableIterator<SAMRecord> queryUnmapped() {
|
||||
if (mStream == null) {
|
||||
throw new IllegalStateException("File reader is closed");
|
||||
}
|
||||
if (mCurrentIterator != null) {
|
||||
throw new IllegalStateException("Iteration in progress");
|
||||
}
|
||||
if (!mIsSeekable) {
|
||||
throw new UnsupportedOperationException("Cannot query stream-based BAM file");
|
||||
}
|
||||
try {
|
||||
final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin();
|
||||
if (startOfLastLinearBin != -1) {
|
||||
mInputStream.seek(startOfLastLinearBin);
|
||||
} else {
|
||||
// No mapped reads in file, just start at the first read in file.
|
||||
mInputStream.seek(mFirstRecordPointer);
|
||||
}
|
||||
mCurrentIterator = new BAMFileIndexUnmappedIterator();
|
||||
return mCurrentIterator;
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("IOException seeking to unmapped reads", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the header from the file or stream
|
||||
* @param source Note that this is used only for reporting errors.
|
||||
*/
|
||||
private void readHeader(final String source)
|
||||
throws IOException {
|
||||
|
||||
final byte[] buffer = new byte[4];
|
||||
mStream.readBytes(buffer);
|
||||
if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) {
|
||||
throw new IOException("Invalid BAM file header");
|
||||
}
|
||||
|
||||
final int headerTextLength = mStream.readInt();
|
||||
final String textHeader = mStream.readString(headerTextLength);
|
||||
final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
|
||||
headerCodec.setValidationStringency(mValidationStringency);
|
||||
mFileHeader = headerCodec.decode(new StringLineReader(textHeader),
|
||||
source);
|
||||
|
||||
final int sequenceCount = mStream.readInt();
|
||||
if (mFileHeader.getSequenceDictionary().size() > 0) {
|
||||
// It is allowed to have binary sequences but no text sequences, so only validate if both are present
|
||||
if (sequenceCount != mFileHeader.getSequenceDictionary().size()) {
|
||||
throw new SAMFormatException("Number of sequences in text header (" +
|
||||
mFileHeader.getSequenceDictionary().size() +
|
||||
") != number of sequences in binary header (" + sequenceCount + ") for file " + source);
|
||||
}
|
||||
for (int i = 0; i < sequenceCount; i++) {
|
||||
final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(source);
|
||||
final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i);
|
||||
if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
|
||||
throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " +
|
||||
source);
|
||||
}
|
||||
if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
|
||||
throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " +
|
||||
source);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// If only binary sequences are present, copy them into mFileHeader
|
||||
final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>(sequenceCount);
|
||||
for (int i = 0; i < sequenceCount; i++) {
|
||||
sequences.add(readSequenceRecord(source));
|
||||
}
|
||||
mFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a single binary sequence record from the file or stream
|
||||
* @param source Note that this is used only for reporting errors.
|
||||
*/
|
||||
private SAMSequenceRecord readSequenceRecord(final String source) {
|
||||
final int nameLength = mStream.readInt();
|
||||
if (nameLength <= 1) {
|
||||
throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source);
|
||||
}
|
||||
final String sequenceName = mStream.readString(nameLength - 1);
|
||||
// Skip the null terminator
|
||||
mStream.readByte();
|
||||
final int sequenceLength = mStream.readInt();
|
||||
return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(sequenceName), sequenceLength);
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterator for non-indexed sequential iteration through all SAMRecords in file.
|
||||
* Starting point of iteration is wherever current file position is when the iterator is constructed.
|
||||
*/
|
||||
private class BAMFileIterator implements CloseableIterator<SAMRecord> {
|
||||
private SAMRecord mNextRecord = null;
|
||||
private final BAMRecordCodec bamRecordCodec;
|
||||
private long samRecordIndex = 0; // Records at what position (counted in records) we are at in the file
|
||||
|
||||
BAMFileIterator() {
|
||||
this(true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param advance Trick to enable subclass to do more setup before advancing
|
||||
*/
|
||||
BAMFileIterator(final boolean advance) {
|
||||
this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory);
|
||||
this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream());
|
||||
|
||||
if (advance) {
|
||||
advance();
|
||||
}
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if (mCurrentIterator != null && this != mCurrentIterator) {
|
||||
throw new IllegalStateException("Attempt to close non-current iterator");
|
||||
}
|
||||
mCurrentIterator = null;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return (mNextRecord != null);
|
||||
}
|
||||
|
||||
public SAMRecord next() {
|
||||
final SAMRecord result = mNextRecord;
|
||||
advance();
|
||||
return result;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Not supported: remove");
|
||||
}
|
||||
|
||||
void advance() {
|
||||
try {
|
||||
mNextRecord = getNextRecord();
|
||||
|
||||
if (mNextRecord != null) {
|
||||
++this.samRecordIndex;
|
||||
// Because some decoding is done lazily, the record needs to remember the validation stringency.
|
||||
mNextRecord.setValidationStringency(mValidationStringency);
|
||||
|
||||
if (mValidationStringency != ValidationStringency.SILENT) {
|
||||
final List<SAMValidationError> validationErrors = mNextRecord.isValid();
|
||||
SAMUtils.processValidationErrors(validationErrors,
|
||||
this.samRecordIndex, BAMFileReader.this.getValidationStringency());
|
||||
}
|
||||
}
|
||||
if (eagerDecode && mNextRecord != null) {
|
||||
mNextRecord.eagerDecode();
|
||||
}
|
||||
} catch (IOException exc) {
|
||||
throw new RuntimeException(exc.getMessage(), exc);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the next record from the input stream.
|
||||
*/
|
||||
SAMRecord getNextRecord() throws IOException {
|
||||
final long startCoordinate = mInputStream.getFilePointer();
|
||||
final SAMRecord next = bamRecordCodec.decode();
|
||||
final long stopCoordinate = mInputStream.getFilePointer();
|
||||
|
||||
if(mFileReader != null && next != null)
|
||||
next.setFileSource(new SAMFileSource(mFileReader,new BAMFileSpan(new Chunk(startCoordinate,stopCoordinate))));
|
||||
|
||||
return next;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return The record that will be return by the next call to next()
|
||||
*/
|
||||
protected SAMRecord peek() {
|
||||
return mNextRecord;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare to iterate through SAMRecords matching the target interval.
|
||||
* @param sequence Desired reference sequence.
|
||||
* @param start 1-based start of target interval, inclusive.
|
||||
* @param end 1-based end of target interval, inclusive.
|
||||
* @param queryType contained, overlapping, or starting-at query.
|
||||
*/
|
||||
private CloseableIterator<SAMRecord> createIndexIterator(final String sequence,
|
||||
final int start,
|
||||
final int end,
|
||||
final QueryType queryType) {
|
||||
long[] filePointers = null;
|
||||
|
||||
// Hit the index to determine the chunk boundaries for the required data.
|
||||
final SAMFileHeader fileHeader = getFileHeader();
|
||||
final int referenceIndex = fileHeader.getSequenceIndex(sequence);
|
||||
if (referenceIndex != -1) {
|
||||
final BAMIndex fileIndex = getIndex();
|
||||
final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, end);
|
||||
filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null;
|
||||
}
|
||||
|
||||
// Create an iterator over the above chunk boundaries.
|
||||
final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers);
|
||||
|
||||
// Add some preprocessing filters for edge-case reads that don't fit into this
|
||||
// query type.
|
||||
return new BAMQueryFilteringIterator(iterator,sequence,start,end,queryType);
|
||||
}
|
||||
|
||||
/** Interval-matching semantics used by indexed queries. */
enum QueryType {
    CONTAINED,   // alignment must lie entirely within the query interval
    OVERLAPPING, // alignment need only overlap the query interval
    STARTING_AT  // alignment must start exactly at the query start
}
/**
|
||||
* Look for BAM index file according to standard naming convention.
|
||||
*
|
||||
* @param dataFile BAM file name.
|
||||
* @return Index file name, or null if not found.
|
||||
*/
|
||||
private static File findIndexFile(final File dataFile) {
|
||||
// If input is foo.bam, look for foo.bai
|
||||
final String bamExtension = ".bam";
|
||||
File indexFile;
|
||||
final String fileName = dataFile.getName();
|
||||
if (fileName.endsWith(bamExtension)) {
|
||||
final String bai = fileName.substring(0, fileName.length() - bamExtension.length()) + BAMIndex.BAMIndexSuffix;
|
||||
indexFile = new File(dataFile.getParent(), bai);
|
||||
if (indexFile.exists()) {
|
||||
return indexFile;
|
||||
}
|
||||
}
|
||||
|
||||
// If foo.bai doesn't exist look for foo.bam.bai
|
||||
indexFile = new File(dataFile.getParent(), dataFile.getName() + ".bai");
|
||||
if (indexFile.exists()) {
|
||||
return indexFile;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private class BAMFileIndexIterator extends BAMFileIterator {
|
||||
|
||||
private long[] mFilePointers = null;
|
||||
private int mFilePointerIndex = 0;
|
||||
private long mFilePointerLimit = -1;
|
||||
|
||||
/**
|
||||
* Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset.
|
||||
* @param filePointers the block / offset combination, stored in chunk format.
|
||||
*/
|
||||
BAMFileIndexIterator(final long[] filePointers) {
|
||||
super(false); // delay advance() until after construction
|
||||
mFilePointers = filePointers;
|
||||
advance();
|
||||
}
|
||||
|
||||
SAMRecord getNextRecord()
|
||||
throws IOException {
|
||||
// Advance to next file block if necessary
|
||||
while (mInputStream.getFilePointer() >= mFilePointerLimit) {
|
||||
if (mFilePointers == null ||
|
||||
mFilePointerIndex >= mFilePointers.length) {
|
||||
return null;
|
||||
}
|
||||
final long startOffset = mFilePointers[mFilePointerIndex++];
|
||||
final long endOffset = mFilePointers[mFilePointerIndex++];
|
||||
mInputStream.seek(startOffset);
|
||||
mFilePointerLimit = endOffset;
|
||||
}
|
||||
// Pull next record from stream
|
||||
return super.getNextRecord();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A decorating iterator that filters out records that are outside the bounds of the
|
||||
* given query parameters.
|
||||
*/
|
||||
private class BAMQueryFilteringIterator implements CloseableIterator<SAMRecord> {
|
||||
/**
|
||||
* The wrapped iterator.
|
||||
*/
|
||||
private final CloseableIterator<SAMRecord> wrappedIterator;
|
||||
|
||||
/**
|
||||
* The next record to be returned. Will be null if no such record exists.
|
||||
*/
|
||||
private SAMRecord mNextRecord;
|
||||
|
||||
private final int mReferenceIndex;
|
||||
private final int mRegionStart;
|
||||
private final int mRegionEnd;
|
||||
private final QueryType mQueryType;
|
||||
|
||||
public BAMQueryFilteringIterator(final CloseableIterator<SAMRecord> iterator,final String sequence, final int start, final int end, final QueryType queryType) {
|
||||
this.wrappedIterator = iterator;
|
||||
final SAMFileHeader fileHeader = getFileHeader();
|
||||
mReferenceIndex = fileHeader.getSequenceIndex(sequence);
|
||||
mRegionStart = start;
|
||||
if (queryType == QueryType.STARTING_AT) {
|
||||
mRegionEnd = mRegionStart;
|
||||
} else {
|
||||
mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end;
|
||||
}
|
||||
mQueryType = queryType;
|
||||
mNextRecord = advance();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if a next element exists; false otherwise.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return mNextRecord != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the next record from the given iterator.
|
||||
* @return The next SAM record in the iterator.
|
||||
*/
|
||||
public SAMRecord next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available");
|
||||
final SAMRecord currentRead = mNextRecord;
|
||||
mNextRecord = advance();
|
||||
return currentRead;
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes down the existing iterator.
|
||||
*/
|
||||
public void close() {
|
||||
if (this != mCurrentIterator) {
|
||||
throw new IllegalStateException("Attempt to close non-current iterator");
|
||||
}
|
||||
mCurrentIterator = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws UnsupportedOperationException always.
|
||||
*/
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Not supported: remove");
|
||||
}
|
||||
|
||||
SAMRecord advance() {
|
||||
while (true) {
|
||||
// Pull next record from stream
|
||||
if(!wrappedIterator.hasNext())
|
||||
return null;
|
||||
|
||||
final SAMRecord record = wrappedIterator.next();
|
||||
// If beyond the end of this reference sequence, end iteration
|
||||
final int referenceIndex = record.getReferenceIndex();
|
||||
if (referenceIndex != mReferenceIndex) {
|
||||
if (referenceIndex < 0 ||
|
||||
referenceIndex > mReferenceIndex) {
|
||||
return null;
|
||||
}
|
||||
// If before this reference sequence, continue
|
||||
continue;
|
||||
}
|
||||
if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) {
|
||||
// Quick exit to avoid expensive alignment end calculation
|
||||
return record;
|
||||
}
|
||||
final int alignmentStart = record.getAlignmentStart();
|
||||
// If read is unmapped but has a coordinate, return it if the coordinate is within
|
||||
// the query region, regardless of whether the mapped mate will be returned.
|
||||
final int alignmentEnd;
|
||||
if (mQueryType == QueryType.STARTING_AT) {
|
||||
alignmentEnd = -1;
|
||||
} else {
|
||||
alignmentEnd = (record.getAlignmentEnd() != SAMRecord.NO_ALIGNMENT_START?
|
||||
record.getAlignmentEnd(): alignmentStart);
|
||||
}
|
||||
|
||||
if (alignmentStart > mRegionEnd) {
|
||||
// If scanned beyond target region, end iteration
|
||||
return null;
|
||||
}
|
||||
// Filter for overlap with region
|
||||
if (mQueryType == QueryType.CONTAINED) {
|
||||
if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) {
|
||||
return record;
|
||||
}
|
||||
} else if (mQueryType == QueryType.OVERLAPPING) {
|
||||
if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) {
|
||||
return record;
|
||||
}
|
||||
} else {
|
||||
if (alignmentStart == mRegionStart) {
|
||||
return record;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private class BAMFileIndexUnmappedIterator extends BAMFileIterator {
|
||||
private BAMFileIndexUnmappedIterator() {
|
||||
while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
|
||||
advance();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -25,6 +25,7 @@
|
|||
package net.sf.samtools;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
|
@ -47,6 +48,18 @@ public class GATKBAMFileSpan extends BAMFileSpan {
|
|||
super();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new GATKBAMFileSpan from an existing BAMFileSpan.
|
||||
* @param sourceFileSpan
|
||||
*/
|
||||
public GATKBAMFileSpan(SAMFileSpan sourceFileSpan) {
|
||||
if(!(sourceFileSpan instanceof BAMFileSpan))
|
||||
throw new SAMException("Unable to create GATKBAMFileSpan from a SAMFileSpan. Please submit a BAMFileSpan instead");
|
||||
BAMFileSpan sourceBAMFileSpan = (BAMFileSpan)sourceFileSpan;
|
||||
for(Chunk chunk: sourceBAMFileSpan.getChunks())
|
||||
add(chunk instanceof GATKChunk ? chunk : new GATKChunk(chunk));
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience constructor to construct a BAM file span from
|
||||
* a single chunk.
|
||||
|
|
|
|||
|
|
@ -69,6 +69,22 @@ public class GATKChunk extends Chunk {
|
|||
super.setChunkEnd(value);
|
||||
}
|
||||
|
||||
public long getBlockStart() {
|
||||
return getChunkStart() >>> 16;
|
||||
}
|
||||
|
||||
public int getBlockOffsetStart() {
|
||||
return (int)(getChunkStart() & 0xFFFF);
|
||||
}
|
||||
|
||||
public long getBlockEnd() {
|
||||
return getChunkEnd() >>> 16;
|
||||
}
|
||||
|
||||
public int getBlockOffsetEnd() {
|
||||
return ((int)getChunkEnd() & 0xFFFF);
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes an approximation of the uncompressed size of the
|
||||
* chunk, in bytes. Can be used to determine relative weights
|
||||
|
|
|
|||
|
|
@ -0,0 +1,72 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package net.sf.samtools.util;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
 * An input stream formulated for use reading BAM files. Supports seeking by BGZF
 * virtual file pointer in addition to the standard InputStream read operations.
 * (The original class comment was a truncated sentence; completed here based on
 * the methods declared below.)
 */
public interface BAMInputStream {
    /**
     * Seek to the given position in the file. Note that pos is a special virtual file pointer,
     * not an actual byte offset.
     *
     * @param pos virtual file pointer
     */
    public void seek(final long pos) throws IOException;

    /**
     * @return virtual file pointer that can be passed to seek() to return to the current position. This is
     * not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
     * the two.
     */
    public long getFilePointer();

    /**
     * Determines whether or not the inflater will re-calculate the CRC on the decompressed data
     * and check it against the value stored in the GZIP header. CRC checking is an expensive
     * operation and should be used accordingly.
     */
    public void setCheckCrcs(final boolean check);

    // Standard java.io.InputStream operations, redeclared so callers can program
    // against this interface without holding a concrete InputStream.
    public int read() throws IOException;

    public int read(byte[] bytes) throws IOException;

    public int read(byte[] bytes, int i, int i1) throws IOException;

    public long skip(long l) throws IOException;

    public int available() throws IOException;

    public void close() throws IOException;

    public void mark(int i);

    public void reset() throws IOException;

    public boolean markSupported();
}
|
|
@ -0,0 +1,483 @@
|
|||
/*
|
||||
* The MIT License
|
||||
*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
package net.sf.samtools.util;
|
||||
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.net.URL;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.util.Arrays;
|
||||
|
||||
import net.sf.samtools.FileTruncatedException;
|
||||
|
||||
/*
|
||||
* Utility class for reading BGZF block compressed files. The caller can treat this file like any other InputStream.
|
||||
* It probably is not necessary to wrap this stream in a buffering stream, because there is internal buffering.
|
||||
* The advantage of BGZF over conventional GZip format is that BGZF allows for seeking without having to read the
|
||||
* entire file up to the location being sought. Note that seeking is only possible if the ctor(File) is used.
|
||||
*
|
||||
* c.f. http://samtools.sourceforge.net/SAM1.pdf for details of BGZF format
|
||||
*/
|
||||
public class BlockCompressedInputStream extends InputStream implements BAMInputStream {
|
||||
private InputStream mStream = null;
|
||||
private SeekableStream mFile = null;
|
||||
private byte[] mFileBuffer = null;
|
||||
private byte[] mCurrentBlock = null;
|
||||
private int mCurrentOffset = 0;
|
||||
private long mBlockAddress = 0;
|
||||
private int mLastBlockLength = 0;
|
||||
private final BlockGunzipper blockGunzipper = new BlockGunzipper();
|
||||
|
||||
|
||||
/**
 * Note that seek() is not supported if this ctor is used.
 */
public BlockCompressedInputStream(final InputStream stream) {
    // Stream-based source: buffered for efficiency but not seekable.
    mStream = IOUtil.toBufferedStream(stream);
    mFile = null;
}
/**
 * Use this ctor if you wish to call seek()
 */
public BlockCompressedInputStream(final File file)
    throws IOException {
    // File-backed source supports random access via seek().
    mFile = new SeekableFileStream(file);
    mStream = null;
}
/**
 * Opens a seekable stream over HTTP; reads are buffered to amortize the
 * cost of remote access.
 */
public BlockCompressedInputStream(final URL url) {
    mFile = new SeekableBufferedStream(new SeekableHTTPStream(url));
    mStream = null;
}
/**
 * For providing some arbitrary data source. No additional buffering is
 * provided, so if the underlying source is not buffered, wrap it in a
 * SeekableBufferedStream before passing to this ctor.
 */
public BlockCompressedInputStream(final SeekableStream strm) {
    mFile = strm;
    mStream = null;
}
/**
|
||||
* Determines whether or not the inflater will re-calculated the CRC on the decompressed data
|
||||
* and check it against the value stored in the GZIP header. CRC checking is an expensive
|
||||
* operation and should be used accordingly.
|
||||
*/
|
||||
public void setCheckCrcs(final boolean check) {
|
||||
this.blockGunzipper.setCheckCrcs(check);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the number of bytes that can be read (or skipped over) from this input stream without blocking by the
|
||||
* next caller of a method for this input stream. The next caller might be the same thread or another thread.
|
||||
* Note that although the next caller can read this many bytes without blocking, the available() method call itself
|
||||
* may block in order to fill an internal buffer if it has been exhausted.
|
||||
*/
|
||||
public int available()
|
||||
throws IOException {
|
||||
if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) {
|
||||
readBlock();
|
||||
}
|
||||
if (mCurrentBlock == null) {
|
||||
return 0;
|
||||
}
|
||||
return mCurrentBlock.length - mCurrentOffset;
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the underlying InputStream or RandomAccessFile
|
||||
*/
|
||||
public void close()
|
||||
throws IOException {
|
||||
if (mFile != null) {
|
||||
mFile.close();
|
||||
mFile = null;
|
||||
} else if (mStream != null) {
|
||||
mStream.close();
|
||||
mStream = null;
|
||||
}
|
||||
// Encourage garbage collection
|
||||
mFileBuffer = null;
|
||||
mCurrentBlock = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the next byte of data from the input stream. The value byte is returned as an int in the range 0 to 255.
|
||||
* If no byte is available because the end of the stream has been reached, the value -1 is returned.
|
||||
* This method blocks until input data is available, the end of the stream is detected, or an exception is thrown.
|
||||
|
||||
* @return the next byte of data, or -1 if the end of the stream is reached.
|
||||
*/
|
||||
public int read()
|
||||
throws IOException {
|
||||
return (available() > 0) ? mCurrentBlock[mCurrentOffset++] : -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads some number of bytes from the input stream and stores them into the buffer array b. The number of bytes
|
||||
* actually read is returned as an integer. This method blocks until input data is available, end of file is detected,
|
||||
* or an exception is thrown.
|
||||
*
|
||||
* read(buf) has the same effect as read(buf, 0, buf.length).
|
||||
*
|
||||
* @param buffer the buffer into which the data is read.
|
||||
* @return the total number of bytes read into the buffer, or -1 is there is no more data because the end of
|
||||
* the stream has been reached.
|
||||
*/
|
||||
public int read(final byte[] buffer)
|
||||
throws IOException {
|
||||
return read(buffer, 0, buffer.length);
|
||||
}
|
||||
|
||||
private volatile ByteArrayOutputStream buf = null;
|
||||
private static final byte eol = '\n';
|
||||
private static final byte eolCr = '\r';
|
||||
|
||||
/**
|
||||
* Reads a whole line. A line is considered to be terminated by either a line feed ('\n'),
|
||||
* carriage return ('\r') or carriage return followed by a line feed ("\r\n").
|
||||
*
|
||||
* @return A String containing the contents of the line, excluding the line terminating
|
||||
* character, or null if the end of the stream has been reached
|
||||
*
|
||||
* @exception IOException If an I/O error occurs
|
||||
* @
|
||||
*/
|
||||
public String readLine() throws IOException {
|
||||
int available = available();
|
||||
if (available == 0) {
|
||||
return null;
|
||||
}
|
||||
if(null == buf){ // lazy initialisation
|
||||
buf = new ByteArrayOutputStream(8192);
|
||||
}
|
||||
buf.reset();
|
||||
boolean done = false;
|
||||
boolean foundCr = false; // \r found flag
|
||||
while (!done) {
|
||||
int linetmpPos = mCurrentOffset;
|
||||
int bCnt = 0;
|
||||
while((available-- > 0)){
|
||||
final byte c = mCurrentBlock[linetmpPos++];
|
||||
if(c == eol){ // found \n
|
||||
done = true;
|
||||
break;
|
||||
} else if(foundCr){ // previous char was \r
|
||||
--linetmpPos; // current char is not \n so put it back
|
||||
done = true;
|
||||
break;
|
||||
} else if(c == eolCr){ // found \r
|
||||
foundCr = true;
|
||||
continue; // no ++bCnt
|
||||
}
|
||||
++bCnt;
|
||||
}
|
||||
if(mCurrentOffset < linetmpPos){
|
||||
buf.write(mCurrentBlock, mCurrentOffset, bCnt);
|
||||
mCurrentOffset = linetmpPos;
|
||||
}
|
||||
available = available();
|
||||
if(available == 0){
|
||||
// EOF
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads up to len bytes of data from the input stream into an array of bytes. An attempt is made to read
|
||||
* as many as len bytes, but a smaller number may be read. The number of bytes actually read is returned as an integer.
|
||||
*
|
||||
* This method blocks until input data is available, end of file is detected, or an exception is thrown.
|
||||
*
|
||||
* @param buffer buffer into which data is read.
|
||||
* @param offset the start offset in array b at which the data is written.
|
||||
* @param length the maximum number of bytes to read.
|
||||
* @return the total number of bytes read into the buffer, or -1 if there is no more data because the end of
|
||||
* the stream has been reached.
|
||||
*/
|
||||
public int read(final byte[] buffer, int offset, int length)
|
||||
throws IOException {
|
||||
final int originalLength = length;
|
||||
while (length > 0) {
|
||||
final int available = available();
|
||||
if (available == 0) {
|
||||
// Signal EOF to caller
|
||||
if (originalLength == length) {
|
||||
return -1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
final int copyLength = Math.min(length, available);
|
||||
System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength);
|
||||
mCurrentOffset += copyLength;
|
||||
offset += copyLength;
|
||||
length -= copyLength;
|
||||
}
|
||||
return originalLength - length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Seek to the given position in the file. Note that pos is a special virtual file pointer,
|
||||
* not an actual byte offset.
|
||||
*
|
||||
* @param pos virtual file pointer
|
||||
*/
|
||||
public void seek(final long pos)
|
||||
throws IOException {
|
||||
if (mFile == null) {
|
||||
throw new IOException("Cannot seek on stream based file");
|
||||
}
|
||||
// Decode virtual file pointer
|
||||
// Upper 48 bits is the byte offset into the compressed stream of a block.
|
||||
// Lower 16 bits is the byte offset into the uncompressed stream inside the block.
|
||||
final long compressedOffset = BlockCompressedFilePointerUtil.getBlockAddress(pos);
|
||||
final int uncompressedOffset = BlockCompressedFilePointerUtil.getBlockOffset(pos);
|
||||
final int available;
|
||||
if (mBlockAddress == compressedOffset && mCurrentBlock != null) {
|
||||
available = mCurrentBlock.length;
|
||||
} else {
|
||||
mFile.seek(compressedOffset);
|
||||
mBlockAddress = compressedOffset;
|
||||
mLastBlockLength = 0;
|
||||
readBlock();
|
||||
available = available();
|
||||
}
|
||||
if (uncompressedOffset > available ||
|
||||
(uncompressedOffset == available && !eof())) {
|
||||
throw new IOException("Invalid file pointer: " + pos);
|
||||
}
|
||||
mCurrentOffset = uncompressedOffset;
|
||||
}
|
||||
|
||||
private boolean eof() throws IOException {
|
||||
if (mFile.eof()) {
|
||||
return true;
|
||||
}
|
||||
// If the last remaining block is the size of the EMPTY_GZIP_BLOCK, this is the same as being at EOF.
|
||||
return (mFile.length() - (mBlockAddress + mLastBlockLength) == BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return virtual file pointer that can be passed to seek() to return to the current position. This is
|
||||
* not an actual byte offset, so arithmetic on file pointers cannot be done to determine the distance between
|
||||
* the two.
|
||||
*/
|
||||
public long getFilePointer() {
|
||||
if (mCurrentOffset == mCurrentBlock.length) {
|
||||
// If current offset is at the end of the current block, file pointer should point
|
||||
// to the beginning of the next block.
|
||||
return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress + mLastBlockLength, 0);
|
||||
}
|
||||
return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, mCurrentOffset);
|
||||
}
|
||||
|
||||
public static long getFileBlock(final long bgzfOffset) {
|
||||
return BlockCompressedFilePointerUtil.getBlockAddress(bgzfOffset);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param stream Must be at start of file. Throws RuntimeException if !stream.markSupported().
|
||||
* @return true if the given file looks like a valid BGZF file.
|
||||
*/
|
||||
public static boolean isValidFile(final InputStream stream)
|
||||
throws IOException {
|
||||
if (!stream.markSupported()) {
|
||||
throw new RuntimeException("Cannot test non-buffered stream");
|
||||
}
|
||||
stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||
final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH];
|
||||
final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||
stream.reset();
|
||||
return count == BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH && isValidBlockHeader(buffer);
|
||||
}
|
||||
|
||||
private static boolean isValidBlockHeader(final byte[] buffer) {
|
||||
return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 &&
|
||||
(buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 &&
|
||||
(buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 &&
|
||||
buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN &&
|
||||
buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 &&
|
||||
buffer[13] == BlockCompressedStreamConstants.BGZF_ID2);
|
||||
}
|
||||
|
||||
private void readBlock()
|
||||
throws IOException {
|
||||
|
||||
if (mFileBuffer == null) {
|
||||
mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE];
|
||||
}
|
||||
int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||
if (count == 0) {
|
||||
// Handle case where there is no empty gzip block at end.
|
||||
mCurrentOffset = 0;
|
||||
mBlockAddress += mLastBlockLength;
|
||||
mCurrentBlock = new byte[0];
|
||||
return;
|
||||
}
|
||||
if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) {
|
||||
throw new IOException("Premature end of file");
|
||||
}
|
||||
final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1;
|
||||
if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) {
|
||||
throw new IOException("Unexpected compressed block length: " + blockLength);
|
||||
}
|
||||
final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH;
|
||||
count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining);
|
||||
if (count != remaining) {
|
||||
throw new FileTruncatedException("Premature end of file");
|
||||
}
|
||||
inflateBlock(mFileBuffer, blockLength);
|
||||
mCurrentOffset = 0;
|
||||
mBlockAddress += mLastBlockLength;
|
||||
mLastBlockLength = blockLength;
|
||||
}
|
||||
|
||||
private void inflateBlock(final byte[] compressedBlock, final int compressedLength)
|
||||
throws IOException {
|
||||
final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4);
|
||||
byte[] buffer = mCurrentBlock;
|
||||
mCurrentBlock = null;
|
||||
if (buffer == null || buffer.length != uncompressedLength) {
|
||||
try {
|
||||
buffer = new byte[uncompressedLength];
|
||||
} catch (NegativeArraySizeException e) {
|
||||
throw new RuntimeException("BGZF file has invalid uncompressedLength: " + uncompressedLength, e);
|
||||
}
|
||||
}
|
||||
blockGunzipper.unzipBlock(buffer, compressedBlock, compressedLength);
|
||||
mCurrentBlock = buffer;
|
||||
}
|
||||
|
||||
private int readBytes(final byte[] buffer, final int offset, final int length)
|
||||
throws IOException {
|
||||
if (mFile != null) {
|
||||
return readBytes(mFile, buffer, offset, length);
|
||||
} else if (mStream != null) {
|
||||
return readBytes(mStream, buffer, offset, length);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
private static int readBytes(final SeekableStream file, final byte[] buffer, final int offset, final int length)
|
||||
throws IOException {
|
||||
int bytesRead = 0;
|
||||
while (bytesRead < length) {
|
||||
final int count = file.read(buffer, offset + bytesRead, length - bytesRead);
|
||||
if (count <= 0) {
|
||||
break;
|
||||
}
|
||||
bytesRead += count;
|
||||
}
|
||||
return bytesRead;
|
||||
}
|
||||
|
||||
private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length)
|
||||
throws IOException {
|
||||
int bytesRead = 0;
|
||||
while (bytesRead < length) {
|
||||
final int count = stream.read(buffer, offset + bytesRead, length - bytesRead);
|
||||
if (count <= 0) {
|
||||
break;
|
||||
}
|
||||
bytesRead += count;
|
||||
}
|
||||
return bytesRead;
|
||||
}
|
||||
|
||||
private int unpackInt16(final byte[] buffer, final int offset) {
|
||||
return ((buffer[offset] & 0xFF) |
|
||||
((buffer[offset+1] & 0xFF) << 8));
|
||||
}
|
||||
|
||||
private int unpackInt32(final byte[] buffer, final int offset) {
|
||||
return ((buffer[offset] & 0xFF) |
|
||||
((buffer[offset+1] & 0xFF) << 8) |
|
||||
((buffer[offset+2] & 0xFF) << 16) |
|
||||
((buffer[offset+3] & 0xFF) << 24));
|
||||
}
|
||||
|
||||
public enum FileTermination {HAS_TERMINATOR_BLOCK, HAS_HEALTHY_LAST_BLOCK, DEFECTIVE}
|
||||
|
||||
public static FileTermination checkTermination(final File file)
|
||||
throws IOException {
|
||||
final long fileSize = file.length();
|
||||
if (fileSize < BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length) {
|
||||
return FileTermination.DEFECTIVE;
|
||||
}
|
||||
final RandomAccessFile raFile = new RandomAccessFile(file, "r");
|
||||
try {
|
||||
raFile.seek(fileSize - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
|
||||
byte[] buf = new byte[BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length];
|
||||
raFile.readFully(buf);
|
||||
if (Arrays.equals(buf, BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK)) {
|
||||
return FileTermination.HAS_TERMINATOR_BLOCK;
|
||||
}
|
||||
final int bufsize = (int)Math.min(fileSize, BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE);
|
||||
buf = new byte[bufsize];
|
||||
raFile.seek(fileSize - bufsize);
|
||||
raFile.read(buf);
|
||||
for (int i = buf.length - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length;
|
||||
i >= 0; --i) {
|
||||
if (!preambleEqual(BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE,
|
||||
buf, i, BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length)) {
|
||||
continue;
|
||||
}
|
||||
final ByteBuffer byteBuffer = ByteBuffer.wrap(buf, i + BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length, 4);
|
||||
byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
|
||||
final int totalBlockSizeMinusOne = byteBuffer.getShort() & 0xFFFF;
|
||||
if (buf.length - i == totalBlockSizeMinusOne + 1) {
|
||||
return FileTermination.HAS_HEALTHY_LAST_BLOCK;
|
||||
} else {
|
||||
return FileTermination.DEFECTIVE;
|
||||
}
|
||||
}
|
||||
return FileTermination.DEFECTIVE;
|
||||
} finally {
|
||||
raFile.close();
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean preambleEqual(final byte[] preamble, final byte[] buf, final int startOffset, final int length) {
|
||||
for (int i = 0; i < length; ++i) {
|
||||
if (preamble[i] != buf[i + startOffset]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -331,12 +331,12 @@ public abstract class CommandLineProgram {
|
|||
* used to indicate an error occurred
|
||||
*
|
||||
* @param msg the message
|
||||
* @param e the error
|
||||
* @param t the error
|
||||
*/
|
||||
public static void exitSystemWithError(String msg, final Exception e) {
|
||||
public static void exitSystemWithError(String msg, final Throwable t) {
|
||||
errorPrintf("------------------------------------------------------------------------------------------%n");
|
||||
errorPrintf("stack trace %n");
|
||||
e.printStackTrace();
|
||||
t.printStackTrace();
|
||||
|
||||
errorPrintf("------------------------------------------------------------------------------------------%n");
|
||||
errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber());
|
||||
|
|
@ -392,10 +392,10 @@ public abstract class CommandLineProgram {
|
|||
/**
|
||||
* used to indicate an error occurred
|
||||
*
|
||||
* @param e the exception occured
|
||||
* @param t the exception that occurred
|
||||
*/
|
||||
public static void exitSystemWithError(Exception e) {
|
||||
exitSystemWithError(e.getMessage(), e);
|
||||
public static void exitSystemWithError(Throwable t) {
|
||||
exitSystemWithError(t.getMessage(), t);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ import java.util.*;
|
|||
*
|
||||
* The IntervalBinding<T> is a formal GATK argument that bridges between a walker and
|
||||
* the engine to construct intervals for traversal at runtime. The IntervalBinding can
|
||||
* either be a RodBinding<T>, a string of one or more intervals, or a file with interval strings.
|
||||
* either be a RodBinding<T>, a string of one interval, or a file with interval strings.
|
||||
* The GATK Engine takes care of initializing the binding when appropriate and determining intervals from it.
|
||||
*
|
||||
* Note that this class is immutable.
|
||||
|
|
@ -92,7 +92,10 @@ public final class IntervalBinding<T extends Feature> {
|
|||
codec.readHeader(lineReader);
|
||||
String line = lineReader.readLine();
|
||||
while ( line != null ) {
|
||||
intervals.add(toolkit.getGenomeLocParser().createGenomeLoc(codec.decodeLoc(line)));
|
||||
final Feature feature = codec.decodeLoc(line);
|
||||
if ( feature == null )
|
||||
throw new UserException.MalformedFile(featureIntervals.getSource(), "Couldn't parse line '" + line + "'");
|
||||
intervals.add(toolkit.getGenomeLocParser().createGenomeLoc(feature));
|
||||
line = lineReader.readLine();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
|
|
@ -105,4 +108,8 @@ public final class IntervalBinding<T extends Feature> {
|
|||
|
||||
return intervals;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return getSource();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.Argument;
|
|||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.commandline.CommandLineProgram;
|
||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
|
||||
import org.broadinstitute.sting.gatk.walkers.Attribution;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
|
|
@ -97,13 +96,20 @@ public class CommandLineGATK extends CommandLineExecutable {
|
|||
// lazy loaded, so they aren't caught elsewhere and made into User Exceptions
|
||||
exitSystemWithUserError(e);
|
||||
} catch (net.sf.samtools.SAMException e) {
|
||||
// Let's try this out and see how it is received by our users
|
||||
checkForTooManyOpenFilesProblem(e.getMessage());
|
||||
exitSystemWithSamError(e);
|
||||
} catch (Exception e) {
|
||||
exitSystemWithError(e);
|
||||
} catch (Throwable t) {
|
||||
checkForTooManyOpenFilesProblem(t.getMessage());
|
||||
exitSystemWithError(t);
|
||||
}
|
||||
}
|
||||
|
||||
private static void checkForTooManyOpenFilesProblem(String message) {
|
||||
// Special case the "Too many open files" error because it's a common User Error for which we know what to do
|
||||
if ( message != null && message.indexOf("Too many open files") != -1 )
|
||||
exitSystemWithUserError(new UserException.TooManyOpenFiles());
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a short blurb about the GATK, copyright info, and where to get documentation.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
|||
import org.broadinstitute.sting.gatk.datasources.reads.*;
|
||||
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||
import org.broadinstitute.sting.gatk.samples.SampleDB;
|
||||
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
|
||||
import org.broadinstitute.sting.gatk.filters.FilterManager;
|
||||
|
|
@ -126,6 +127,11 @@ public class GenomeAnalysisEngine {
|
|||
*/
|
||||
private Collection<ReadFilter> filters;
|
||||
|
||||
/**
|
||||
* Controls the allocation of threads between CPU vs IO.
|
||||
*/
|
||||
private ThreadAllocation threadAllocation;
|
||||
|
||||
/**
|
||||
* A currently hacky unique name for this GATK instance
|
||||
*/
|
||||
|
|
@ -199,6 +205,9 @@ public class GenomeAnalysisEngine {
|
|||
if (this.getArguments().nonDeterministicRandomSeed)
|
||||
resetRandomGenerator(System.currentTimeMillis());
|
||||
|
||||
// Determine how the threads should be divided between CPU vs. IO.
|
||||
determineThreadAllocation();
|
||||
|
||||
// Prepare the data for traversal.
|
||||
initializeDataSources();
|
||||
|
||||
|
|
@ -218,7 +227,7 @@ public class GenomeAnalysisEngine {
|
|||
// create the output streams "
|
||||
initializeOutputStreams(microScheduler.getOutputTracker());
|
||||
|
||||
ShardStrategy shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
|
||||
Iterable<Shard> shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
|
||||
|
||||
// execute the microscheduler, storing the results
|
||||
return microScheduler.execute(this.walker, shardStrategy);
|
||||
|
|
@ -266,6 +275,32 @@ public class GenomeAnalysisEngine {
|
|||
return Collections.unmodifiableList(filters);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse out the thread allocation from the given command-line argument.
|
||||
*/
|
||||
private void determineThreadAllocation() {
|
||||
Tags tags = parsingEngine.getTags(argCollection.numberOfThreads);
|
||||
|
||||
// TODO: Kill this complicated logic once Queue supports arbitrary tagged parameters.
|
||||
Integer numCPUThreads = null;
|
||||
if(tags.containsKey("cpu") && argCollection.numberOfCPUThreads != null)
|
||||
throw new UserException("Number of CPU threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other.");
|
||||
else if(tags.containsKey("cpu"))
|
||||
numCPUThreads = Integer.parseInt(tags.getValue("cpu"));
|
||||
else if(argCollection.numberOfCPUThreads != null)
|
||||
numCPUThreads = argCollection.numberOfCPUThreads;
|
||||
|
||||
Integer numIOThreads = null;
|
||||
if(tags.containsKey("io") && argCollection.numberOfIOThreads != null)
|
||||
throw new UserException("Number of IO threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other.");
|
||||
else if(tags.containsKey("io"))
|
||||
numIOThreads = Integer.parseInt(tags.getValue("io"));
|
||||
else if(argCollection.numberOfIOThreads != null)
|
||||
numIOThreads = argCollection.numberOfIOThreads;
|
||||
|
||||
this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads,numCPUThreads,numIOThreads);
|
||||
}
|
||||
|
||||
/**
|
||||
* Allow subclasses and others within this package direct access to the walker manager.
|
||||
* @return The walker manager used by this package.
|
||||
|
|
@ -286,7 +321,7 @@ public class GenomeAnalysisEngine {
|
|||
throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given");
|
||||
}
|
||||
|
||||
return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),this.getArguments().numberOfThreads);
|
||||
return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation);
|
||||
}
|
||||
|
||||
protected DownsamplingMethod getDownsamplingMethod() {
|
||||
|
|
@ -397,103 +432,52 @@ public class GenomeAnalysisEngine {
|
|||
* @param intervals intervals
|
||||
* @return the sharding strategy
|
||||
*/
|
||||
protected ShardStrategy getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
|
||||
protected Iterable<Shard> getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
|
||||
ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
|
||||
ReferenceDataSource referenceDataSource = this.getReferenceDataSource();
|
||||
// Use monolithic sharding if no index is present. Monolithic sharding is always required for the original
|
||||
// sharding system; it's required with the new sharding system only for locus walkers.
|
||||
if(readsDataSource != null && !readsDataSource.hasIndex() ) {
|
||||
if(!exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
|
||||
|
||||
// If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition.
|
||||
if(!readsDataSource.isEmpty()) {
|
||||
if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
|
||||
throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported.");
|
||||
if(intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
|
||||
if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
|
||||
throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available.");
|
||||
|
||||
Shard.ShardType shardType;
|
||||
if(walker instanceof LocusWalker) {
|
||||
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
|
||||
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
|
||||
shardType = Shard.ShardType.LOCUS;
|
||||
if(intervals == null)
|
||||
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
|
||||
else
|
||||
return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
|
||||
}
|
||||
else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) {
|
||||
// Apply special validation to read pair walkers.
|
||||
if(walker instanceof ReadPairWalker) {
|
||||
if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
|
||||
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker.");
|
||||
if(intervals != null && !intervals.isEmpty())
|
||||
throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
|
||||
}
|
||||
|
||||
if(intervals == null)
|
||||
return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer());
|
||||
else
|
||||
return readsDataSource.createShardIteratorOverIntervals(intervals,new ReadShardBalancer());
|
||||
}
|
||||
else if(walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker)
|
||||
shardType = Shard.ShardType.READ;
|
||||
else
|
||||
throw new UserException.CommandLineException("The GATK cannot currently process unindexed BAM files");
|
||||
|
||||
List<GenomeLoc> region;
|
||||
if(intervals != null)
|
||||
region = intervals.toList();
|
||||
else {
|
||||
region = new ArrayList<GenomeLoc>();
|
||||
for(SAMSequenceRecord sequenceRecord: drivingDataSource.getSequenceDictionary().getSequences())
|
||||
region.add(getGenomeLocParser().createGenomeLoc(sequenceRecord.getSequenceName(),1,sequenceRecord.getSequenceLength()));
|
||||
}
|
||||
|
||||
return new MonolithicShardStrategy(getGenomeLocParser(), readsDataSource,shardType,region);
|
||||
throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName());
|
||||
}
|
||||
else {
|
||||
// TODO -- Determine what the ideal shard size should be here. Matt suggested that a multiple of 16K might work well
|
||||
// TODO -- (because of how VCF indexes work), but my empirical experience has been simply that the larger the shard
|
||||
// TODO -- size the more efficient the traversal (at least for RODWalkers). Keeping the previous values for now. [EB]
|
||||
final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000;
|
||||
if(intervals == null)
|
||||
return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE);
|
||||
else
|
||||
return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE);
|
||||
}
|
||||
|
||||
ShardStrategy shardStrategy;
|
||||
ShardStrategyFactory.SHATTER_STRATEGY shardType;
|
||||
|
||||
long SHARD_SIZE = 100000L;
|
||||
|
||||
if (walker instanceof LocusWalker) {
|
||||
if (walker instanceof RodWalker) SHARD_SIZE *= 1000;
|
||||
|
||||
if (intervals != null && !intervals.isEmpty()) {
|
||||
if (readsDataSource == null)
|
||||
throw new IllegalArgumentException("readsDataSource is null");
|
||||
if(!readsDataSource.isEmpty() && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
|
||||
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
|
||||
|
||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
||||
referenceDataSource.getReference(),
|
||||
ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE,
|
||||
getGenomeLocParser(),
|
||||
intervals);
|
||||
} else
|
||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
||||
referenceDataSource.getReference(),
|
||||
ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE,getGenomeLocParser());
|
||||
} else if (walker instanceof ReadWalker ||
|
||||
walker instanceof DuplicateWalker) {
|
||||
shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL;
|
||||
|
||||
if (intervals != null && !intervals.isEmpty()) {
|
||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
||||
referenceDataSource.getReference(),
|
||||
shardType,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE,
|
||||
getGenomeLocParser(),
|
||||
intervals);
|
||||
} else {
|
||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
||||
referenceDataSource.getReference(),
|
||||
shardType,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE,
|
||||
getGenomeLocParser());
|
||||
}
|
||||
} else if (walker instanceof ReadPairWalker) {
|
||||
if(readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
|
||||
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers can only walk over query name-sorted data. Please resort your input BAM file.");
|
||||
if(intervals != null && !intervals.isEmpty())
|
||||
throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
|
||||
|
||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
||||
referenceDataSource.getReference(),
|
||||
ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE,
|
||||
getGenomeLocParser());
|
||||
} else
|
||||
throw new ReviewedStingException("Unable to support walker of type" + walker.getClass().getName());
|
||||
|
||||
return shardStrategy;
|
||||
}
|
||||
|
||||
protected boolean flashbackData() {
|
||||
|
|
@ -751,6 +735,8 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
return new SAMDataSource(
|
||||
samReaderIDs,
|
||||
threadAllocation,
|
||||
argCollection.numberOfBAMFileHandles,
|
||||
genomeLocParser,
|
||||
argCollection.useOriginalBaseQualities,
|
||||
argCollection.strictnessLevel,
|
||||
|
|
@ -763,8 +749,7 @@ public class GenomeAnalysisEngine {
|
|||
getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
|
||||
getWalkerBAQQualityMode(),
|
||||
refReader,
|
||||
argCollection.defaultBaseQualities,
|
||||
!argCollection.disableLowMemorySharding);
|
||||
argCollection.defaultBaseQualities);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -30,7 +30,6 @@ public class ReadProperties {
|
|||
private Collection<SAMReaderID> readers = null;
|
||||
private SAMFileHeader header = null;
|
||||
private SAMFileReader.ValidationStringency validationStringency = SAMFileReader.ValidationStringency.STRICT;
|
||||
private Integer readBufferSize = null;
|
||||
private DownsamplingMethod downsamplingMethod = null;
|
||||
private ValidationExclusion exclusionList = null;
|
||||
private Collection<ReadFilter> supplementalFilters = null;
|
||||
|
|
@ -91,14 +90,6 @@ public class ReadProperties {
|
|||
return validationStringency;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a list of the total number of reads that the sharding system should buffer per BAM file.
|
||||
* @return
|
||||
*/
|
||||
public Integer getReadBufferSize() {
|
||||
return readBufferSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the method and parameters used when downsampling reads.
|
||||
* @return Downsample fraction.
|
||||
|
|
@ -150,7 +141,6 @@ public class ReadProperties {
|
|||
* @param header sam file header.
|
||||
* @param useOriginalBaseQualities True if original base qualities should be used.
|
||||
* @param strictness Stringency of reads file parsing.
|
||||
* @param readBufferSize Number of reads to hold in memory per BAM.
|
||||
* @param downsamplingMethod Method for downsampling reads at a given locus.
|
||||
* @param exclusionList what safety checks we're willing to let slide
|
||||
* @param supplementalFilters additional filters to dynamically apply.
|
||||
|
|
@ -169,7 +159,6 @@ public class ReadProperties {
|
|||
SAMFileHeader header,
|
||||
boolean useOriginalBaseQualities,
|
||||
SAMFileReader.ValidationStringency strictness,
|
||||
Integer readBufferSize,
|
||||
DownsamplingMethod downsamplingMethod,
|
||||
ValidationExclusion exclusionList,
|
||||
Collection<ReadFilter> supplementalFilters,
|
||||
|
|
@ -181,7 +170,6 @@ public class ReadProperties {
|
|||
byte defaultBaseQualities) {
|
||||
this.readers = samFiles;
|
||||
this.header = header;
|
||||
this.readBufferSize = readBufferSize;
|
||||
this.validationStringency = strictness;
|
||||
this.downsamplingMethod = downsamplingMethod == null ? DownsamplingMethod.NONE : downsamplingMethod;
|
||||
this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList;
|
||||
|
|
|
|||
|
|
@ -194,10 +194,25 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
|
||||
public ValidationExclusion.TYPE unsafe;
|
||||
|
||||
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis", required = false)
|
||||
public int numberOfThreads = 1;
|
||||
/** How many threads should be allocated to this analysis. */
|
||||
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false)
|
||||
public Integer numberOfThreads = 1;
|
||||
|
||||
@Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching <TAG>:<STRING> or a .txt file containing the filter strings one per line", required = false)
|
||||
/**
|
||||
* The following two arguments (num_cpu_threads, num_io_threads are TEMPORARY since Queue cannot currently support arbitrary tagged data types.
|
||||
* TODO: Kill this when I can do a tagged integer in Queue.
|
||||
*/
|
||||
@Argument(fullName="num_cpu_threads", shortName = "nct", doc="How many of the given threads should be allocated to the CPU", required = false)
|
||||
@Hidden
|
||||
public Integer numberOfCPUThreads = null;
|
||||
@Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false)
|
||||
@Hidden
|
||||
public Integer numberOfIOThreads = null;
|
||||
|
||||
@Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false)
|
||||
public Integer numberOfBAMFileHandles = null;
|
||||
|
||||
@Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching <TAG>:<STRING> or a .txt file containing the filter strings one per line.", required = false)
|
||||
public List<String> readGroupBlackList = null;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -292,9 +307,6 @@ public class GATKArgumentCollection {
|
|||
@Hidden
|
||||
public boolean allowIntervalsWithUnindexedBAM = false;
|
||||
|
||||
@Argument(fullName="disable_experimental_low_memory_sharding",doc="Disable experimental low-memory sharding functionality",required=false)
|
||||
public boolean disableLowMemorySharding = false;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// methods
|
||||
|
|
@ -365,7 +377,19 @@ public class GATKArgumentCollection {
|
|||
(other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage))) {
|
||||
return false;
|
||||
}
|
||||
if (other.numberOfThreads != this.numberOfThreads) {
|
||||
if (!other.numberOfThreads.equals(this.numberOfThreads)) {
|
||||
return false;
|
||||
}
|
||||
if ((this.numberOfCPUThreads == null && other.numberOfCPUThreads != null) ||
|
||||
this.numberOfCPUThreads.equals(other.numberOfCPUThreads) ) {
|
||||
return false;
|
||||
}
|
||||
if ((this.numberOfIOThreads == null && other.numberOfIOThreads != null) ||
|
||||
this.numberOfIOThreads.equals(other.numberOfIOThreads) ) {
|
||||
return false;
|
||||
}
|
||||
if ((other.numberOfBAMFileHandles == null && this.numberOfBAMFileHandles != null) ||
|
||||
(other.numberOfBAMFileHandles != null && !other.numberOfBAMFileHandles.equals(this.numberOfBAMFileHandles))) {
|
||||
return false;
|
||||
}
|
||||
if (other.intervalMerging != this.intervalMerging) {
|
||||
|
|
@ -389,9 +413,6 @@ public class GATKArgumentCollection {
|
|||
if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM)
|
||||
return false;
|
||||
|
||||
if (disableLowMemorySharding != other.disableLowMemorySharding)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -131,7 +131,7 @@ public class AlignmentContextUtils {
|
|||
}
|
||||
}
|
||||
|
||||
public static Map<String, AlignmentContext> splitContextBySampleName(ReadBackedPileup pileup, String assumedSingleSample) {
|
||||
public static Map<String, AlignmentContext> splitContextBySampleName(ReadBackedPileup pileup) {
|
||||
return splitContextBySampleName(new AlignmentContext(pileup.getLocation(), pileup));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -80,7 +80,7 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
|
|||
// grab the ROD iterator from the data source, and compute the first location in this shard, forwarding
|
||||
// the iterator to immediately before it, so that it can be added to the merging iterator primed for
|
||||
// next() to return the first real ROD in this shard
|
||||
LocationAwareSeekableRODIterator it = dataSource.seek(provider.getShard());
|
||||
LocationAwareSeekableRODIterator it = dataSource.seek(provider.getLocus());
|
||||
it.seekForward(genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart()-1));
|
||||
|
||||
states.add(new ReferenceOrderedDataState(dataSource,it));
|
||||
|
|
|
|||
|
|
@ -1,128 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mhanna
|
||||
* Date: Feb 7, 2011
|
||||
* Time: 2:46:34 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class BAMBlockStartIterator implements Iterator<Long> {
|
||||
/**
|
||||
* How large is a BGZF header?
|
||||
*/
|
||||
private static int BGZF_HEADER_SIZE = 18;
|
||||
|
||||
/**
|
||||
* Where within the header does the BLOCKSIZE actually live?
|
||||
*/
|
||||
private static int BLOCK_SIZE_HEADER_POSITION = BGZF_HEADER_SIZE - 2;
|
||||
|
||||
private FileChannel bamInputChannel;
|
||||
private ByteBuffer headerByteBuffer;
|
||||
|
||||
private long nextLocation = 0;
|
||||
|
||||
public BAMBlockStartIterator(File bamFile) {
|
||||
try {
|
||||
FileInputStream bamInputStream = new FileInputStream(bamFile);
|
||||
bamInputChannel = bamInputStream.getChannel();
|
||||
|
||||
headerByteBuffer = ByteBuffer.allocate(BGZF_HEADER_SIZE);
|
||||
headerByteBuffer.order(ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new StingException("Could not open file",ex);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextLocation != -1;
|
||||
}
|
||||
|
||||
public Long next() {
|
||||
long currentLocation = nextLocation;
|
||||
advance();
|
||||
return currentLocation;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Cannot remove from a BAMBlockStartIterator");
|
||||
}
|
||||
|
||||
private void advance() {
|
||||
int readStatus;
|
||||
|
||||
headerByteBuffer.clear();
|
||||
try {
|
||||
readStatus = bamInputChannel.read(headerByteBuffer);
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new StingException("Could not read header data",ex);
|
||||
}
|
||||
|
||||
if(readStatus == -1) {
|
||||
nextLocation = -1;
|
||||
try {
|
||||
bamInputChannel.close();
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new StingException("Could not close input file",ex);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
headerByteBuffer.position(BLOCK_SIZE_HEADER_POSITION);
|
||||
int blockSize = headerByteBuffer.getShort();
|
||||
|
||||
try {
|
||||
bamInputChannel.position(bamInputChannel.position()+blockSize-BGZF_HEADER_SIZE+1);
|
||||
nextLocation = bamInputChannel.position();
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new StingException("Could not reposition input stream",ex);
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String argv[]) throws IOException {
|
||||
BAMBlockStartIterator blockStartIterator = new BAMBlockStartIterator(new File("/Users/mhanna/testdata/reads/MV1994.bam"));
|
||||
int i = 0;
|
||||
while(blockStartIterator.hasNext())
|
||||
System.out.printf("%d -> %d%n",i++,blockStartIterator.next());
|
||||
}
|
||||
}
|
||||
|
|
@ -1,195 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.GATKBin;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import net.sf.samtools.LinearIndex;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Represents the contents of a bam index file for one reference.
|
||||
* A BAM index (.bai) file contains information for all references in the bam file.
|
||||
* This class describes the data present in the index file for one of these references;
|
||||
* including the bins, chunks, and linear index.
|
||||
*/
|
||||
class BAMIndexContent {
|
||||
/**
|
||||
* The reference sequence for the data currently loaded.
|
||||
*/
|
||||
private final int mReferenceSequence;
|
||||
|
||||
/**
|
||||
* A list of all bins in the above reference sequence.
|
||||
*/
|
||||
private final BinList mBinList;
|
||||
|
||||
/**
|
||||
* The linear index for the reference sequence above.
|
||||
*/
|
||||
private final LinearIndex mLinearIndex;
|
||||
|
||||
|
||||
/**
|
||||
* @param referenceSequence Content corresponds to this reference.
|
||||
* @param bins Array of bins represented by this content, possibly sparse
|
||||
* @param numberOfBins Number of non-null bins
|
||||
* @param linearIndex Additional index used to optimize queries
|
||||
*/
|
||||
BAMIndexContent(final int referenceSequence, final GATKBin[] bins, final int numberOfBins, final LinearIndex linearIndex) {
|
||||
this.mReferenceSequence = referenceSequence;
|
||||
this.mBinList = new BinList(bins, numberOfBins);
|
||||
this.mLinearIndex = linearIndex;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reference for this Content
|
||||
*/
|
||||
public int getReferenceSequence() {
|
||||
return mReferenceSequence;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does this content have anything in this bin?
|
||||
*/
|
||||
public boolean containsBin(final GATKBin bin) {
|
||||
return mBinList.getBin(bin.getBinNumber()) != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return iterable list of bins represented by this content
|
||||
*/
|
||||
public BinList getBins() {
|
||||
return mBinList;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the number of non-null bins represented by this content
|
||||
*/
|
||||
int getNumberOfNonNullBins() {
|
||||
return mBinList.getNumberOfNonNullBins();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return all chunks associated with all bins in this content
|
||||
*/
|
||||
public List<GATKChunk> getAllChunks() {
|
||||
List<GATKChunk> allChunks = new ArrayList<GATKChunk>();
|
||||
for (GATKBin b : mBinList)
|
||||
if (b.getChunkList() != null) {
|
||||
allChunks.addAll(Arrays.asList(b.getChunkList()));
|
||||
}
|
||||
return Collections.unmodifiableList(allChunks);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the linear index represented by this content
|
||||
*/
|
||||
public LinearIndex getLinearIndex() {
|
||||
return mLinearIndex;
|
||||
}
|
||||
|
||||
/**
|
||||
* This class is used to encapsulate the list of Bins store in the BAMIndexContent
|
||||
* While it is currently represented as an array, we may decide to change it to an ArrayList or other structure
|
||||
*/
|
||||
class BinList implements Iterable<GATKBin> {
|
||||
|
||||
private final GATKBin[] mBinArray;
|
||||
public final int numberOfNonNullBins;
|
||||
public final int maxBinNumber; // invariant: maxBinNumber = mBinArray.length -1 since array is 0 based
|
||||
|
||||
/**
|
||||
* @param binArray a sparse array representation of the bins. The index into the array is the bin number.
|
||||
* @param numberOfNonNullBins
|
||||
*/
|
||||
BinList(GATKBin[] binArray, int numberOfNonNullBins) {
|
||||
this.mBinArray = binArray;
|
||||
this.numberOfNonNullBins = numberOfNonNullBins;
|
||||
this.maxBinNumber = mBinArray.length - 1;
|
||||
}
|
||||
|
||||
GATKBin getBin(int binNumber) {
|
||||
if (binNumber > maxBinNumber) return null;
|
||||
return mBinArray[binNumber];
|
||||
}
|
||||
|
||||
int getNumberOfNonNullBins() {
|
||||
return numberOfNonNullBins;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets an iterator over all non-null bins.
|
||||
*
|
||||
* @return An iterator over all bins.
|
||||
*/
|
||||
public Iterator<GATKBin> iterator() {
|
||||
return new BinIterator();
|
||||
}
|
||||
|
||||
private class BinIterator implements Iterator<GATKBin> {
|
||||
/**
|
||||
* Stores the bin # of the Bin currently in use.
|
||||
*/
|
||||
private int nextBin;
|
||||
|
||||
public BinIterator() {
|
||||
nextBin = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Are there more bins in this set, waiting to be returned?
|
||||
*
|
||||
* @return True if more bins are remaining.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
while (nextBin <= maxBinNumber) {
|
||||
if (getBin(nextBin) != null) return true;
|
||||
nextBin++;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the next bin in the provided BinList.
|
||||
*
|
||||
* @return the next available bin in the BinList.
|
||||
*/
|
||||
public GATKBin next() {
|
||||
if (!hasNext())
|
||||
throw new NoSuchElementException("This BinIterator is currently empty");
|
||||
GATKBin result = getBin(nextBin);
|
||||
nextBin++;
|
||||
return result;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove from a bin iterator");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,29 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.Bin;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Models a bin at which all BAM files in the merged input stream overlap.
|
||||
*/
|
||||
class BAMOverlap {
|
||||
public final int start;
|
||||
public final int stop;
|
||||
|
||||
private final Map<SAMReaderID,Bin> bins = new HashMap<SAMReaderID,Bin>();
|
||||
|
||||
public BAMOverlap(final int start, final int stop) {
|
||||
this.start = start;
|
||||
this.stop = stop;
|
||||
}
|
||||
|
||||
public void addBin(final SAMReaderID id, final Bin bin) {
|
||||
bins.put(id,bin);
|
||||
}
|
||||
|
||||
public Bin getBin(final SAMReaderID id) {
|
||||
return bins.get(id);
|
||||
}
|
||||
}
|
||||
|
|
@ -84,21 +84,21 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
|
|||
|
||||
/**
|
||||
* Create a new BAM schedule based on the given index.
|
||||
* @param indexFiles Index files.
|
||||
* @param dataSource The SAM data source to use.
|
||||
* @param intervals List of
|
||||
*/
|
||||
public BAMSchedule(final Map<SAMReaderID,GATKBAMIndex> indexFiles, final List<GenomeLoc> intervals) {
|
||||
public BAMSchedule(final SAMDataSource dataSource, final List<GenomeLoc> intervals) {
|
||||
if(intervals.isEmpty())
|
||||
throw new ReviewedStingException("Tried to write schedule for empty interval list.");
|
||||
|
||||
referenceSequence = intervals.get(0).getContigIndex();
|
||||
referenceSequence = dataSource.getHeader().getSequence(intervals.get(0).getContig()).getSequenceIndex();
|
||||
|
||||
createScheduleFile();
|
||||
|
||||
readerIDs.addAll(indexFiles.keySet());
|
||||
readerIDs.addAll(dataSource.getReaderIDs());
|
||||
|
||||
for(final SAMReaderID reader: readerIDs) {
|
||||
final GATKBAMIndex index = indexFiles.get(reader);
|
||||
final GATKBAMIndex index = dataSource.getIndex(reader);
|
||||
final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence);
|
||||
|
||||
int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1);
|
||||
|
|
@ -237,7 +237,10 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
|
|||
if(selectedIterators.isEmpty())
|
||||
return;
|
||||
|
||||
// Create the target schedule entry
|
||||
BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop);
|
||||
|
||||
// For each schedule entry with data, load the data into the merged schedule.
|
||||
for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) {
|
||||
PeekableIterator<BAMScheduleEntry> scheduleIterator = scheduleIterators.get(reader);
|
||||
BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek();
|
||||
|
|
@ -248,6 +251,11 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
|
|||
scheduleIterator.next();
|
||||
}
|
||||
|
||||
// For each schedule entry without data, add a blank entry.
|
||||
for (int reader = selectedIterators.nextClearBit(0); reader < readerIDs.size(); reader = selectedIterators.nextClearBit(reader+1)) {
|
||||
mergedScheduleEntry.addFileSpan(readerIDs.get(reader),new GATKBAMFileSpan());
|
||||
}
|
||||
|
||||
nextScheduleEntry = mergedScheduleEntry;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -27,7 +27,12 @@ package org.broadinstitute.sting.gatk.datasources.reads;
|
|||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileSpan;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -42,21 +47,86 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
|
||||
private FilePointer nextFilePointer = null;
|
||||
|
||||
private final GenomeLocSortedSet loci;
|
||||
private GenomeLocSortedSet loci;
|
||||
private PeekableIterator<GenomeLoc> locusIterator;
|
||||
private GenomeLoc currentLocus;
|
||||
|
||||
private final PeekableIterator<GenomeLoc> locusIterator;
|
||||
public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary referenceSequenceDictionary, final GenomeLocParser parser) {
|
||||
BAMScheduler scheduler = new BAMScheduler(dataSource);
|
||||
GenomeLocSortedSet intervals = new GenomeLocSortedSet(parser);
|
||||
for(SAMSequenceRecord sequence: referenceSequenceDictionary.getSequences()) {
|
||||
// Match only on sequence name; trust startup validation to make sure all the sequences match.
|
||||
if(dataSource.getHeader().getSequenceDictionary().getSequence(sequence.getSequenceName()) != null)
|
||||
intervals.add(parser.createOverEntireContig(sequence.getSequenceName()));
|
||||
}
|
||||
scheduler.populateFilteredIntervalList(intervals);
|
||||
return scheduler;
|
||||
}
|
||||
|
||||
private GenomeLoc currentLocus;
|
||||
public static BAMScheduler createOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
|
||||
BAMScheduler scheduler = new BAMScheduler(dataSource);
|
||||
scheduler.populateUnfilteredIntervalList(parser);
|
||||
return scheduler;
|
||||
}
|
||||
|
||||
public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||
public static BAMScheduler createOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||
BAMScheduler scheduler = new BAMScheduler(dataSource);
|
||||
scheduler.populateFilteredIntervalList(loci);
|
||||
return scheduler;
|
||||
}
|
||||
|
||||
|
||||
private BAMScheduler(final SAMDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
for(SAMReaderID reader: dataSource.getReaderIDs())
|
||||
indexFiles.put(reader,(GATKBAMIndex)dataSource.getIndex(reader));
|
||||
for(SAMReaderID reader: dataSource.getReaderIDs()) {
|
||||
GATKBAMIndex index = dataSource.getIndex(reader);
|
||||
if(index != null)
|
||||
indexFiles.put(reader,dataSource.getIndex(reader));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The consumer has asked for a bounded set of locations. Prepare an iterator over those locations.
|
||||
* @param loci The list of locations to search and iterate over.
|
||||
*/
|
||||
private void populateFilteredIntervalList(final GenomeLocSortedSet loci) {
|
||||
this.loci = loci;
|
||||
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
|
||||
if(locusIterator.hasNext())
|
||||
currentLocus = locusIterator.next();
|
||||
advance();
|
||||
if(!indexFiles.isEmpty()) {
|
||||
// If index data is available, start up the iterator.
|
||||
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
|
||||
if(locusIterator.hasNext())
|
||||
currentLocus = locusIterator.next();
|
||||
advance();
|
||||
}
|
||||
else {
|
||||
// Otherwise, seed the iterator with a single file pointer over the entire region.
|
||||
nextFilePointer = generatePointerOverEntireFileset();
|
||||
for(GenomeLoc locus: loci)
|
||||
nextFilePointer.addLocation(locus);
|
||||
locusIterator = new PeekableIterator<GenomeLoc>(Collections.<GenomeLoc>emptyList().iterator());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The consumer has provided null, meaning to iterate over all available data. Create a file pointer stretching
|
||||
* from just before the start of the region to the end of the region.
|
||||
*/
|
||||
private void populateUnfilteredIntervalList(final GenomeLocParser parser) {
|
||||
this.loci = new GenomeLocSortedSet(parser);
|
||||
locusIterator = new PeekableIterator<GenomeLoc>(Collections.<GenomeLoc>emptyList().iterator());
|
||||
nextFilePointer = generatePointerOverEntireFileset();
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a span that runs from the end of the BAM header to the end of the fle.
|
||||
* @return A file pointer over the specified region.
|
||||
*/
|
||||
private FilePointer generatePointerOverEntireFileset() {
|
||||
FilePointer filePointer = new FilePointer();
|
||||
Map<SAMReaderID,GATKBAMFileSpan> currentPosition = dataSource.getCurrentPosition();
|
||||
for(SAMReaderID reader: dataSource.getReaderIDs())
|
||||
filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart()));
|
||||
return filePointer;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
|
|
@ -67,7 +137,9 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
if(!hasNext())
|
||||
throw new NoSuchElementException("No next element available in interval sharder");
|
||||
FilePointer currentFilePointer = nextFilePointer;
|
||||
nextFilePointer = null;
|
||||
advance();
|
||||
|
||||
return currentFilePointer;
|
||||
}
|
||||
|
||||
|
|
@ -79,13 +151,12 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
if(loci.isEmpty())
|
||||
return;
|
||||
|
||||
nextFilePointer = null;
|
||||
while(nextFilePointer == null && currentLocus != null) {
|
||||
// special case handling of the unmapped shard.
|
||||
if(currentLocus == GenomeLoc.UNMAPPED) {
|
||||
nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED);
|
||||
for(SAMReaderID id: dataSource.getReaderIDs())
|
||||
nextFilePointer.addFileSpans(id,new GATKBAMFileSpan(new GATKChunk(indexFiles.get(id).getStartOfLastLinearBin(),Long.MAX_VALUE)));
|
||||
nextFilePointer.addFileSpans(id,createSpanToEndOfFile(indexFiles.get(id).getStartOfLastLinearBin()));
|
||||
currentLocus = null;
|
||||
continue;
|
||||
}
|
||||
|
|
@ -96,7 +167,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
int coveredRegionStop = Integer.MAX_VALUE;
|
||||
GenomeLoc coveredRegion = null;
|
||||
|
||||
BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indexFiles,currentLocus);
|
||||
BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(currentLocus);
|
||||
|
||||
// No overlapping data at all.
|
||||
if(scheduleEntry != null) {
|
||||
|
|
@ -108,7 +179,6 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
}
|
||||
else {
|
||||
// Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty.
|
||||
//System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex());
|
||||
for(SAMReaderID reader: indexFiles.keySet())
|
||||
nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan());
|
||||
}
|
||||
|
|
@ -116,21 +186,13 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
// Early exit if no bins were found.
|
||||
if(coveredRegion == null) {
|
||||
// for debugging only: maximum split is 16384.
|
||||
if(currentLocus.size() > 16384) {
|
||||
GenomeLoc[] splitContigs = currentLocus.split(currentLocus.getStart()+16384);
|
||||
nextFilePointer.addLocation(splitContigs[0]);
|
||||
currentLocus = splitContigs[1];
|
||||
}
|
||||
else {
|
||||
nextFilePointer.addLocation(currentLocus);
|
||||
currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
|
||||
}
|
||||
nextFilePointer.addLocation(currentLocus);
|
||||
currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Early exit if only part of the first interval was found.
|
||||
if(currentLocus.startsBefore(coveredRegion)) {
|
||||
// for debugging only: maximum split is 16384.
|
||||
int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart();
|
||||
GenomeLoc[] splitContigs = currentLocus.split(splitPoint);
|
||||
nextFilePointer.addLocation(splitContigs[0]);
|
||||
|
|
@ -175,25 +237,30 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
|
||||
/**
|
||||
* Get the next overlapping tree of bins associated with the given BAM file.
|
||||
* @param indices BAM indices.
|
||||
* @param currentLocus The actual locus for which to check overlap.
|
||||
* @return The next schedule entry overlapping with the given list of loci.
|
||||
*/
|
||||
private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map<SAMReaderID,GATKBAMIndex> indices, final GenomeLoc currentLocus) {
|
||||
private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final GenomeLoc currentLocus) {
|
||||
// Make sure that we consult the BAM header to ensure that we're using the correct contig index for this contig name.
|
||||
// This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then
|
||||
// we'll be using the correct contig index for the BAMs.
|
||||
// TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing.
|
||||
final int currentContigIndex = dataSource.getHeader().getSequence(currentLocus.getContig()).getSequenceIndex();
|
||||
|
||||
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
|
||||
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) {
|
||||
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) {
|
||||
if(bamScheduleIterator != null)
|
||||
bamScheduleIterator.close();
|
||||
lastReferenceSequenceLoaded = currentLocus.getContigIndex();
|
||||
lastReferenceSequenceLoaded = currentContigIndex;
|
||||
|
||||
// Naive algorithm: find all elements in current contig for proper schedule creation.
|
||||
List<GenomeLoc> lociInContig = new LinkedList<GenomeLoc>();
|
||||
for(GenomeLoc locus: loci) {
|
||||
if(locus.getContigIndex() == lastReferenceSequenceLoaded)
|
||||
if(dataSource.getHeader().getSequence(locus.getContig()).getSequenceIndex() == lastReferenceSequenceLoaded)
|
||||
lociInContig.add(locus);
|
||||
}
|
||||
|
||||
bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(indices,lociInContig));
|
||||
bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(dataSource,lociInContig));
|
||||
}
|
||||
|
||||
if(!bamScheduleIterator.hasNext())
|
||||
|
|
@ -209,4 +276,13 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? bamScheduleEntry : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a span from the given start point to the end of the file.
|
||||
* @param startOfRegion Start of the region, in encoded coordinates (block start << 16 & block offset).
|
||||
* @return A file span from the given point to the end of the file.
|
||||
*/
|
||||
private GATKBAMFileSpan createSpanToEndOfFile(final long startOfRegion) {
|
||||
return new GATKBAMFileSpan(new GATKChunk(startOfRegion,Long.MAX_VALUE));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,85 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
/**
|
||||
* Preloads BGZF blocks in preparation for unzipping and data processing.
|
||||
* TODO: Right now, the block loader has all threads blocked waiting for a work request. Ultimately this should
|
||||
* TODO: be replaced with a central thread management strategy.
|
||||
*/
|
||||
public class BGZFBlockLoadingDispatcher {
|
||||
/**
|
||||
* The file handle cache, used when allocating blocks from the dispatcher.
|
||||
*/
|
||||
private final FileHandleCache fileHandleCache;
|
||||
|
||||
private final ExecutorService threadPool;
|
||||
|
||||
private final Queue<SAMReaderPosition> inputQueue;
|
||||
|
||||
public BGZFBlockLoadingDispatcher(final int numThreads, final int numFileHandles) {
|
||||
threadPool = Executors.newFixedThreadPool(numThreads);
|
||||
fileHandleCache = new FileHandleCache(numFileHandles);
|
||||
inputQueue = new LinkedList<SAMReaderPosition>();
|
||||
|
||||
threadPool.execute(new BlockLoader(this,fileHandleCache,true));
|
||||
}
|
||||
|
||||
/**
|
||||
* Initiates a request for a new block load.
|
||||
* @param readerPosition Position at which to load.
|
||||
*/
|
||||
void queueBlockLoad(final SAMReaderPosition readerPosition) {
|
||||
synchronized(inputQueue) {
|
||||
inputQueue.add(readerPosition);
|
||||
inputQueue.notify();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Claims the next work request from the queue.
|
||||
* @return The next work request, or null if none is available.
|
||||
*/
|
||||
SAMReaderPosition claimNextWorkRequest() {
|
||||
synchronized(inputQueue) {
|
||||
while(inputQueue.isEmpty()) {
|
||||
try {
|
||||
inputQueue.wait();
|
||||
}
|
||||
catch(InterruptedException ex) {
|
||||
throw new ReviewedStingException("Interrupt occurred waiting for next block reader work item");
|
||||
}
|
||||
}
|
||||
return inputQueue.poll();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,436 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import net.sf.samtools.util.BAMInputStream;
|
||||
import net.sf.samtools.util.BlockCompressedFilePointerUtil;
|
||||
import net.sf.samtools.util.BlockCompressedInputStream;
|
||||
import net.sf.samtools.util.RuntimeEOFException;
|
||||
import net.sf.samtools.util.SeekableStream;
|
||||
import org.broad.tribble.util.BlockCompressedStreamConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedList;
|
||||
|
||||
/**
|
||||
* Presents decompressed blocks to the SAMFileReader.
|
||||
*/
|
||||
public class BlockInputStream extends SeekableStream implements BAMInputStream {
|
||||
/**
|
||||
* Mechanism for triggering block loads.
|
||||
*/
|
||||
private final BGZFBlockLoadingDispatcher dispatcher;
|
||||
|
||||
/**
|
||||
* The reader whose data is supplied by this input stream.
|
||||
*/
|
||||
private final SAMReaderID reader;
|
||||
|
||||
/**
|
||||
* Length of the input stream.
|
||||
*/
|
||||
private final long length;
|
||||
|
||||
/**
|
||||
* The latest error reported by an asynchronous block load.
|
||||
*/
|
||||
private Throwable error;
|
||||
|
||||
/**
|
||||
* Current position.
|
||||
*/
|
||||
private SAMReaderPosition position;
|
||||
|
||||
/**
|
||||
* A stream of compressed data blocks.
|
||||
*/
|
||||
private final ByteBuffer buffer;
|
||||
|
||||
/**
|
||||
* Offsets of the given blocks in the buffer.
|
||||
*/
|
||||
private LinkedList<Integer> blockOffsets = new LinkedList<Integer>();
|
||||
|
||||
/**
|
||||
* Source positions of the given blocks in the buffer.
|
||||
*/
|
||||
private LinkedList<Long> blockPositions = new LinkedList<Long>();
|
||||
|
||||
/**
|
||||
* Provides a lock to wait for more data to arrive.
|
||||
*/
|
||||
private final Object lock = new Object();
|
||||
|
||||
/**
|
||||
* An input stream to use when comparing data back to what it should look like.
|
||||
*/
|
||||
private final BlockCompressedInputStream validatingInputStream;
|
||||
|
||||
/**
|
||||
* Has the buffer been filled since last request?
|
||||
*/
|
||||
private boolean bufferFilled = false;
|
||||
|
||||
/**
|
||||
* Create a new block presenting input stream with a dedicated buffer.
|
||||
* @param dispatcher the block loading messenger.
|
||||
* @param reader the reader for which to load data.
|
||||
* @param validate validates the contents read into the buffer against the contents of a Picard BlockCompressedInputStream.
|
||||
*/
|
||||
BlockInputStream(final BGZFBlockLoadingDispatcher dispatcher, final SAMReaderID reader, final boolean validate) {
|
||||
this.reader = reader;
|
||||
this.length = reader.samFile.length();
|
||||
|
||||
buffer = ByteBuffer.wrap(new byte[64*1024]);
|
||||
buffer.order(ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
// The state of the buffer assumes that the range of data written into the buffer appears in the range
|
||||
// [position,limit), while extra capacity exists in the range [limit,capacity)
|
||||
buffer.limit(0);
|
||||
|
||||
this.dispatcher = dispatcher;
|
||||
// TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream.
|
||||
this.position = new SAMReaderPosition(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE)));
|
||||
|
||||
try {
|
||||
if(validate) {
|
||||
System.out.printf("BlockInputStream %s: BGZF block validation mode activated%n",this);
|
||||
validatingInputStream = new BlockCompressedInputStream(reader.samFile);
|
||||
// A bug in ValidatingInputStream means that calling getFilePointer() immediately after initialization will result in an NPE.
|
||||
// Poke the stream to start reading data.
|
||||
validatingInputStream.available();
|
||||
}
|
||||
else
|
||||
validatingInputStream = null;
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
|
||||
}
|
||||
}
|
||||
|
||||
public long length() {
|
||||
return length;
|
||||
}
|
||||
|
||||
public long getFilePointer() {
|
||||
long filePointer;
|
||||
synchronized(lock) {
|
||||
if(buffer.remaining() > 0) {
|
||||
// If there's data in the buffer, figure out from whence it came.
|
||||
final long blockAddress = blockPositions.size() > 0 ? blockPositions.get(0) : 0;
|
||||
final int blockOffset = buffer.position();
|
||||
filePointer = blockAddress << 16 | blockOffset;
|
||||
}
|
||||
else {
|
||||
// Otherwise, find the next position to load.
|
||||
filePointer = position.getBlockAddress() << 16;
|
||||
}
|
||||
}
|
||||
|
||||
if(validatingInputStream != null && filePointer != validatingInputStream.getFilePointer())
|
||||
throw new ReviewedStingException(String.format("Position of input stream is invalid; expected (block address, block offset) = (%d,%d), got (%d,%d)",
|
||||
BlockCompressedFilePointerUtil.getBlockAddress(filePointer),BlockCompressedFilePointerUtil.getBlockOffset(filePointer),
|
||||
BlockCompressedFilePointerUtil.getBlockAddress(validatingInputStream.getFilePointer()),BlockCompressedFilePointerUtil.getBlockOffset(validatingInputStream.getFilePointer())));
|
||||
|
||||
return filePointer;
|
||||
}
|
||||
|
||||
public void seek(long target) {
|
||||
// TODO: Validate the seek point.
|
||||
//System.out.printf("Thread %s, BlockInputStream %s: seeking to block %d, offset %d%n",Thread.currentThread().getId(),this,BlockCompressedFilePointerUtil.getBlockAddress(target),BlockCompressedFilePointerUtil.getBlockOffset(target));
|
||||
synchronized(lock) {
|
||||
clearBuffers();
|
||||
position.advancePosition(BlockCompressedFilePointerUtil.getBlockAddress(target));
|
||||
waitForBufferFill();
|
||||
buffer.position(BlockCompressedFilePointerUtil.getBlockOffset(target));
|
||||
|
||||
if(validatingInputStream != null) {
|
||||
try {
|
||||
validatingInputStream.seek(target);
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void clearBuffers() {
|
||||
this.position.reset();
|
||||
|
||||
// Buffer semantics say that outside of a lock, buffer should always be prepared for reading.
|
||||
// Indicate no data to be read.
|
||||
buffer.clear();
|
||||
buffer.limit(0);
|
||||
|
||||
blockOffsets.clear();
|
||||
blockPositions.clear();
|
||||
}
|
||||
|
||||
public boolean eof() {
|
||||
synchronized(lock) {
|
||||
// TODO: Handle multiple empty BGZF blocks at end of the file.
|
||||
return position != null && position.getBlockAddress() >= length;
|
||||
}
|
||||
}
|
||||
|
||||
public void setCheckCrcs(final boolean check) {
|
||||
// TODO: Implement
|
||||
}
|
||||
|
||||
/**
|
||||
* Submits a new access plan for the given dataset.
|
||||
* @param position The next seek point for BAM data in this reader.
|
||||
*/
|
||||
public void submitAccessPlan(final SAMReaderPosition position) {
|
||||
//System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress());
|
||||
synchronized(lock) {
|
||||
// Assume that the access plan is going to tell us to start where we are and move forward.
|
||||
// If this isn't the case, we'll soon receive a seek request and the buffer will be forced to reset.
|
||||
if(this.position != null && position.getBlockAddress() < this.position.getBlockAddress())
|
||||
position.advancePosition(this.position.getBlockAddress());
|
||||
}
|
||||
this.position = position;
|
||||
}
|
||||
|
||||
private void compactBuffer() {
|
||||
// Compact buffer to maximize storage space.
|
||||
int bytesToRemove = 0;
|
||||
|
||||
// Look ahead to see if we can compact away the first block in the series.
|
||||
while(blockOffsets.size() > 1 && buffer.position() < blockOffsets.get(1)) {
|
||||
bytesToRemove += blockOffsets.remove();
|
||||
blockPositions.remove();
|
||||
}
|
||||
|
||||
// If we end up with an empty block at the end of the series, compact this as well.
|
||||
if(buffer.remaining() == 0 && !blockOffsets.isEmpty() && buffer.position() >= blockOffsets.peek()) {
|
||||
bytesToRemove += buffer.position();
|
||||
blockOffsets.remove();
|
||||
blockPositions.remove();
|
||||
}
|
||||
|
||||
int finalBufferStart = buffer.position() - bytesToRemove;
|
||||
int finalBufferSize = buffer.remaining();
|
||||
|
||||
buffer.position(bytesToRemove);
|
||||
buffer.compact();
|
||||
|
||||
buffer.position(finalBufferStart);
|
||||
buffer.limit(finalBufferStart+finalBufferSize);
|
||||
}
|
||||
|
||||
/**
|
||||
* Push contents of incomingBuffer into the end of this buffer.
|
||||
* MUST be called from a thread that is NOT the reader thread.
|
||||
* @param incomingBuffer The data being pushed into this input stream.
|
||||
* @param position target position for the data.
|
||||
*/
|
||||
public void copyIntoBuffer(final ByteBuffer incomingBuffer, final SAMReaderPosition position, final long filePosition) {
|
||||
synchronized(lock) {
|
||||
try {
|
||||
compactBuffer();
|
||||
// Open up the buffer for more reading.
|
||||
buffer.limit(buffer.capacity());
|
||||
|
||||
// Advance the position to take the most recent read into account.
|
||||
long lastReadPosition = position.getBlockAddress();
|
||||
|
||||
byte[] validBytes = null;
|
||||
if(validatingInputStream != null) {
|
||||
validBytes = new byte[incomingBuffer.remaining()];
|
||||
|
||||
byte[] currentBytes = new byte[incomingBuffer.remaining()];
|
||||
int pos = incomingBuffer.position();
|
||||
int lim = incomingBuffer.limit();
|
||||
incomingBuffer.get(currentBytes);
|
||||
|
||||
incomingBuffer.limit(lim);
|
||||
incomingBuffer.position(pos);
|
||||
|
||||
long currentFilePointer = validatingInputStream.getFilePointer();
|
||||
validatingInputStream.seek(lastReadPosition << 16);
|
||||
validatingInputStream.read(validBytes);
|
||||
validatingInputStream.seek(currentFilePointer);
|
||||
|
||||
if(!Arrays.equals(validBytes,currentBytes))
|
||||
throw new ReviewedStingException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this));
|
||||
}
|
||||
|
||||
this.position = position;
|
||||
position.advancePosition(filePosition);
|
||||
|
||||
if(buffer.remaining() < incomingBuffer.remaining()) {
|
||||
//System.out.printf("Thread %s: waiting for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n",Thread.currentThread().getId(),buffer.remaining(),incomingBuffer.remaining());
|
||||
lock.wait();
|
||||
//System.out.printf("Thread %s: waited for available space in buffer; buffer remaining = %d, incoming buffer remaining = %d%n", Thread.currentThread().getId(), buffer.remaining(), incomingBuffer.remaining());
|
||||
}
|
||||
|
||||
// Queue list of block offsets / block positions.
|
||||
blockOffsets.add(buffer.position());
|
||||
blockPositions.add(lastReadPosition);
|
||||
|
||||
buffer.put(incomingBuffer);
|
||||
|
||||
// Set up the buffer for reading.
|
||||
buffer.flip();
|
||||
bufferFilled = true;
|
||||
|
||||
lock.notify();
|
||||
}
|
||||
catch(Exception ex) {
|
||||
reportException(ex);
|
||||
lock.notify();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void reportException(Throwable t) {
|
||||
synchronized(lock) {
|
||||
this.error = t;
|
||||
lock.notify();
|
||||
}
|
||||
}
|
||||
|
||||
private void checkForErrors() {
|
||||
synchronized(lock) {
|
||||
if(error != null) {
|
||||
ReviewedStingException toThrow = new ReviewedStingException(String.format("Thread %s, BlockInputStream %s: Unable to retrieve BAM data from disk",Thread.currentThread().getId(),this),error);
|
||||
toThrow.setStackTrace(error.getStackTrace());
|
||||
throw toThrow;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the next byte of data from the input stream.
|
||||
* @return Next byte of data, from 0->255, as an int.
|
||||
*/
|
||||
@Override
|
||||
public int read() {
|
||||
byte[] singleByte = new byte[1];
|
||||
read(singleByte);
|
||||
return singleByte[0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills the given byte array to the extent possible.
|
||||
* @param bytes byte array to be filled.
|
||||
* @return The number of bytes actually read.
|
||||
*/
|
||||
@Override
|
||||
public int read(byte[] bytes) {
|
||||
return read(bytes,0,bytes.length);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(byte[] bytes, final int offset, final int length) {
|
||||
int remaining = length;
|
||||
synchronized(lock) {
|
||||
while(remaining > 0) {
|
||||
// Check for error conditions during last read.
|
||||
checkForErrors();
|
||||
|
||||
// If completely out of space, queue up another buffer fill.
|
||||
waitForBufferFill();
|
||||
|
||||
// Couldn't manage to load any data at all; abort and return what's available.
|
||||
if(buffer.remaining() == 0)
|
||||
break;
|
||||
|
||||
int numBytesToCopy = Math.min(buffer.remaining(),remaining);
|
||||
buffer.get(bytes,length-remaining+offset,numBytesToCopy);
|
||||
remaining -= numBytesToCopy;
|
||||
|
||||
//if(remaining > 0)
|
||||
// System.out.printf("Thread %s: read the first %d bytes of a %d byte request%n",Thread.currentThread().getId(),length-remaining,length);
|
||||
// TODO: Assert that we don't copy across a block boundary
|
||||
}
|
||||
|
||||
// Notify any waiting threads that some of the contents of the buffer were removed.
|
||||
if(length-remaining > 0)
|
||||
lock.notify();
|
||||
}
|
||||
|
||||
if(validatingInputStream != null) {
|
||||
byte[] validBytes = new byte[length];
|
||||
try {
|
||||
validatingInputStream.read(validBytes,offset,length);
|
||||
for(int i = offset; i < offset+length; i++) {
|
||||
if(bytes[i] != validBytes[i]) {
|
||||
System.out.printf("Thread %s: preparing to throw an exception because contents don't match%n",Thread.currentThread().getId());
|
||||
throw new ReviewedStingException(String.format("Thread %s: blockInputStream %s attempting to return wrong set of bytes; mismatch at offset %d",Thread.currentThread().getId(),this,i));
|
||||
}
|
||||
}
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
|
||||
}
|
||||
}
|
||||
|
||||
return length - remaining;
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if(validatingInputStream != null) {
|
||||
try {
|
||||
validatingInputStream.close();
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new ReviewedStingException("Unable to validate against Picard input stream",ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String getSource() {
|
||||
return reader.getSamFilePath();
|
||||
}
|
||||
|
||||
private void waitForBufferFill() {
|
||||
synchronized(lock) {
|
||||
bufferFilled = false;
|
||||
if(buffer.remaining() == 0 && !eof()) {
|
||||
//System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this);
|
||||
dispatcher.queueBlockLoad(position);
|
||||
try {
|
||||
lock.wait();
|
||||
}
|
||||
catch(InterruptedException ex) {
|
||||
// TODO: handle me.
|
||||
throw new ReviewedStingException("Interrupt occurred waiting for buffer to fill",ex);
|
||||
}
|
||||
|
||||
if(bufferFilled && buffer.remaining() == 0)
|
||||
throw new RuntimeEOFException("No more data left in InputStream");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,188 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import org.broad.tribble.util.BlockCompressedStreamConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.util.zip.DataFormatException;
|
||||
import java.util.zip.Inflater;
|
||||
|
||||
/**
|
||||
* An engine for loading blocks.
|
||||
*/
|
||||
class BlockLoader implements Runnable {
|
||||
/**
|
||||
* Coordinates the input queue.
|
||||
*/
|
||||
private BGZFBlockLoadingDispatcher dispatcher;
|
||||
|
||||
/**
|
||||
* A cache from which to retrieve open file handles.
|
||||
*/
|
||||
private final FileHandleCache fileHandleCache;
|
||||
|
||||
/**
|
||||
* Whether asynchronous decompression should happen.
|
||||
*/
|
||||
private final boolean decompress;
|
||||
|
||||
/**
|
||||
* An direct input buffer for incoming data from disk.
|
||||
*/
|
||||
private final ByteBuffer inputBuffer;
|
||||
|
||||
public BlockLoader(final BGZFBlockLoadingDispatcher dispatcher, final FileHandleCache fileHandleCache, final boolean decompress) {
|
||||
this.dispatcher = dispatcher;
|
||||
this.fileHandleCache = fileHandleCache;
|
||||
this.decompress = decompress;
|
||||
|
||||
this.inputBuffer = ByteBuffer.allocateDirect(64*1024 + BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length);
|
||||
inputBuffer.order(ByteOrder.LITTLE_ENDIAN);
|
||||
}
|
||||
|
||||
public void run() {
|
||||
for(;;) {
|
||||
SAMReaderPosition readerPosition = null;
|
||||
try {
|
||||
readerPosition = dispatcher.claimNextWorkRequest();
|
||||
FileInputStream inputStream = fileHandleCache.claimFileInputStream(readerPosition.getReader());
|
||||
|
||||
long blockAddress = readerPosition.getBlockAddress();
|
||||
//System.out.printf("Thread %s: BlockLoader: copying bytes from %s at position %d into %s%n",Thread.currentThread().getId(),inputStream,blockAddress,readerPosition.getInputStream());
|
||||
|
||||
ByteBuffer compressedBlock = readBGZFBlock(inputStream,readerPosition.getBlockAddress());
|
||||
long nextBlockAddress = position(inputStream);
|
||||
fileHandleCache.releaseFileInputStream(readerPosition.getReader(),inputStream);
|
||||
|
||||
ByteBuffer block = decompress ? decompressBGZFBlock(compressedBlock) : compressedBlock;
|
||||
int bytesCopied = block.remaining();
|
||||
|
||||
BlockInputStream bamInputStream = readerPosition.getInputStream();
|
||||
bamInputStream.copyIntoBuffer(block,readerPosition,nextBlockAddress);
|
||||
|
||||
//System.out.printf("Thread %s: BlockLoader: copied %d bytes from %s at position %d into %s%n",Thread.currentThread().getId(),bytesCopied,inputStream,blockAddress,readerPosition.getInputStream());
|
||||
}
|
||||
catch(Throwable error) {
|
||||
if(readerPosition != null && readerPosition.getInputStream() != null)
|
||||
readerPosition.getInputStream().reportException(error);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private ByteBuffer readBGZFBlock(final FileInputStream inputStream, final long blockAddress) throws IOException {
|
||||
FileChannel channel = inputStream.getChannel();
|
||||
|
||||
// Read the block header
|
||||
channel.position(blockAddress);
|
||||
|
||||
int uncompressedDataSize = 0;
|
||||
int bufferSize = 0;
|
||||
|
||||
do {
|
||||
inputBuffer.clear();
|
||||
inputBuffer.limit(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||
channel.read(inputBuffer);
|
||||
|
||||
// Read out the size of the full BGZF block into a two bit short container, then 'or' that
|
||||
// value into an int buffer to transfer the bitwise contents into an int.
|
||||
inputBuffer.flip();
|
||||
if(inputBuffer.remaining() != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH)
|
||||
throw new ReviewedStingException("BUG: unable to read a the complete block header in one pass.");
|
||||
|
||||
// Verify that the file was read at a valid point.
|
||||
if(unpackUByte8(inputBuffer,0) != BlockCompressedStreamConstants.GZIP_ID1 ||
|
||||
unpackUByte8(inputBuffer,1) != BlockCompressedStreamConstants.GZIP_ID2 ||
|
||||
unpackUByte8(inputBuffer,3) != BlockCompressedStreamConstants.GZIP_FLG ||
|
||||
unpackUInt16(inputBuffer,10) != BlockCompressedStreamConstants.GZIP_XLEN ||
|
||||
unpackUByte8(inputBuffer,12) != BlockCompressedStreamConstants.BGZF_ID1 ||
|
||||
unpackUByte8(inputBuffer,13) != BlockCompressedStreamConstants.BGZF_ID2) {
|
||||
throw new ReviewedStingException("BUG: Started reading compressed block at incorrect position");
|
||||
}
|
||||
|
||||
inputBuffer.position(BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET);
|
||||
bufferSize = unpackUInt16(inputBuffer,BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET)+1;
|
||||
|
||||
// Adjust buffer limits and finish reading the block. Also read the next header, just in case there's a 0-byte block.
|
||||
inputBuffer.limit(bufferSize);
|
||||
inputBuffer.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||
channel.read(inputBuffer);
|
||||
|
||||
// Check the uncompressed length. If 0 and not at EOF, we'll want to check the next block.
|
||||
uncompressedDataSize = inputBuffer.getInt(inputBuffer.limit()-4);
|
||||
//System.out.printf("Uncompressed block size of the current block (at position %d) is %d%n",channel.position()-inputBuffer.limit(),uncompressedDataSize);
|
||||
}
|
||||
while(uncompressedDataSize == 0 && channel.position() < channel.size());
|
||||
|
||||
// Prepare the buffer for reading.
|
||||
inputBuffer.flip();
|
||||
|
||||
return inputBuffer;
|
||||
}
|
||||
|
||||
private ByteBuffer decompressBGZFBlock(final ByteBuffer bgzfBlock) throws DataFormatException {
|
||||
final int compressedBufferSize = bgzfBlock.remaining();
|
||||
|
||||
// Determine the uncompressed buffer size (
|
||||
bgzfBlock.position(bgzfBlock.limit()-4);
|
||||
int uncompressedBufferSize = bgzfBlock.getInt();
|
||||
byte[] uncompressedContent = new byte[uncompressedBufferSize];
|
||||
|
||||
// Bound the CDATA section of the buffer.
|
||||
bgzfBlock.limit(compressedBufferSize-BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH);
|
||||
bgzfBlock.position(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH);
|
||||
byte[] compressedContent = new byte[bgzfBlock.remaining()];
|
||||
ByteBuffer.wrap(compressedContent).put(bgzfBlock);
|
||||
|
||||
// Decompress the buffer.
|
||||
final Inflater inflater = new Inflater(true);
|
||||
inflater.setInput(compressedContent);
|
||||
int bytesUncompressed = inflater.inflate(uncompressedContent);
|
||||
if(bytesUncompressed != uncompressedBufferSize)
|
||||
throw new ReviewedStingException("Error decompressing block");
|
||||
|
||||
return ByteBuffer.wrap(uncompressedContent);
|
||||
}
|
||||
|
||||
private long position(final FileInputStream inputStream) throws IOException {
|
||||
return inputStream.getChannel().position();
|
||||
}
|
||||
|
||||
private int unpackUByte8(final ByteBuffer buffer,final int position) {
|
||||
return buffer.get(position) & 0xFF;
|
||||
}
|
||||
|
||||
private int unpackUInt16(final ByteBuffer buffer,final int position) {
|
||||
// Read out the size of the full BGZF block into a two bit short container, then 'or' that
|
||||
// value into an int buffer to transfer the bitwise contents into an int.
|
||||
return buffer.getShort(position) & 0xFFFF;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,231 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Queue;
|
||||
|
||||
/**
|
||||
* Caches frequently used file handles. Right now, caches only a single file handle.
|
||||
* TODO: Generalize to support arbitrary file handle caches.
|
||||
*/
|
||||
public class FileHandleCache {
|
||||
/**
|
||||
* The underlying data structure storing file handles.
|
||||
*/
|
||||
private final FileHandleStorage fileHandleStorage;
|
||||
|
||||
/**
|
||||
* How many file handles should be kept open at once.
|
||||
*/
|
||||
private final int cacheSize;
|
||||
|
||||
/**
|
||||
* A uniquifier: assign a unique ID to every instance of a file handle.
|
||||
*/
|
||||
private final Map<SAMReaderID,Integer> keyCounter = new HashMap<SAMReaderID,Integer>();
|
||||
|
||||
/**
|
||||
* A shared lock, private so that outside users cannot notify it.
|
||||
*/
|
||||
private final Object lock = new Object();
|
||||
|
||||
/**
|
||||
* Indicates how many file handles are outstanding at this point.
|
||||
*/
|
||||
private int numOutstandingFileHandles = 0;
|
||||
|
||||
/**
|
||||
* Create a new file handle cache of the given cache size.
|
||||
* @param cacheSize how many readers to hold open at once.
|
||||
*/
|
||||
public FileHandleCache(final int cacheSize) {
|
||||
this.cacheSize = cacheSize;
|
||||
fileHandleStorage = new FileHandleStorage();
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves or opens a file handle for the given reader ID.
|
||||
* @param key The ke
|
||||
* @return A file input stream from the cache, if available, or otherwise newly opened.
|
||||
*/
|
||||
public FileInputStream claimFileInputStream(final SAMReaderID key) {
|
||||
synchronized(lock) {
|
||||
FileInputStream inputStream = findExistingEntry(key);
|
||||
if(inputStream == null) {
|
||||
try {
|
||||
// If the cache is maxed out, wait for another file handle to emerge.
|
||||
if(numOutstandingFileHandles >= cacheSize)
|
||||
lock.wait();
|
||||
}
|
||||
catch(InterruptedException ex) {
|
||||
throw new ReviewedStingException("Interrupted while waiting for a file handle");
|
||||
}
|
||||
inputStream = openInputStream(key);
|
||||
}
|
||||
numOutstandingFileHandles++;
|
||||
|
||||
//System.out.printf("Handing input stream %s to thread %s%n",inputStream,Thread.currentThread().getId());
|
||||
return inputStream;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Releases the current reader and returns it to the cache.
|
||||
* @param key The reader.
|
||||
* @param inputStream The stream being used.
|
||||
*/
|
||||
public void releaseFileInputStream(final SAMReaderID key, final FileInputStream inputStream) {
|
||||
synchronized(lock) {
|
||||
numOutstandingFileHandles--;
|
||||
UniqueKey newID = allocateKey(key);
|
||||
fileHandleStorage.put(newID,inputStream);
|
||||
// Let any listeners know that another file handle has become available.
|
||||
lock.notify();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds an existing entry in the storage mechanism.
|
||||
* @param key Reader.
|
||||
* @return a cached stream, if available. Otherwise,
|
||||
*/
|
||||
private FileInputStream findExistingEntry(final SAMReaderID key) {
|
||||
int existingHandles = getMostRecentUniquifier(key);
|
||||
|
||||
// See if any of the keys currently exist in the repository.
|
||||
for(int i = 0; i <= existingHandles; i++) {
|
||||
UniqueKey uniqueKey = new UniqueKey(key,i);
|
||||
if(fileHandleStorage.containsKey(uniqueKey))
|
||||
return fileHandleStorage.remove(uniqueKey);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the most recent uniquifier used for the given reader.
|
||||
* @param reader Reader for which to determine uniqueness.
|
||||
* @return
|
||||
*/
|
||||
private int getMostRecentUniquifier(final SAMReaderID reader) {
|
||||
if(keyCounter.containsKey(reader))
|
||||
return keyCounter.get(reader);
|
||||
else return -1;
|
||||
}
|
||||
|
||||
private UniqueKey allocateKey(final SAMReaderID reader) {
|
||||
int uniquifier = getMostRecentUniquifier(reader)+1;
|
||||
keyCounter.put(reader,uniquifier);
|
||||
return new UniqueKey(reader,uniquifier);
|
||||
}
|
||||
|
||||
private FileInputStream openInputStream(final SAMReaderID reader) {
|
||||
try {
|
||||
return new FileInputStream(reader.getSamFilePath());
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new StingException("Unable to open input file");
|
||||
}
|
||||
}
|
||||
|
||||
private void closeInputStream(final FileInputStream inputStream) {
|
||||
try {
|
||||
inputStream.close();
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new StingException("Unable to open input file");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Actually contains the file handles, purging them as they get too old.
|
||||
*/
|
||||
private class FileHandleStorage extends LinkedHashMap<UniqueKey,FileInputStream> {
|
||||
/**
|
||||
* Remove the oldest entry
|
||||
* @param entry Entry to consider removing.
|
||||
* @return True if the cache size has been exceeded. False otherwise.
|
||||
*/
|
||||
@Override
|
||||
protected boolean removeEldestEntry(Map.Entry<UniqueKey,FileInputStream> entry) {
|
||||
synchronized (lock) {
|
||||
if(size() > cacheSize) {
|
||||
keyCounter.put(entry.getKey().key,keyCounter.get(entry.getKey().key)-1);
|
||||
closeInputStream(entry.getValue());
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Uniquifies a key by adding a numerical uniquifier.
|
||||
*/
|
||||
private class UniqueKey {
|
||||
/**
|
||||
* The file handle's key.
|
||||
*/
|
||||
private final SAMReaderID key;
|
||||
|
||||
/**
|
||||
* A uniquifier, so that multiple of the same reader can exist in the cache.
|
||||
*/
|
||||
private final int uniqueID;
|
||||
|
||||
public UniqueKey(final SAMReaderID reader, final int uniqueID) {
|
||||
this.key = reader;
|
||||
this.uniqueID = uniqueID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if(!(other instanceof UniqueKey))
|
||||
return false;
|
||||
UniqueKey otherUniqueKey = (UniqueKey)other;
|
||||
return key.equals(otherUniqueKey.key) && this.uniqueID == otherUniqueKey.uniqueID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return key.hashCode();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -29,6 +29,7 @@ import net.sf.samtools.GATKBAMFileSpan;
|
|||
import net.sf.samtools.SAMFileSpan;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
|
|
@ -40,28 +41,25 @@ import java.util.*;
|
|||
*/
|
||||
public class FilePointer {
|
||||
protected final SortedMap<SAMReaderID,SAMFileSpan> fileSpans = new TreeMap<SAMReaderID,SAMFileSpan>();
|
||||
protected final BAMOverlap overlap;
|
||||
protected final List<GenomeLoc> locations;
|
||||
protected final List<GenomeLoc> locations = new ArrayList<GenomeLoc>();
|
||||
|
||||
/**
|
||||
* Does this file pointer point into an unmapped region?
|
||||
*/
|
||||
protected final boolean isRegionUnmapped;
|
||||
|
||||
public FilePointer() {
|
||||
this((BAMOverlap)null);
|
||||
}
|
||||
|
||||
public FilePointer(final GenomeLoc location) {
|
||||
this.overlap = null;
|
||||
this.locations = Collections.singletonList(location);
|
||||
this.isRegionUnmapped = GenomeLoc.isUnmapped(location);
|
||||
}
|
||||
|
||||
public FilePointer(final BAMOverlap overlap) {
|
||||
this.overlap = overlap;
|
||||
this.locations = new ArrayList<GenomeLoc>();
|
||||
this.isRegionUnmapped = false;
|
||||
public FilePointer(final GenomeLoc... locations) {
|
||||
this.locations.addAll(Arrays.asList(locations));
|
||||
boolean foundMapped = false, foundUnmapped = false;
|
||||
for(GenomeLoc location: locations) {
|
||||
if(GenomeLoc.isUnmapped(location))
|
||||
foundUnmapped = true;
|
||||
else
|
||||
foundMapped = true;
|
||||
}
|
||||
if(foundMapped && foundUnmapped)
|
||||
throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped.");
|
||||
this.isRegionUnmapped = foundUnmapped;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -217,4 +215,20 @@ public class FilePointer {
|
|||
fileSpan = fileSpan.union((GATKBAMFileSpan)iterators[i].next().getValue());
|
||||
combined.addFileSpans(initialElement.getKey(),fileSpan);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append("FilePointer:%n");
|
||||
builder.append("\tlocations = {");
|
||||
builder.append(Utils.join(";",locations));
|
||||
builder.append("}%n\tregions = %n");
|
||||
for(Map.Entry<SAMReaderID,SAMFileSpan> entry: fileSpans.entrySet()) {
|
||||
builder.append(entry.getKey());
|
||||
builder.append("= {");
|
||||
builder.append(entry.getValue());
|
||||
builder.append("}");
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,419 +25,58 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.AbstractBAMFileIndex;
|
||||
import net.sf.samtools.Bin;
|
||||
import net.sf.samtools.BrowseableBAMIndex;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Shard intervals based on position within the BAM file.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
* Handles the process of aggregating BAM intervals into individual shards.
|
||||
* TODO: The task performed by IntervalSharder is now better performed by LocusShardBalancer. Merge BAMScheduler and IntervalSharder.
|
||||
*/
|
||||
public class IntervalSharder {
|
||||
private static Logger logger = Logger.getLogger(IntervalSharder.class);
|
||||
public class IntervalSharder implements Iterator<FilePointer> {
|
||||
/**
|
||||
* The iterator actually laying out the data for BAM scheduling.
|
||||
*/
|
||||
private final PeekableIterator<FilePointer> wrappedIterator;
|
||||
|
||||
public static Iterator<FilePointer> shardIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||
return new IntervalSharder.FilePointerIterator(dataSource,loci);
|
||||
/**
|
||||
* The parser, for interval manipulation.
|
||||
*/
|
||||
private final GenomeLocParser parser;
|
||||
|
||||
public static IntervalSharder shardOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
|
||||
return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser);
|
||||
}
|
||||
|
||||
public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary sequenceDictionary, final GenomeLocParser parser) {
|
||||
return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource,sequenceDictionary,parser),parser);
|
||||
}
|
||||
|
||||
public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||
return new IntervalSharder(BAMScheduler.createOverIntervals(dataSource,loci),loci.getGenomeLocParser());
|
||||
}
|
||||
|
||||
private IntervalSharder(final BAMScheduler scheduler, final GenomeLocParser parser) {
|
||||
wrappedIterator = new PeekableIterator<FilePointer>(scheduler);
|
||||
this.parser = parser;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return wrappedIterator.hasNext();
|
||||
}
|
||||
|
||||
/**
|
||||
* A lazy-loading iterator over file pointers.
|
||||
* Accumulate shards where there's no additional cost to processing the next shard in the sequence.
|
||||
* @return The next file pointer to process.
|
||||
*/
|
||||
private static class FilePointerIterator implements Iterator<FilePointer> {
|
||||
final SAMDataSource dataSource;
|
||||
final GenomeLocSortedSet loci;
|
||||
final PeekableIterator<GenomeLoc> locusIterator;
|
||||
final Queue<FilePointer> cachedFilePointers = new LinkedList<FilePointer>();
|
||||
|
||||
public FilePointerIterator(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||
this.dataSource = dataSource;
|
||||
this.loci = loci;
|
||||
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
|
||||
advance();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return !cachedFilePointers.isEmpty();
|
||||
}
|
||||
|
||||
public FilePointer next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("FilePointerIterator iteration is complete");
|
||||
FilePointer filePointer = cachedFilePointers.remove();
|
||||
if(cachedFilePointers.isEmpty())
|
||||
advance();
|
||||
return filePointer;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Cannot remove from a FilePointerIterator");
|
||||
}
|
||||
|
||||
private void advance() {
|
||||
GenomeLocSortedSet nextBatch = new GenomeLocSortedSet(loci.getGenomeLocParser());
|
||||
String contig = null;
|
||||
|
||||
// If the next section of the BAM to be processed is unmapped, handle this region separately.
|
||||
while(locusIterator.hasNext() && nextBatch.isEmpty()) {
|
||||
contig = null;
|
||||
while(locusIterator.hasNext() && (contig == null || (!GenomeLoc.isUnmapped(locusIterator.peek()) && locusIterator.peek().getContig().equals(contig)))) {
|
||||
GenomeLoc nextLocus = locusIterator.next();
|
||||
contig = nextLocus.getContig();
|
||||
nextBatch.add(nextLocus);
|
||||
}
|
||||
}
|
||||
|
||||
if(nextBatch.size() > 0) {
|
||||
cachedFilePointers.addAll(shardIntervalsOnContig(dataSource,contig,nextBatch));
|
||||
}
|
||||
}
|
||||
public FilePointer next() {
|
||||
FilePointer current = wrappedIterator.next();
|
||||
while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
|
||||
current = current.combine(parser,wrappedIterator.next());
|
||||
return current;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge / split intervals based on an awareness of the structure of the BAM file.
|
||||
* @param dataSource
|
||||
* @param contig Contig against which to align the intervals. If null, create a file pointer across unmapped reads.
|
||||
* @param loci
|
||||
* @return
|
||||
*/
|
||||
private static List<FilePointer> shardIntervalsOnContig(final SAMDataSource dataSource, final String contig, final GenomeLocSortedSet loci) {
|
||||
// If the contig is null, eliminate the chopping process and build out a file pointer consisting of the unmapped region of all BAMs.
|
||||
if(contig == null) {
|
||||
FilePointer filePointer = new FilePointer(GenomeLoc.UNMAPPED);
|
||||
for(SAMReaderID id: dataSource.getReaderIDs())
|
||||
filePointer.addFileSpans(id,null);
|
||||
return Collections.singletonList(filePointer);
|
||||
}
|
||||
|
||||
// Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
|
||||
List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
||||
FilePointer lastFilePointer = null;
|
||||
BAMOverlap lastBAMOverlap = null;
|
||||
|
||||
Map<SAMReaderID,BrowseableBAMIndex> readerToIndexMap = new HashMap<SAMReaderID,BrowseableBAMIndex>();
|
||||
IntervalSharder.BinMergingIterator binMerger = new IntervalSharder.BinMergingIterator();
|
||||
for(SAMReaderID id: dataSource.getReaderIDs()) {
|
||||
final SAMSequenceRecord referenceSequence = dataSource.getHeader(id).getSequence(contig);
|
||||
// If this contig can't be found in the reference, skip over it.
|
||||
if(referenceSequence == null && contig != null)
|
||||
continue;
|
||||
final BrowseableBAMIndex index = (BrowseableBAMIndex)dataSource.getIndex(id);
|
||||
binMerger.addReader(id,
|
||||
index,
|
||||
referenceSequence.getSequenceIndex(),
|
||||
index.getBinsOverlapping(referenceSequence.getSequenceIndex(),1,referenceSequence.getSequenceLength()).iterator());
|
||||
// Cache the reader for later data lookup.
|
||||
readerToIndexMap.put(id,index);
|
||||
}
|
||||
|
||||
PeekableIterator<BAMOverlap> binIterator = new PeekableIterator<BAMOverlap>(binMerger);
|
||||
|
||||
for(GenomeLoc location: loci) {
|
||||
if(!location.getContig().equals(contig))
|
||||
throw new ReviewedStingException("Location outside bounds of contig");
|
||||
|
||||
if(!binIterator.hasNext())
|
||||
break;
|
||||
|
||||
int locationStart = location.getStart();
|
||||
final int locationStop = location.getStop();
|
||||
|
||||
// Advance to first bin.
|
||||
while(binIterator.peek().stop < locationStart)
|
||||
binIterator.next();
|
||||
|
||||
// Add all relevant bins to a list. If the given bin extends beyond the end of the current interval, make
|
||||
// sure the extending bin is not pruned from the list.
|
||||
List<BAMOverlap> bamOverlaps = new ArrayList<BAMOverlap>();
|
||||
while(binIterator.hasNext() && binIterator.peek().stop <= locationStop)
|
||||
bamOverlaps.add(binIterator.next());
|
||||
if(binIterator.hasNext() && binIterator.peek().start <= locationStop)
|
||||
bamOverlaps.add(binIterator.peek());
|
||||
|
||||
// Bins found; try to match bins with locations.
|
||||
Iterator<BAMOverlap> bamOverlapIterator = bamOverlaps.iterator();
|
||||
|
||||
while(locationStop >= locationStart) {
|
||||
int binStart = lastFilePointer!=null ? lastFilePointer.overlap.start : 0;
|
||||
int binStop = lastFilePointer!=null ? lastFilePointer.overlap.stop : 0;
|
||||
|
||||
while(binStop < locationStart && bamOverlapIterator.hasNext()) {
|
||||
if(lastFilePointer != null && lastFilePointer.locations.size() > 0)
|
||||
filePointers.add(lastFilePointer);
|
||||
|
||||
lastBAMOverlap = bamOverlapIterator.next();
|
||||
lastFilePointer = new FilePointer(lastBAMOverlap);
|
||||
binStart = lastFilePointer.overlap.start;
|
||||
binStop = lastFilePointer.overlap.stop;
|
||||
}
|
||||
|
||||
if(locationStart < binStart) {
|
||||
// The region starts before the first bin in the sequence. Add the region occurring before the sequence.
|
||||
if(lastFilePointer != null && lastFilePointer.locations.size() > 0) {
|
||||
filePointers.add(lastFilePointer);
|
||||
lastFilePointer = null;
|
||||
lastBAMOverlap = null;
|
||||
}
|
||||
|
||||
final int regionStop = Math.min(locationStop,binStart-1);
|
||||
|
||||
GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop);
|
||||
lastFilePointer = new FilePointer(subset);
|
||||
|
||||
locationStart = regionStop + 1;
|
||||
}
|
||||
else if(locationStart > binStop) {
|
||||
// The region starts after the last bin in the sequence. Add the region occurring after the sequence.
|
||||
if(lastFilePointer != null && lastFilePointer.locations.size() > 0) {
|
||||
filePointers.add(lastFilePointer);
|
||||
lastFilePointer = null;
|
||||
lastBAMOverlap = null;
|
||||
}
|
||||
|
||||
GenomeLoc subset = loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,locationStop);
|
||||
filePointers.add(new FilePointer(subset));
|
||||
|
||||
locationStart = locationStop + 1;
|
||||
}
|
||||
else {
|
||||
if(lastFilePointer == null)
|
||||
throw new ReviewedStingException("Illegal state: initializer failed to create cached file pointer.");
|
||||
|
||||
// The start of the region overlaps the bin. Add the overlapping subset.
|
||||
final int regionStop = Math.min(locationStop,binStop);
|
||||
lastFilePointer.addLocation(loci.getGenomeLocParser().createGenomeLoc(location.getContig(),locationStart,regionStop));
|
||||
locationStart = regionStop + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(lastFilePointer != null && lastFilePointer.locations.size() > 0)
|
||||
filePointers.add(lastFilePointer);
|
||||
|
||||
// Lookup the locations for every file pointer in the index.
|
||||
for(SAMReaderID id: readerToIndexMap.keySet()) {
|
||||
BrowseableBAMIndex index = readerToIndexMap.get(id);
|
||||
for(FilePointer filePointer: filePointers)
|
||||
filePointer.addFileSpans(id,index.getSpanOverlapping(filePointer.overlap.getBin(id)));
|
||||
}
|
||||
|
||||
return filePointers;
|
||||
}
|
||||
|
||||
private static class BinMergingIterator implements Iterator<BAMOverlap> {
|
||||
private PriorityQueue<BinQueueState> binQueue = new PriorityQueue<BinQueueState>();
|
||||
private Queue<BAMOverlap> pendingOverlaps = new LinkedList<BAMOverlap>();
|
||||
|
||||
public void addReader(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, Iterator<Bin> bins) {
|
||||
binQueue.add(new BinQueueState(id,index,referenceSequence,new IntervalSharder.LowestLevelBinFilteringIterator(index,bins)));
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return pendingOverlaps.size() > 0 || !binQueue.isEmpty();
|
||||
}
|
||||
|
||||
public BAMOverlap next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("No elements left in merging iterator");
|
||||
if(pendingOverlaps.isEmpty())
|
||||
advance();
|
||||
return pendingOverlaps.remove();
|
||||
}
|
||||
|
||||
public void advance() {
|
||||
List<ReaderBin> bins = new ArrayList<ReaderBin>();
|
||||
int boundsStart, boundsStop;
|
||||
|
||||
// Prime the pump
|
||||
if(binQueue.isEmpty())
|
||||
return;
|
||||
bins.add(getNextBin());
|
||||
boundsStart = bins.get(0).getStart();
|
||||
boundsStop = bins.get(0).getStop();
|
||||
|
||||
// Accumulate all the bins that overlap the current bin, in sorted order.
|
||||
while(!binQueue.isEmpty() && peekNextBin().getStart() <= boundsStop) {
|
||||
ReaderBin bin = getNextBin();
|
||||
bins.add(bin);
|
||||
boundsStart = Math.min(boundsStart,bin.getStart());
|
||||
boundsStop = Math.max(boundsStop,bin.getStop());
|
||||
}
|
||||
|
||||
List<Pair<Integer,Integer>> range = new ArrayList<Pair<Integer,Integer>>();
|
||||
int start = bins.get(0).getStart();
|
||||
int stop = bins.get(0).getStop();
|
||||
while(start <= boundsStop) {
|
||||
// Find the next stopping point.
|
||||
for(ReaderBin bin: bins) {
|
||||
stop = Math.min(stop,bin.getStop());
|
||||
if(start < bin.getStart())
|
||||
stop = Math.min(stop,bin.getStart()-1);
|
||||
}
|
||||
|
||||
range.add(new Pair<Integer,Integer>(start,stop));
|
||||
// If the last entry added included the last element, stop.
|
||||
if(stop >= boundsStop)
|
||||
break;
|
||||
|
||||
// Find the next start.
|
||||
start = stop + 1;
|
||||
for(ReaderBin bin: bins) {
|
||||
if(start >= bin.getStart() && start <= bin.getStop())
|
||||
break;
|
||||
else if(start < bin.getStart()) {
|
||||
start = bin.getStart();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add the next series of BAM overlaps to the window.
|
||||
for(Pair<Integer,Integer> window: range) {
|
||||
BAMOverlap bamOverlap = new BAMOverlap(window.first,window.second);
|
||||
for(ReaderBin bin: bins)
|
||||
bamOverlap.addBin(bin.id,bin.bin);
|
||||
pendingOverlaps.add(bamOverlap);
|
||||
}
|
||||
}
|
||||
|
||||
public void remove() { throw new UnsupportedOperationException("Cannot remove from a merging iterator."); }
|
||||
|
||||
private ReaderBin peekNextBin() {
|
||||
if(binQueue.isEmpty())
|
||||
throw new NoSuchElementException("No more bins are available");
|
||||
BinQueueState current = binQueue.peek();
|
||||
return new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.peekNextBin());
|
||||
}
|
||||
|
||||
private ReaderBin getNextBin() {
|
||||
if(binQueue.isEmpty())
|
||||
throw new NoSuchElementException("No more bins are available");
|
||||
BinQueueState current = binQueue.remove();
|
||||
ReaderBin readerBin = new ReaderBin(current.getReaderID(),current.getIndex(),current.getReferenceSequence(),current.nextBin());
|
||||
if(current.hasNextBin())
|
||||
binQueue.add(current);
|
||||
return readerBin;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Filters out bins not at the lowest level in the tree.
|
||||
*/
|
||||
private static class LowestLevelBinFilteringIterator implements Iterator<Bin> {
|
||||
private BrowseableBAMIndex index;
|
||||
private Iterator<Bin> wrappedIterator;
|
||||
|
||||
private Bin nextBin;
|
||||
|
||||
public LowestLevelBinFilteringIterator(final BrowseableBAMIndex index, Iterator<Bin> iterator) {
|
||||
this.index = index;
|
||||
this.wrappedIterator = iterator;
|
||||
advance();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextBin != null;
|
||||
}
|
||||
|
||||
public Bin next() {
|
||||
Bin bin = nextBin;
|
||||
advance();
|
||||
return bin;
|
||||
}
|
||||
|
||||
public void remove() { throw new UnsupportedOperationException("Remove operation is not supported"); }
|
||||
|
||||
private void advance() {
|
||||
nextBin = null;
|
||||
while(wrappedIterator.hasNext() && nextBin == null) {
|
||||
Bin bin = wrappedIterator.next();
|
||||
if(index.getLevelForBin(bin) == AbstractBAMFileIndex.getNumIndexLevels()-1)
|
||||
nextBin = bin;
|
||||
}
|
||||
}
|
||||
}
|
||||
public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); }
|
||||
}
|
||||
|
||||
class BinQueueState implements Comparable<org.broadinstitute.sting.gatk.datasources.reads.BinQueueState> {
|
||||
private final SAMReaderID id;
|
||||
private final BrowseableBAMIndex index;
|
||||
private final int referenceSequence;
|
||||
private final PeekableIterator<Bin> bins;
|
||||
|
||||
private int firstLocusInCurrentBin;
|
||||
private int lastLocusInCurrentBin;
|
||||
|
||||
public BinQueueState(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Iterator<Bin> bins) {
|
||||
this.id = id;
|
||||
this.index = index;
|
||||
this.referenceSequence = referenceSequence;
|
||||
this.bins = new PeekableIterator<Bin>(bins);
|
||||
refreshLocusInBinCache();
|
||||
}
|
||||
|
||||
public SAMReaderID getReaderID() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public BrowseableBAMIndex getIndex() {
|
||||
return index;
|
||||
}
|
||||
|
||||
public int getReferenceSequence() {
|
||||
return referenceSequence;
|
||||
}
|
||||
|
||||
public boolean hasNextBin() {
|
||||
return bins.hasNext();
|
||||
}
|
||||
|
||||
public Bin peekNextBin() {
|
||||
return bins.peek();
|
||||
}
|
||||
|
||||
public Bin nextBin() {
|
||||
Bin nextBin = bins.next();
|
||||
refreshLocusInBinCache();
|
||||
return nextBin;
|
||||
}
|
||||
|
||||
public int compareTo(org.broadinstitute.sting.gatk.datasources.reads.BinQueueState other) {
|
||||
if(!this.bins.hasNext() && !other.bins.hasNext()) return 0;
|
||||
if(!this.bins.hasNext()) return -1;
|
||||
if(!this.bins.hasNext()) return 1;
|
||||
|
||||
// Both BinQueueStates have next bins. Before proceeding, make sure the bin cache is valid.
|
||||
if(this.firstLocusInCurrentBin <= 0 || this.lastLocusInCurrentBin <= 0 ||
|
||||
other.firstLocusInCurrentBin <= 0 || other.lastLocusInCurrentBin <= 0) {
|
||||
throw new ReviewedStingException("Sharding mechanism error - bin->locus cache is invalid.");
|
||||
}
|
||||
|
||||
// Straight integer subtraction works here because lhsStart, rhsStart always positive.
|
||||
if(this.firstLocusInCurrentBin != other.firstLocusInCurrentBin)
|
||||
return this.firstLocusInCurrentBin - other.firstLocusInCurrentBin;
|
||||
|
||||
// Straight integer subtraction works here because lhsStop, rhsStop always positive.
|
||||
return this.lastLocusInCurrentBin - other.lastLocusInCurrentBin;
|
||||
}
|
||||
|
||||
private void refreshLocusInBinCache() {
|
||||
firstLocusInCurrentBin = -1;
|
||||
lastLocusInCurrentBin = -1;
|
||||
if(bins.hasNext()) {
|
||||
Bin bin = bins.peek();
|
||||
firstLocusInCurrentBin = index.getFirstLocusInBin(bin);
|
||||
lastLocusInCurrentBin = index.getLastLocusInBin(bin);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,55 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Batch granular file pointers into potentially larger shards.
|
||||
*/
|
||||
public class LocusShardBalancer extends ShardBalancer {
|
||||
/**
|
||||
* Convert iterators of file pointers into balanced iterators of shards.
|
||||
* @return An iterator over balanced shards.
|
||||
*/
|
||||
public Iterator<Shard> iterator() {
|
||||
return new Iterator<Shard>() {
|
||||
public boolean hasNext() {
|
||||
return filePointers.hasNext();
|
||||
}
|
||||
|
||||
public Shard next() {
|
||||
FilePointer current = filePointers.next();
|
||||
while(filePointers.hasNext() && current.minus(filePointers.peek()) == 0)
|
||||
current = current.combine(parser,filePointers.next());
|
||||
return new LocusShard(parser,readsDataSource,current.getLocations(),current.fileSpans);
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
@ -1,178 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileSpan;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* A sharding strategy for loci based on reading of the index.
|
||||
*/
|
||||
public class LocusShardStrategy implements ShardStrategy {
|
||||
/**
|
||||
* The data source to use when performing this sharding.
|
||||
*/
|
||||
private final SAMDataSource reads;
|
||||
|
||||
/**
|
||||
* the parser for creating shards
|
||||
*/
|
||||
private GenomeLocParser genomeLocParser;
|
||||
|
||||
/**
|
||||
* An iterator through the available file pointers.
|
||||
*/
|
||||
private final Iterator<FilePointer> filePointerIterator;
|
||||
|
||||
/**
|
||||
* construct the shard strategy from a seq dictionary, a shard size, and and genomeLocs
|
||||
* @param reads Data source from which to load index data.
|
||||
* @param locations List of locations for which to load data.
|
||||
*/
|
||||
public LocusShardStrategy(SAMDataSource reads, IndexedFastaSequenceFile reference, GenomeLocParser genomeLocParser, GenomeLocSortedSet locations) {
|
||||
this.reads = reads;
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
|
||||
if(!reads.isEmpty()) {
|
||||
GenomeLocSortedSet intervals;
|
||||
if(locations == null) {
|
||||
// If no locations were passed in, shard the entire BAM file.
|
||||
SAMFileHeader header = reads.getHeader();
|
||||
intervals = new GenomeLocSortedSet(genomeLocParser);
|
||||
|
||||
for(SAMSequenceRecord readsSequenceRecord: header.getSequenceDictionary().getSequences()) {
|
||||
// Check this sequence against the reference sequence dictionary.
|
||||
// TODO: Do a better job of merging reads + reference.
|
||||
SAMSequenceRecord refSequenceRecord = reference.getSequenceDictionary().getSequence(readsSequenceRecord.getSequenceName());
|
||||
if(refSequenceRecord != null) {
|
||||
final int length = Math.min(readsSequenceRecord.getSequenceLength(),refSequenceRecord.getSequenceLength());
|
||||
intervals.add(genomeLocParser.createGenomeLoc(readsSequenceRecord.getSequenceName(),1,length));
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
intervals = locations;
|
||||
|
||||
if(reads.isLowMemoryShardingEnabled()) {
|
||||
/*
|
||||
Iterator<FilePointer> filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals);
|
||||
List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
||||
while(filePointerIterator.hasNext())
|
||||
filePointers.add(filePointerIterator.next());
|
||||
this.filePointerIterator = filePointers.iterator();
|
||||
*/
|
||||
this.filePointerIterator = new LowMemoryIntervalSharder(this.reads,intervals);
|
||||
}
|
||||
else
|
||||
this.filePointerIterator = IntervalSharder.shardIntervals(this.reads,intervals);
|
||||
}
|
||||
else {
|
||||
final int maxShardSize = 100000;
|
||||
List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
||||
if(locations == null) {
|
||||
for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) {
|
||||
for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) {
|
||||
final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength());
|
||||
filePointers.add(new FilePointer(genomeLocParser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)));
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
for(GenomeLoc interval: locations) {
|
||||
while(interval.size() > maxShardSize) {
|
||||
filePointers.add(new FilePointer(locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)));
|
||||
interval = locations.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop());
|
||||
}
|
||||
filePointers.add(new FilePointer(interval));
|
||||
}
|
||||
}
|
||||
filePointerIterator = filePointers.iterator();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* returns true if there are additional shards
|
||||
*
|
||||
* @return false if we're done processing shards
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return filePointerIterator.hasNext();
|
||||
}
|
||||
|
||||
public long shardNumber = 0;
|
||||
|
||||
/**
|
||||
* gets the next Shard
|
||||
*
|
||||
* @return the next shard
|
||||
*/
|
||||
public LocusShard next() {
|
||||
FilePointer nextFilePointer = filePointerIterator.next();
|
||||
Map<SAMReaderID,SAMFileSpan> fileSpansBounding = nextFilePointer.fileSpans != null ? nextFilePointer.fileSpans : null;
|
||||
|
||||
/*
|
||||
System.out.printf("Shard %d: interval = {",++shardNumber);
|
||||
for(GenomeLoc locus: nextFilePointer.locations)
|
||||
System.out.printf("%s;",locus);
|
||||
System.out.printf("}; ");
|
||||
|
||||
if(fileSpansBounding == null)
|
||||
System.out.printf("no shard data%n");
|
||||
else {
|
||||
SortedMap<SAMReaderID,SAMFileSpan> sortedSpans = new TreeMap<SAMReaderID,SAMFileSpan>(fileSpansBounding);
|
||||
for(Map.Entry<SAMReaderID,SAMFileSpan> entry: sortedSpans.entrySet()) {
|
||||
System.out.printf("Shard %d:%s = {%s}%n",shardNumber,entry.getKey().samFile,entry.getValue());
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
return new LocusShard(genomeLocParser, reads,nextFilePointer.locations,fileSpansBounding);
|
||||
}
|
||||
|
||||
/** we don't support the remove command */
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("ShardStrategies don't support remove()");
|
||||
}
|
||||
|
||||
/**
|
||||
* makes the IntervalShard iterable, i.e. usable in a for loop.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public Iterator<Shard> iterator() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,68 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Handles the process of aggregating BAM intervals into individual shards.
|
||||
*/
|
||||
public class LowMemoryIntervalSharder implements Iterator<FilePointer> {
|
||||
/**
|
||||
* The iterator actually laying out the data for BAM scheduling.
|
||||
*/
|
||||
private final PeekableIterator<FilePointer> wrappedIterator;
|
||||
|
||||
/**
|
||||
* The parser, for interval manipulation.
|
||||
*/
|
||||
private final GenomeLocParser parser;
|
||||
|
||||
public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||
wrappedIterator = new PeekableIterator<FilePointer>(new BAMScheduler(dataSource,loci));
|
||||
parser = loci.getGenomeLocParser();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return wrappedIterator.hasNext();
|
||||
}
|
||||
|
||||
/**
|
||||
* Accumulate shards where there's no additional cost to processing the next shard in the sequence.
|
||||
* @return The next file pointer to process.
|
||||
*/
|
||||
public FilePointer next() {
|
||||
FilePointer current = wrappedIterator.next();
|
||||
while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
|
||||
current = current.combine(parser,wrappedIterator.next());
|
||||
return current;
|
||||
}
|
||||
|
||||
public void remove() { throw new UnsupportedOperationException("Unable to remove from an interval sharder."); }
|
||||
}
|
||||
|
|
@ -1,34 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A single, monolithic shard bridging all available data.
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class MonolithicShard extends Shard {
|
||||
/**
|
||||
* Creates a new monolithic shard of the given type.
|
||||
* @param shardType Type of the shard. Must be either read or locus; cannot be intervalic.
|
||||
* @param locs Intervals that this monolithic shard should process.
|
||||
*/
|
||||
public MonolithicShard(GenomeLocParser parser, SAMDataSource readsDataSource, ShardType shardType, List<GenomeLoc> locs) {
|
||||
super(parser, shardType, locs, readsDataSource, null, false);
|
||||
if(shardType != ShardType.LOCUS && shardType != ShardType.READ)
|
||||
throw new ReviewedStingException("Invalid shard type for monolithic shard: " + shardType);
|
||||
}
|
||||
|
||||
/**
|
||||
* String representation of this shard.
|
||||
* @return "entire genome".
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
return "entire genome";
|
||||
}
|
||||
}
|
||||
|
|
@ -1,77 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* Create a giant shard representing all the data in the input BAM(s).
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class MonolithicShardStrategy implements ShardStrategy {
|
||||
/**
|
||||
* The single shard associated with this sharding strategy.
|
||||
*/
|
||||
private MonolithicShard shard;
|
||||
|
||||
/**
|
||||
* Create a new shard strategy for shards of the given type.
|
||||
* @param shardType The shard type.
|
||||
*/
|
||||
public MonolithicShardStrategy(final GenomeLocParser parser, final SAMDataSource readsDataSource, final Shard.ShardType shardType, final List<GenomeLoc> region) {
|
||||
shard = new MonolithicShard(parser,readsDataSource,shardType,region);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience for using in a foreach loop. Will NOT create a new, reset instance of the iterator;
|
||||
* will only return another copy of the active iterator.
|
||||
* @return A copy of this.
|
||||
*/
|
||||
public Iterator<Shard> iterator() {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the monolithic shard has not yet been consumed, or false otherwise.
|
||||
* @return True if shard has been consumed, false otherwise.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return shard != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the monolithic shard if it has not already been retrieved.
|
||||
* @return The monolithic shard.
|
||||
* @throws NoSuchElementException if no such data exists.
|
||||
*/
|
||||
public Shard next() {
|
||||
if(shard == null)
|
||||
throw new NoSuchElementException("Monolithic shard has already been retrived.");
|
||||
|
||||
Shard working = shard;
|
||||
shard = null;
|
||||
return working;
|
||||
}
|
||||
|
||||
/**
|
||||
* Mandated by the interface, but is unsupported in this context. Will throw an exception always.
|
||||
*/
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Cannot remove from a shard strategy");
|
||||
}
|
||||
|
||||
/**
|
||||
* Mandated by the interface, but is unsupported in this context. Will throw an exception always.
|
||||
* @param size adjust the next size to this
|
||||
*/
|
||||
public void adjustNextShardSize( long size ) {
|
||||
throw new UnsupportedOperationException("Cannot adjust the next size of a monolithic shard; there will be no next shard.");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -35,15 +35,29 @@ import java.util.Map;
|
|||
* @version 0.1
|
||||
*/
|
||||
public class ReadShard extends Shard {
|
||||
/**
|
||||
* What is the maximum number of reads which should go into a read shard.
|
||||
*/
|
||||
public static int MAX_READS = 10000;
|
||||
|
||||
/**
|
||||
* The reads making up this shard.
|
||||
*/
|
||||
private final Collection<SAMRecord> reads = new ArrayList<SAMRecord>(ReadShardStrategy.MAX_READS);
|
||||
private final Collection<SAMRecord> reads = new ArrayList<SAMRecord>(MAX_READS);
|
||||
|
||||
public ReadShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> loci, boolean isUnmapped) {
|
||||
super(parser, ShardType.READ, loci, readsDataSource, fileSpans, isUnmapped);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the maximum number of reads buffered in a read shard. Implemented as a weirdly static interface
|
||||
* until we know what effect tuning this parameter has.
|
||||
* @param bufferSize New maximum number
|
||||
*/
|
||||
static void setReadBufferSize(final int bufferSize) {
|
||||
MAX_READS = bufferSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if this shard is meant to buffer reads, rather
|
||||
* than just holding pointers to their locations.
|
||||
|
|
@ -66,7 +80,7 @@ public class ReadShard extends Shard {
|
|||
* @return True if this shard's buffer is full (and the shard can buffer reads).
|
||||
*/
|
||||
public boolean isBufferFull() {
|
||||
return reads.size() > ReadShardStrategy.MAX_READS;
|
||||
return reads.size() > ReadShard.MAX_READS;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -0,0 +1,127 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.SAMFileSpan;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* Divide up large file pointers containing reads into more manageable subcomponents.
|
||||
*/
|
||||
public class ReadShardBalancer extends ShardBalancer {
|
||||
/**
|
||||
* Convert iterators of file pointers into balanced iterators of shards.
|
||||
* @return An iterator over balanced shards.
|
||||
*/
|
||||
public Iterator<Shard> iterator() {
|
||||
return new Iterator<Shard>() {
|
||||
/**
|
||||
* The cached shard to be returned next. Prefetched in the peekable iterator style.
|
||||
*/
|
||||
private Shard nextShard = null;
|
||||
|
||||
/**
|
||||
* The file pointer currently being processed.
|
||||
*/
|
||||
private FilePointer currentFilePointer;
|
||||
|
||||
/**
|
||||
* Ending position of the last shard in the file.
|
||||
*/
|
||||
private Map<SAMReaderID,GATKBAMFileSpan> position = readsDataSource.getCurrentPosition();
|
||||
|
||||
{
|
||||
if(filePointers.hasNext())
|
||||
currentFilePointer = filePointers.next();
|
||||
advance();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextShard != null;
|
||||
}
|
||||
|
||||
public Shard next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("No next read shard available");
|
||||
Shard currentShard = nextShard;
|
||||
advance();
|
||||
return currentShard;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
|
||||
}
|
||||
|
||||
private void advance() {
|
||||
Map<SAMReaderID,SAMFileSpan> shardPosition;
|
||||
nextShard = null;
|
||||
|
||||
Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||
while(selectedReaders.size() == 0 && currentFilePointer != null) {
|
||||
shardPosition = currentFilePointer.fileSpans;
|
||||
|
||||
for(SAMReaderID id: shardPosition.keySet()) {
|
||||
SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id)));
|
||||
selectedReaders.put(id,fileSpan);
|
||||
}
|
||||
|
||||
if(!isEmpty(selectedReaders)) {
|
||||
Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
|
||||
readsDataSource.fillShard(shard);
|
||||
|
||||
if(!shard.isBufferEmpty()) {
|
||||
nextShard = shard;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
selectedReaders.clear();
|
||||
currentFilePointer = filePointers.hasNext() ? filePointers.next() : null;
|
||||
}
|
||||
|
||||
position = readsDataSource.getCurrentPosition();
|
||||
}
|
||||
|
||||
/**
|
||||
* Detects whether the list of file spans contain any read data.
|
||||
* @param selectedSpans Mapping of readers to file spans.
|
||||
* @return True if file spans are completely empty; false otherwise.
|
||||
*/
|
||||
private boolean isEmpty(Map<SAMReaderID,SAMFileSpan> selectedSpans) {
|
||||
for(SAMFileSpan fileSpan: selectedSpans.values()) {
|
||||
if(!fileSpan.isEmpty())
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,183 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.SAMFileSpan;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* The sharding strategy for reads using a simple counting mechanism. Each read shard
|
||||
* has a specific number of reads (default to 10K) which is configured in the constructor.
|
||||
* @author aaron
|
||||
* @version 1.0
|
||||
* @date Apr 14, 2009
|
||||
*/
|
||||
public class ReadShardStrategy implements ShardStrategy {
|
||||
/**
|
||||
* What is the maximum number of reads which should go into a read shard.
|
||||
*/
|
||||
protected static final int MAX_READS = 10000;
|
||||
|
||||
/**
|
||||
* The data source used to shard.
|
||||
*/
|
||||
private final SAMDataSource dataSource;
|
||||
|
||||
/**
|
||||
* The intervals to be processed.
|
||||
*/
|
||||
private final GenomeLocSortedSet locations;
|
||||
|
||||
/**
|
||||
* The cached shard to be returned next. Prefetched in the peekable iterator style.
|
||||
*/
|
||||
private Shard nextShard = null;
|
||||
|
||||
/** our storage of the genomic locations they'd like to shard over */
|
||||
private final List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
||||
|
||||
/**
|
||||
* Iterator over the list of file pointers.
|
||||
*/
|
||||
private final Iterator<FilePointer> filePointerIterator;
|
||||
|
||||
/**
|
||||
* The file pointer currently being processed.
|
||||
*/
|
||||
private FilePointer currentFilePointer;
|
||||
|
||||
/**
|
||||
* Ending position of the last shard in the file.
|
||||
*/
|
||||
private Map<SAMReaderID,SAMFileSpan> position;
|
||||
|
||||
/**
|
||||
* An indicator whether the strategy has sharded into the unmapped region.
|
||||
*/
|
||||
private boolean isIntoUnmappedRegion = false;
|
||||
|
||||
private final GenomeLocParser parser;
|
||||
|
||||
/**
|
||||
* Create a new read shard strategy, loading read shards from the given BAM file.
|
||||
* @param dataSource Data source from which to load shards.
|
||||
* @param locations intervals to use for sharding.
|
||||
*/
|
||||
public ReadShardStrategy(GenomeLocParser parser, SAMDataSource dataSource, GenomeLocSortedSet locations) {
|
||||
this.dataSource = dataSource;
|
||||
this.parser = parser;
|
||||
this.position = this.dataSource.getCurrentPosition();
|
||||
this.locations = locations;
|
||||
|
||||
if(locations != null)
|
||||
filePointerIterator = dataSource.isLowMemoryShardingEnabled() ? new LowMemoryIntervalSharder(this.dataSource,locations) : IntervalSharder.shardIntervals(this.dataSource,locations);
|
||||
else
|
||||
filePointerIterator = filePointers.iterator();
|
||||
|
||||
if(filePointerIterator.hasNext())
|
||||
currentFilePointer = filePointerIterator.next();
|
||||
|
||||
advance();
|
||||
}
|
||||
|
||||
/**
|
||||
* do we have another read shard?
|
||||
* @return True if any more data is available. False otherwise.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return nextShard != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves the next shard, if available.
|
||||
* @return The next shard, if available.
|
||||
* @throws java.util.NoSuchElementException if no such shard is available.
|
||||
*/
|
||||
public Shard next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("No next read shard available");
|
||||
Shard currentShard = nextShard;
|
||||
advance();
|
||||
return currentShard;
|
||||
}
|
||||
|
||||
public void advance() {
|
||||
Map<SAMReaderID,SAMFileSpan> shardPosition = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||
nextShard = null;
|
||||
|
||||
if(locations != null) {
|
||||
Map<SAMReaderID,SAMFileSpan> selectedReaders = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||
while(selectedReaders.size() == 0 && currentFilePointer != null) {
|
||||
shardPosition = currentFilePointer.fileSpans;
|
||||
|
||||
for(SAMReaderID id: shardPosition.keySet()) {
|
||||
SAMFileSpan fileSpan = shardPosition.get(id).removeContentsBefore(position.get(id));
|
||||
if(!fileSpan.isEmpty())
|
||||
selectedReaders.put(id,fileSpan);
|
||||
}
|
||||
|
||||
if(selectedReaders.size() > 0) {
|
||||
Shard shard = new ReadShard(parser, dataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped);
|
||||
dataSource.fillShard(shard);
|
||||
|
||||
if(!shard.isBufferEmpty()) {
|
||||
nextShard = shard;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
selectedReaders.clear();
|
||||
currentFilePointer = filePointerIterator.hasNext() ? filePointerIterator.next() : null;
|
||||
}
|
||||
}
|
||||
else {
|
||||
// todo -- this nulling of intervals is a bit annoying since readwalkers without
|
||||
// todo -- any -L values need to be special cased throughout the code.
|
||||
Shard shard = new ReadShard(parser,dataSource,position,null,false);
|
||||
dataSource.fillShard(shard);
|
||||
nextShard = !shard.isBufferEmpty() ? shard : null;
|
||||
}
|
||||
|
||||
this.position = dataSource.getCurrentPosition();
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws UnsupportedOperationException always.
|
||||
*/
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Remove not supported");
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience method for using ShardStrategy in an foreach loop.
|
||||
* @return A iterator over shards.
|
||||
*/
|
||||
public Iterator<Shard> iterator() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,33 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.Bin;
|
||||
import net.sf.samtools.BrowseableBAMIndex;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mhanna
|
||||
* Date: Feb 2, 2011
|
||||
* Time: 4:36:40 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class ReaderBin {
|
||||
public final SAMReaderID id;
|
||||
public final BrowseableBAMIndex index;
|
||||
public final int referenceSequence;
|
||||
public final Bin bin;
|
||||
|
||||
public ReaderBin(final SAMReaderID id, final BrowseableBAMIndex index, final int referenceSequence, final Bin bin) {
|
||||
this.id = id;
|
||||
this.index = index;
|
||||
this.referenceSequence = referenceSequence;
|
||||
this.bin = bin;
|
||||
}
|
||||
|
||||
public int getStart() {
|
||||
return index.getFirstLocusInBin(bin);
|
||||
}
|
||||
|
||||
public int getStop() {
|
||||
return index.getLastLocusInBin(bin);
|
||||
}
|
||||
}
|
||||
|
|
@ -37,8 +37,11 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
|||
import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||
import org.broadinstitute.sting.gatk.iterators.*;
|
||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.baq.BAQSamIterator;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -49,6 +52,7 @@ import java.io.File;
|
|||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.*;
|
||||
|
||||
/**
|
||||
* User: aaron
|
||||
|
|
@ -60,6 +64,9 @@ import java.util.*;
|
|||
public class SAMDataSource {
|
||||
final private static GATKSamRecordFactory factory = new GATKSamRecordFactory();
|
||||
|
||||
/** If true, we will load SAMReaders in parallel */
|
||||
final private static boolean USE_PARALLEL_LOADING = false;
|
||||
|
||||
/** Backing support for reads. */
|
||||
protected final ReadProperties readProperties;
|
||||
|
||||
|
|
@ -71,7 +78,7 @@ public class SAMDataSource {
|
|||
/**
|
||||
* Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering.
|
||||
*/
|
||||
private final GenomeLocParser genomeLocParser;
|
||||
protected final GenomeLocParser genomeLocParser;
|
||||
|
||||
/**
|
||||
* Identifiers for the readers driving this data source.
|
||||
|
|
@ -91,13 +98,18 @@ public class SAMDataSource {
|
|||
/**
|
||||
* How far along is each reader?
|
||||
*/
|
||||
private final Map<SAMReaderID, SAMFileSpan> readerPositions = new HashMap<SAMReaderID,SAMFileSpan>();
|
||||
private final Map<SAMReaderID,GATKBAMFileSpan> readerPositions = new HashMap<SAMReaderID,GATKBAMFileSpan>();
|
||||
|
||||
/**
|
||||
* The merged header.
|
||||
*/
|
||||
private final SAMFileHeader mergedHeader;
|
||||
|
||||
/**
|
||||
* The constituent headers of the unmerged files.
|
||||
*/
|
||||
private final Map<SAMReaderID,SAMFileHeader> headers = new HashMap<SAMReaderID,SAMFileHeader>();
|
||||
|
||||
/**
|
||||
* The sort order of the BAM files. Files without a sort order tag are assumed to be
|
||||
* in coordinate order.
|
||||
|
|
@ -131,17 +143,24 @@ public class SAMDataSource {
|
|||
private final SAMResourcePool resourcePool;
|
||||
|
||||
/**
|
||||
* Whether to enable the new low-memory sharding mechanism.
|
||||
* Asynchronously loads BGZF blocks.
|
||||
*/
|
||||
private boolean enableLowMemorySharding = false;
|
||||
private final BGZFBlockLoadingDispatcher dispatcher;
|
||||
|
||||
/**
|
||||
* How are threads allocated.
|
||||
*/
|
||||
private final ThreadAllocation threadAllocation;
|
||||
|
||||
/**
|
||||
* Create a new SAM data source given the supplied read metadata.
|
||||
* @param samFiles list of reads files.
|
||||
*/
|
||||
public SAMDataSource(Collection<SAMReaderID> samFiles,GenomeLocParser genomeLocParser) {
|
||||
public SAMDataSource(Collection<SAMReaderID> samFiles, ThreadAllocation threadAllocation, Integer numFileHandles, GenomeLocParser genomeLocParser) {
|
||||
this(
|
||||
samFiles,
|
||||
threadAllocation,
|
||||
numFileHandles,
|
||||
genomeLocParser,
|
||||
false,
|
||||
SAMFileReader.ValidationStringency.STRICT,
|
||||
|
|
@ -150,8 +169,7 @@ public class SAMDataSource {
|
|||
new ValidationExclusion(),
|
||||
new ArrayList<ReadFilter>(),
|
||||
false,
|
||||
false,
|
||||
true);
|
||||
false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -159,6 +177,8 @@ public class SAMDataSource {
|
|||
*/
|
||||
public SAMDataSource(
|
||||
Collection<SAMReaderID> samFiles,
|
||||
ThreadAllocation threadAllocation,
|
||||
Integer numFileHandles,
|
||||
GenomeLocParser genomeLocParser,
|
||||
boolean useOriginalBaseQualities,
|
||||
SAMFileReader.ValidationStringency strictness,
|
||||
|
|
@ -167,9 +187,10 @@ public class SAMDataSource {
|
|||
ValidationExclusion exclusionList,
|
||||
Collection<ReadFilter> supplementalFilters,
|
||||
boolean includeReadsWithDeletionAtLoci,
|
||||
boolean generateExtendedEvents,
|
||||
boolean enableLowMemorySharding) {
|
||||
boolean generateExtendedEvents) {
|
||||
this( samFiles,
|
||||
threadAllocation,
|
||||
numFileHandles,
|
||||
genomeLocParser,
|
||||
useOriginalBaseQualities,
|
||||
strictness,
|
||||
|
|
@ -182,9 +203,8 @@ public class SAMDataSource {
|
|||
BAQ.CalculationMode.OFF,
|
||||
BAQ.QualityMode.DONT_MODIFY,
|
||||
null, // no BAQ
|
||||
(byte) -1,
|
||||
enableLowMemorySharding);
|
||||
}
|
||||
(byte) -1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new SAM data source given the supplied read metadata.
|
||||
|
|
@ -205,6 +225,8 @@ public class SAMDataSource {
|
|||
*/
|
||||
public SAMDataSource(
|
||||
Collection<SAMReaderID> samFiles,
|
||||
ThreadAllocation threadAllocation,
|
||||
Integer numFileHandles,
|
||||
GenomeLocParser genomeLocParser,
|
||||
boolean useOriginalBaseQualities,
|
||||
SAMFileReader.ValidationStringency strictness,
|
||||
|
|
@ -217,28 +239,45 @@ public class SAMDataSource {
|
|||
BAQ.CalculationMode cmode,
|
||||
BAQ.QualityMode qmode,
|
||||
IndexedFastaSequenceFile refReader,
|
||||
byte defaultBaseQualities,
|
||||
boolean enableLowMemorySharding) {
|
||||
this.enableLowMemorySharding(enableLowMemorySharding);
|
||||
byte defaultBaseQualities) {
|
||||
this.readMetrics = new ReadMetrics();
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
|
||||
readerIDs = samFiles;
|
||||
|
||||
this.threadAllocation = threadAllocation;
|
||||
// TODO: Consider a borrowed-thread dispatcher implementation.
|
||||
if(this.threadAllocation.getNumIOThreads() > 0) {
|
||||
logger.info("Running in asynchronous I/O mode; number of threads = " + this.threadAllocation.getNumIOThreads());
|
||||
dispatcher = new BGZFBlockLoadingDispatcher(this.threadAllocation.getNumIOThreads(), numFileHandles != null ? numFileHandles : 1);
|
||||
}
|
||||
else
|
||||
dispatcher = null;
|
||||
|
||||
validationStringency = strictness;
|
||||
for (SAMReaderID readerID : samFiles) {
|
||||
if (!readerID.samFile.canRead())
|
||||
throw new UserException.CouldNotReadInputFile(readerID.samFile,"file is not present or user does not have appropriate permissions. " +
|
||||
"Please check that the file is present and readable and try again.");
|
||||
if(readBufferSize != null)
|
||||
ReadShard.setReadBufferSize(readBufferSize);
|
||||
else {
|
||||
// Choose a sensible default for the read buffer size. For the moment, we're picking 1000 reads per BAM per shard (which effectively
|
||||
// will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once.
|
||||
ReadShard.setReadBufferSize(Math.min(1000*samFiles.size(),250000));
|
||||
}
|
||||
|
||||
resourcePool = new SAMResourcePool(Integer.MAX_VALUE);
|
||||
SAMReaders readers = resourcePool.getAvailableReaders();
|
||||
|
||||
// Determine the sort order.
|
||||
for(SAMFileReader reader: readers.values()) {
|
||||
for(SAMReaderID readerID: readerIDs) {
|
||||
if (! readerID.samFile.canRead() )
|
||||
throw new UserException.CouldNotReadInputFile(readerID.samFile,"file is not present or user does not have appropriate permissions. " +
|
||||
"Please check that the file is present and readable and try again.");
|
||||
|
||||
// Get the sort order, forcing it to coordinate if unsorted.
|
||||
SAMFileReader reader = readers.getReader(readerID);
|
||||
SAMFileHeader header = reader.getFileHeader();
|
||||
|
||||
headers.put(readerID,header);
|
||||
|
||||
if ( header.getReadGroups().isEmpty() ) {
|
||||
throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile,
|
||||
"SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups");
|
||||
|
|
@ -256,16 +295,14 @@ public class SAMDataSource {
|
|||
|
||||
initializeReaderPositions(readers);
|
||||
|
||||
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,readers.headers(),true);
|
||||
mergedHeader = headerMerger.getMergedHeader();
|
||||
hasReadGroupCollisions = headerMerger.hasReadGroupCollisions();
|
||||
mergedHeader = readers.getMergedHeader();
|
||||
hasReadGroupCollisions = readers.hasReadGroupCollisions();
|
||||
|
||||
readProperties = new ReadProperties(
|
||||
samFiles,
|
||||
mergedHeader,
|
||||
useOriginalBaseQualities,
|
||||
strictness,
|
||||
readBufferSize,
|
||||
downsamplingMethod,
|
||||
exclusionList,
|
||||
supplementalFilters,
|
||||
|
|
@ -275,7 +312,7 @@ public class SAMDataSource {
|
|||
qmode,
|
||||
refReader,
|
||||
defaultBaseQualities);
|
||||
|
||||
|
||||
// cache the read group id (original) -> read group id (merged)
|
||||
// and read group id (merged) -> read group id (original) mappings.
|
||||
for(SAMReaderID id: readerIDs) {
|
||||
|
|
@ -284,9 +321,9 @@ public class SAMDataSource {
|
|||
|
||||
List<SAMReadGroupRecord> readGroups = reader.getFileHeader().getReadGroups();
|
||||
for(SAMReadGroupRecord readGroup: readGroups) {
|
||||
if(headerMerger.hasReadGroupCollisions()) {
|
||||
mappingToMerged.put(readGroup.getReadGroupId(),headerMerger.getReadGroupId(reader,readGroup.getReadGroupId()));
|
||||
mergedToOriginalReadGroupMappings.put(headerMerger.getReadGroupId(reader,readGroup.getReadGroupId()),readGroup.getReadGroupId());
|
||||
if(hasReadGroupCollisions) {
|
||||
mappingToMerged.put(readGroup.getReadGroupId(),readers.getReadGroupId(id,readGroup.getReadGroupId()));
|
||||
mergedToOriginalReadGroupMappings.put(readers.getReadGroupId(id,readGroup.getReadGroupId()),readGroup.getReadGroupId());
|
||||
} else {
|
||||
mappingToMerged.put(readGroup.getReadGroupId(),readGroup.getReadGroupId());
|
||||
mergedToOriginalReadGroupMappings.put(readGroup.getReadGroupId(),readGroup.getReadGroupId());
|
||||
|
|
@ -296,12 +333,10 @@ public class SAMDataSource {
|
|||
originalToMergedReadGroupMappings.put(id,mappingToMerged);
|
||||
}
|
||||
|
||||
if(enableLowMemorySharding) {
|
||||
for(SAMReaderID id: readerIDs) {
|
||||
File indexFile = findIndexFile(id.samFile);
|
||||
if(indexFile != null)
|
||||
bamIndices.put(id,new GATKBAMIndex(indexFile));
|
||||
}
|
||||
for(SAMReaderID id: readerIDs) {
|
||||
File indexFile = findIndexFile(id.samFile);
|
||||
if(indexFile != null)
|
||||
bamIndices.put(id,new GATKBAMIndex(indexFile));
|
||||
}
|
||||
|
||||
resourcePool.releaseReaders(readers);
|
||||
|
|
@ -314,22 +349,6 @@ public class SAMDataSource {
|
|||
*/
|
||||
public ReadProperties getReadsInfo() { return readProperties; }
|
||||
|
||||
/**
|
||||
* Enable experimental low-memory sharding.
|
||||
* @param enable True to enable sharding. False otherwise.
|
||||
*/
|
||||
public void enableLowMemorySharding(final boolean enable) {
|
||||
enableLowMemorySharding = enable;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether low-memory sharding is enabled.
|
||||
* @return True if enabled, false otherwise.
|
||||
*/
|
||||
public boolean isLowMemoryShardingEnabled() {
|
||||
return enableLowMemorySharding;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks to see whether any reads files are supplying data.
|
||||
* @return True if no reads files are supplying data to the traversal; false otherwise.
|
||||
|
|
@ -368,7 +387,7 @@ public class SAMDataSource {
|
|||
* Retrieves the current position within the BAM file.
|
||||
* @return A mapping of reader to current position.
|
||||
*/
|
||||
public Map<SAMReaderID,SAMFileSpan> getCurrentPosition() {
|
||||
public Map<SAMReaderID,GATKBAMFileSpan> getCurrentPosition() {
|
||||
return readerPositions;
|
||||
}
|
||||
|
||||
|
|
@ -381,7 +400,7 @@ public class SAMDataSource {
|
|||
}
|
||||
|
||||
public SAMFileHeader getHeader(SAMReaderID id) {
|
||||
return resourcePool.getReadersWithoutLocking().getReader(id).getFileHeader();
|
||||
return headers.get(id);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -404,45 +423,21 @@ public class SAMDataSource {
|
|||
return mergedToOriginalReadGroupMappings.get(mergedReadGroupId);
|
||||
}
|
||||
|
||||
/**
|
||||
* No read group collisions at this time because only one SAM file is currently supported.
|
||||
* @return False always.
|
||||
*/
|
||||
public boolean hasReadGroupCollisions() {
|
||||
return hasReadGroupCollisions;
|
||||
}
|
||||
|
||||
/**
|
||||
* True if all readers have an index.
|
||||
* @return True if all readers have an index.
|
||||
*/
|
||||
public boolean hasIndex() {
|
||||
if(enableLowMemorySharding)
|
||||
return readerIDs.size() == bamIndices.size();
|
||||
else {
|
||||
for(SAMFileReader reader: resourcePool.getReadersWithoutLocking()) {
|
||||
if(!reader.hasIndex())
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return readerIDs.size() == bamIndices.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the index for a particular reader. Always preloaded.
|
||||
* TODO: Should return object of type GATKBAMIndex, but cannot because there
|
||||
* TODO: is no parent class of both BAMIndex and GATKBAMIndex. Change when new
|
||||
* TODO: sharding system goes live.
|
||||
* @param id Id of the reader.
|
||||
* @return The index. Will preload the index if necessary.
|
||||
*/
|
||||
public Object getIndex(final SAMReaderID id) {
|
||||
if(enableLowMemorySharding)
|
||||
return bamIndices.get(id);
|
||||
else {
|
||||
SAMReaders readers = resourcePool.getReadersWithoutLocking();
|
||||
return readers.getReader(id).getBrowseableIndex();
|
||||
}
|
||||
public GATKBAMIndex getIndex(final SAMReaderID id) {
|
||||
return bamIndices.get(id);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -454,7 +449,7 @@ public class SAMDataSource {
|
|||
}
|
||||
|
||||
/**
|
||||
* Gets the cumulative read metrics for shards already processed.
|
||||
* Gets the cumulative read metrics for shards already processed.
|
||||
* @return Cumulative read metrics.
|
||||
*/
|
||||
public ReadMetrics getCumulativeReadMetrics() {
|
||||
|
|
@ -486,10 +481,13 @@ public class SAMDataSource {
|
|||
// Cache the most recently viewed read so that we can check whether we've reached the end of a pair.
|
||||
SAMRecord read = null;
|
||||
|
||||
Map<SAMFileReader,GATKBAMFileSpan> positionUpdates = new IdentityHashMap<SAMFileReader,GATKBAMFileSpan>();
|
||||
|
||||
CloseableIterator<SAMRecord> iterator = getIterator(readers,shard,sortOrder == SAMFileHeader.SortOrder.coordinate);
|
||||
while(!shard.isBufferFull() && iterator.hasNext()) {
|
||||
read = iterator.next();
|
||||
addReadToBufferingShard(shard,getReaderID(readers,read),read);
|
||||
shard.addRead(read);
|
||||
noteFilePositionUpdate(positionUpdates,read);
|
||||
}
|
||||
|
||||
// If the reads are sorted in queryname order, ensure that all reads
|
||||
|
|
@ -499,18 +497,24 @@ public class SAMDataSource {
|
|||
SAMRecord nextRead = iterator.next();
|
||||
if(read == null || !read.getReadName().equals(nextRead.getReadName()))
|
||||
break;
|
||||
addReadToBufferingShard(shard,getReaderID(readers,nextRead),nextRead);
|
||||
shard.addRead(nextRead);
|
||||
noteFilePositionUpdate(positionUpdates,nextRead);
|
||||
}
|
||||
}
|
||||
|
||||
iterator.close();
|
||||
|
||||
// Make the updates specified by the reader.
|
||||
for(Map.Entry<SAMFileReader,GATKBAMFileSpan> positionUpdate: positionUpdates.entrySet())
|
||||
readerPositions.put(readers.getReaderID(positionUpdate.getKey()),positionUpdate.getValue());
|
||||
}
|
||||
|
||||
private void noteFilePositionUpdate(Map<SAMFileReader,GATKBAMFileSpan> positionMapping, SAMRecord read) {
|
||||
GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing());
|
||||
positionMapping.put(read.getFileSource().getReader(),endChunk);
|
||||
}
|
||||
|
||||
public StingSAMIterator seek(Shard shard) {
|
||||
// todo: refresh monolithic sharding implementation
|
||||
if(shard instanceof MonolithicShard)
|
||||
return seekMonolithic(shard);
|
||||
|
||||
if(shard.buffersReads()) {
|
||||
return shard.iterator();
|
||||
}
|
||||
|
|
@ -540,7 +544,7 @@ public class SAMDataSource {
|
|||
*/
|
||||
private void initializeReaderPositions(SAMReaders readers) {
|
||||
for(SAMReaderID id: getReaderIDs())
|
||||
readerPositions.put(id,readers.getReader(id).getFilePointerSpanningReads());
|
||||
readerPositions.put(id,new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads()));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -548,25 +552,26 @@ public class SAMDataSource {
|
|||
* @param readers Readers from which to load data.
|
||||
* @param shard The shard specifying the data limits.
|
||||
* @param enableVerification True to verify. For compatibility with old sharding strategy.
|
||||
* TODO: Collapse this flag when the two sharding systems are merged.
|
||||
* @return An iterator over the selected data.
|
||||
*/
|
||||
private StingSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) {
|
||||
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,readers.headers(),true);
|
||||
|
||||
// Set up merging to dynamically merge together multiple BAMs.
|
||||
MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true);
|
||||
MergingSamRecordIterator mergingIterator = readers.createMergingIterator();
|
||||
|
||||
for(SAMReaderID id: getReaderIDs()) {
|
||||
CloseableIterator<SAMRecord> iterator = null;
|
||||
if(!shard.isUnmapped() && shard.getFileSpans().get(id) == null)
|
||||
continue;
|
||||
iterator = shard.getFileSpans().get(id) != null ?
|
||||
readers.getReader(id).iterator(shard.getFileSpans().get(id)) :
|
||||
readers.getReader(id).queryUnmapped();
|
||||
if(readProperties.getReadBufferSize() != null)
|
||||
iterator = new BufferingReadIterator(iterator,readProperties.getReadBufferSize());
|
||||
if(shard.getGenomeLocs() != null)
|
||||
|
||||
// TODO: null used to be the signal for unmapped, but we've replaced that with a simple index query for the last bin.
|
||||
// TODO: Kill this check once we've proven that the design elements are gone.
|
||||
if(shard.getFileSpans().get(id) == null)
|
||||
throw new ReviewedStingException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported.");
|
||||
|
||||
if(threadAllocation.getNumIOThreads() > 0) {
|
||||
BlockInputStream inputStream = readers.getInputStream(id);
|
||||
inputStream.submitAccessPlan(new SAMReaderPosition(id,inputStream,(GATKBAMFileSpan)shard.getFileSpans().get(id)));
|
||||
}
|
||||
iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
|
||||
if(shard.getGenomeLocs().size() > 0)
|
||||
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
|
||||
mergingIterator.addIterator(readers.getReader(id),iterator);
|
||||
}
|
||||
|
|
@ -584,45 +589,6 @@ public class SAMDataSource {
|
|||
readProperties.defaultBaseQualities());
|
||||
}
|
||||
|
||||
/**
|
||||
* A stopgap measure to handle monolithic sharding
|
||||
* @param shard the (monolithic) shard.
|
||||
* @return An iterator over the monolithic shard.
|
||||
*/
|
||||
private StingSAMIterator seekMonolithic(Shard shard) {
|
||||
SAMReaders readers = resourcePool.getAvailableReaders();
|
||||
|
||||
// Set up merging and filtering to dynamically merge together multiple BAMs and filter out records not in the shard set.
|
||||
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,readers.headers(),true);
|
||||
MergingSamRecordIterator mergingIterator = new MergingSamRecordIterator(headerMerger,readers.values(),true);
|
||||
for(SAMReaderID id: getReaderIDs())
|
||||
mergingIterator.addIterator(readers.getReader(id),readers.getReader(id).iterator());
|
||||
|
||||
return applyDecoratingIterators(shard.getReadMetrics(),
|
||||
shard instanceof ReadShard,
|
||||
readProperties.useOriginalBaseQualities(),
|
||||
new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(mergingIterator)),
|
||||
readProperties.getDownsamplingMethod().toFraction,
|
||||
readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
||||
readProperties.getSupplementalFilters(),
|
||||
readProperties.getBAQCalculationMode(),
|
||||
readProperties.getBAQQualityMode(),
|
||||
readProperties.getRefReader(),
|
||||
readProperties.defaultBaseQualities());
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds this read to the given shard.
|
||||
* @param shard The shard to which to add the read.
|
||||
* @param id The id of the given reader.
|
||||
* @param read The read to add to the shard.
|
||||
*/
|
||||
private void addReadToBufferingShard(Shard shard,SAMReaderID id,SAMRecord read) {
|
||||
SAMFileSpan endChunk = read.getFileSource().getFilePointer().getContentsFollowing();
|
||||
shard.addRead(read);
|
||||
readerPositions.put(id,endChunk);
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter reads based on user-specified criteria.
|
||||
*
|
||||
|
|
@ -689,19 +655,6 @@ public class SAMDataSource {
|
|||
this.maxEntries = maxEntries;
|
||||
}
|
||||
|
||||
/**
|
||||
* Dangerous internal method; retrieves any set of readers, whether in iteration or not.
|
||||
* Used to handle non-exclusive, stateless operations, such as index queries.
|
||||
* @return Any collection of SAMReaders, whether in iteration or not.
|
||||
*/
|
||||
protected SAMReaders getReadersWithoutLocking() {
|
||||
synchronized(this) {
|
||||
if(allResources.size() == 0)
|
||||
createNewResource();
|
||||
}
|
||||
return allResources.get(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Choose a set of readers from the pool to use for this query. When complete,
|
||||
* @return
|
||||
|
|
@ -748,31 +701,154 @@ public class SAMDataSource {
|
|||
* A collection of readers derived from a reads metadata structure.
|
||||
*/
|
||||
private class SAMReaders implements Iterable<SAMFileReader> {
|
||||
/**
|
||||
* Cached representation of the merged header used to generate a merging iterator.
|
||||
*/
|
||||
private final SamFileHeaderMerger headerMerger;
|
||||
|
||||
/**
|
||||
* Internal storage for a map of id -> reader.
|
||||
*/
|
||||
private final Map<SAMReaderID,SAMFileReader> readers = new LinkedHashMap<SAMReaderID,SAMFileReader>();
|
||||
|
||||
/**
|
||||
* The inptu streams backing
|
||||
*/
|
||||
private final Map<SAMReaderID,BlockInputStream> inputStreams = new LinkedHashMap<SAMReaderID,BlockInputStream>();
|
||||
|
||||
/**
|
||||
* Derive a new set of readers from the Reads metadata.
|
||||
* @param readerIDs reads to load.
|
||||
* @param validationStringency validation stringency.
|
||||
*/
|
||||
public SAMReaders(Collection<SAMReaderID> readerIDs, SAMFileReader.ValidationStringency validationStringency) {
|
||||
for(SAMReaderID readerID: readerIDs) {
|
||||
SAMFileReader reader = new SAMFileReader(readerID.samFile);
|
||||
reader.setSAMRecordFactory(factory);
|
||||
reader.enableFileSource(true);
|
||||
reader.enableIndexMemoryMapping(false);
|
||||
if(!enableLowMemorySharding)
|
||||
reader.enableIndexCaching(true);
|
||||
reader.setValidationStringency(validationStringency);
|
||||
final int totalNumberOfFiles = readerIDs.size();
|
||||
int readerNumber = 1;
|
||||
final SimpleTimer timer = new SimpleTimer().start();
|
||||
|
||||
final SAMFileHeader header = reader.getFileHeader();
|
||||
logger.debug(String.format("Sort order is: " + header.getSortOrder()));
|
||||
if ( totalNumberOfFiles > 0 ) logger.info("Initializing SAMRecords " + (USE_PARALLEL_LOADING ? "in parallel" : "in serial"));
|
||||
if ( ! USE_PARALLEL_LOADING ) {
|
||||
final int tickSize = 50;
|
||||
int nExecutedTotal = 0;
|
||||
long lastTick = timer.currentTime();
|
||||
for(final SAMReaderID readerID: readerIDs) {
|
||||
final ReaderInitializer init = new ReaderInitializer(readerID).call();
|
||||
if (threadAllocation.getNumIOThreads() > 0) {
|
||||
inputStreams.put(init.readerID, init.blockInputStream); // get from initializer
|
||||
}
|
||||
|
||||
readers.put(readerID,reader);
|
||||
logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, readerID.samFile));
|
||||
readers.put(init.readerID,init.reader);
|
||||
if ( ++nExecutedTotal % tickSize == 0) {
|
||||
double tickInSec = (timer.currentTime() - lastTick) / 1000.0;
|
||||
printReaderPerformance(nExecutedTotal, tickSize, totalNumberOfFiles, timer, tickInSec);
|
||||
lastTick = timer.currentTime();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
final int N_THREADS = 8;
|
||||
|
||||
final ExecutorService executor = Executors.newFixedThreadPool(N_THREADS);
|
||||
final List<ReaderInitializer> inits = new ArrayList<ReaderInitializer>(totalNumberOfFiles);
|
||||
Queue<Future<ReaderInitializer>> futures = new LinkedList<Future<ReaderInitializer>>();
|
||||
for (final SAMReaderID readerID: readerIDs) {
|
||||
logger.debug("Enqueuing for initialization: " + readerID.samFile);
|
||||
final ReaderInitializer init = new ReaderInitializer(readerID);
|
||||
inits.add(init);
|
||||
futures.add(executor.submit(init));
|
||||
}
|
||||
|
||||
try {
|
||||
final int MAX_WAIT = 30 * 1000;
|
||||
final int MIN_WAIT = 1 * 1000;
|
||||
|
||||
while ( ! futures.isEmpty() ) {
|
||||
final int prevSize = futures.size();
|
||||
final double waitTime = prevSize * (0.5 / N_THREADS); // about 0.5 seconds to load each file
|
||||
final int waitTimeInMS = Math.min(MAX_WAIT, Math.max((int) (waitTime * 1000), MIN_WAIT));
|
||||
Thread.sleep(waitTimeInMS);
|
||||
|
||||
Queue<Future<ReaderInitializer>> pending = new LinkedList<Future<ReaderInitializer>>();
|
||||
for ( final Future<ReaderInitializer> initFuture : futures ) {
|
||||
if ( initFuture.isDone() ) {
|
||||
final ReaderInitializer init = initFuture.get();
|
||||
if (threadAllocation.getNumIOThreads() > 0) {
|
||||
inputStreams.put(init.readerID, init.blockInputStream); // get from initializer
|
||||
}
|
||||
logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, init.readerID));
|
||||
readers.put(init.readerID, init.reader);
|
||||
} else {
|
||||
pending.add(initFuture);
|
||||
}
|
||||
}
|
||||
|
||||
final int nExecutedTotal = totalNumberOfFiles - pending.size();
|
||||
final int nExecutedInTick = prevSize - pending.size();
|
||||
printReaderPerformance(nExecutedTotal, nExecutedInTick, totalNumberOfFiles, timer, waitTimeInMS / 1000.0);
|
||||
futures = pending;
|
||||
}
|
||||
} catch ( InterruptedException e ) {
|
||||
throw new ReviewedStingException("Interrupted SAMReader initialization", e);
|
||||
} catch ( ExecutionException e ) {
|
||||
throw new ReviewedStingException("Execution exception during SAMReader initialization", e);
|
||||
}
|
||||
|
||||
executor.shutdown();
|
||||
}
|
||||
|
||||
if ( totalNumberOfFiles > 0 ) logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime()));
|
||||
|
||||
Collection<SAMFileHeader> headers = new LinkedList<SAMFileHeader>();
|
||||
for(SAMFileReader reader: readers.values())
|
||||
headers.add(reader.getFileHeader());
|
||||
headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,headers,true);
|
||||
}
|
||||
|
||||
final private void printReaderPerformance(final int nExecutedTotal,
|
||||
final int nExecutedInTick,
|
||||
final int totalNumberOfFiles,
|
||||
final SimpleTimer timer,
|
||||
final double tickDurationInSec) {
|
||||
final int pendingSize = totalNumberOfFiles - nExecutedTotal;
|
||||
final double totalTimeInSeconds = timer.getElapsedTime();
|
||||
final double nTasksPerSecond = nExecutedTotal / (1.0*totalTimeInSeconds);
|
||||
final int nRemaining = pendingSize;
|
||||
final double estTimeToComplete = pendingSize / nTasksPerSecond;
|
||||
logger.info(String.format("Init %d BAMs in last %.2f s, %d of %d in %.2f s / %.2f m (%.2f tasks/s). %d remaining with est. completion in %.2f s / %.2f m",
|
||||
nExecutedInTick, tickDurationInSec,
|
||||
nExecutedTotal, totalNumberOfFiles, totalTimeInSeconds, totalTimeInSeconds / 60, nTasksPerSecond,
|
||||
nRemaining, estTimeToComplete, estTimeToComplete / 60));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the header derived from the merging of these BAM files.
|
||||
* @return the merged header.
|
||||
*/
|
||||
public SAMFileHeader getMergedHeader() {
|
||||
return headerMerger.getMergedHeader();
|
||||
}
|
||||
|
||||
/**
|
||||
* Do multiple read groups collide in this dataset?
|
||||
* @return True if multiple read groups collide; false otherwis.
|
||||
*/
|
||||
public boolean hasReadGroupCollisions() {
|
||||
return headerMerger.hasReadGroupCollisions();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the newly mapped read group ID for the given read group.
|
||||
* @param readerID Reader for which to discern the transformed ID.
|
||||
* @param originalReadGroupID Original read group.
|
||||
* @return Remapped read group.
|
||||
*/
|
||||
public String getReadGroupId(final SAMReaderID readerID, final String originalReadGroupID) {
|
||||
SAMFileHeader header = readers.get(readerID).getFileHeader();
|
||||
return headerMerger.getReadGroupId(header,originalReadGroupID);
|
||||
}
|
||||
|
||||
public MergingSamRecordIterator createMergingIterator() {
|
||||
return new MergingSamRecordIterator(headerMerger,readers.values(),true);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -786,6 +862,15 @@ public class SAMDataSource {
|
|||
return readers.get(id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the input stream backing a reader.
|
||||
* @param id The ID of the reader to retrieve.
|
||||
* @return the reader associated with the given id.
|
||||
*/
|
||||
public BlockInputStream getInputStream(final SAMReaderID id) {
|
||||
return inputStreams.get(id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Searches for the reader id of this reader.
|
||||
* @param reader Reader for which to search.
|
||||
|
|
@ -815,24 +900,29 @@ public class SAMDataSource {
|
|||
public boolean isEmpty() {
|
||||
return readers.isEmpty();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets all the actual readers out of this data structure.
|
||||
* @return A collection of the readers.
|
||||
*/
|
||||
public Collection<SAMFileReader> values() {
|
||||
return readers.values();
|
||||
class ReaderInitializer implements Callable<ReaderInitializer> {
|
||||
final SAMReaderID readerID;
|
||||
BlockInputStream blockInputStream = null;
|
||||
SAMFileReader reader;
|
||||
|
||||
public ReaderInitializer(final SAMReaderID readerID) {
|
||||
this.readerID = readerID;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets all the actual readers out of this data structure.
|
||||
* @return A collection of the readers.
|
||||
*/
|
||||
public Collection<SAMFileHeader> headers() {
|
||||
ArrayList<SAMFileHeader> headers = new ArrayList<SAMFileHeader>(readers.size());
|
||||
for (SAMFileReader reader : values())
|
||||
headers.add(reader.getFileHeader());
|
||||
return headers;
|
||||
public ReaderInitializer call() {
|
||||
final File indexFile = findIndexFile(readerID.samFile);
|
||||
if (threadAllocation.getNumIOThreads() > 0) {
|
||||
blockInputStream = new BlockInputStream(dispatcher,readerID,false);
|
||||
reader = new SAMFileReader(blockInputStream,indexFile,false);
|
||||
}
|
||||
else
|
||||
reader = new SAMFileReader(readerID.samFile,indexFile,false);
|
||||
reader.setSAMRecordFactory(factory);
|
||||
reader.enableFileSource(true);
|
||||
reader.setValidationStringency(validationStringency);
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -883,7 +973,7 @@ public class SAMDataSource {
|
|||
* Filters out reads that do not overlap the current GenomeLoc.
|
||||
* Note the custom implementation: BAM index querying returns all reads that could
|
||||
* possibly overlap the given region (and quite a few extras). In order not to drag
|
||||
* down performance, this implementation is highly customized to its task.
|
||||
* down performance, this implementation is highly customized to its task.
|
||||
*/
|
||||
private class IntervalOverlapFilteringIterator implements CloseableIterator<SAMRecord> {
|
||||
/**
|
||||
|
|
@ -903,7 +993,7 @@ public class SAMDataSource {
|
|||
|
||||
/**
|
||||
* Custom representation of interval bounds.
|
||||
* Makes it simpler to track current position.
|
||||
* Makes it simpler to track current position.
|
||||
*/
|
||||
private int[] intervalContigIndices;
|
||||
private int[] intervalStarts;
|
||||
|
|
@ -941,7 +1031,7 @@ public class SAMDataSource {
|
|||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
advance();
|
||||
}
|
||||
|
||||
|
|
@ -1018,12 +1108,12 @@ public class SAMDataSource {
|
|||
return
|
||||
// Read ends on a later contig, or...
|
||||
read.getReferenceIndex() > intervalContigIndices[currentBound] ||
|
||||
// Read ends of this contig...
|
||||
(read.getReferenceIndex() == intervalContigIndices[currentBound] &&
|
||||
// either after this location, or...
|
||||
(read.getAlignmentEnd() >= intervalStarts[currentBound] ||
|
||||
// read is unmapped but positioned and alignment start is on or after this start point.
|
||||
(read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound])));
|
||||
// Read ends of this contig...
|
||||
(read.getReferenceIndex() == intervalContigIndices[currentBound] &&
|
||||
// either after this location, or...
|
||||
(read.getAlignmentEnd() >= intervalStarts[currentBound] ||
|
||||
// read is unmapped but positioned and alignment start is on or after this start point.
|
||||
(read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound])));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -1035,8 +1125,8 @@ public class SAMDataSource {
|
|||
return
|
||||
// Read starts on a prior contig, or...
|
||||
read.getReferenceIndex() < intervalContigIndices[currentBound] ||
|
||||
// Read starts on this contig and the alignment start is registered before this end point.
|
||||
(read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]);
|
||||
// Read starts on this contig and the alignment start is registered before this end point.
|
||||
(read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1070,6 +1160,40 @@ public class SAMDataSource {
|
|||
|
||||
return indexFile;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a BAM schedule over all reads in the BAM file, both mapped and unmapped. The outgoing stream
|
||||
* will be as granular as possible given our current knowledge of the best ways to split up BAM files.
|
||||
* @return An iterator that spans all reads in all BAM files.
|
||||
*/
|
||||
public Iterable<Shard> createShardIteratorOverAllReads(final ShardBalancer shardBalancer) {
|
||||
shardBalancer.initialize(this,IntervalSharder.shardOverAllReads(this,genomeLocParser),genomeLocParser);
|
||||
return shardBalancer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any
|
||||
* read that has been assigned
|
||||
* @return
|
||||
*/
|
||||
public Iterable<Shard> createShardIteratorOverMappedReads(final SAMSequenceDictionary sequenceDictionary, final ShardBalancer shardBalancer) {
|
||||
shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,sequenceDictionary,genomeLocParser),genomeLocParser);
|
||||
return shardBalancer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a schedule for processing the initialized BAM file using the given interval list.
|
||||
* The returned schedule should be as granular as possible.
|
||||
* @param intervals The list of intervals for which to create the schedule.
|
||||
* @return A granular iterator over file pointers.
|
||||
*/
|
||||
public Iterable<Shard> createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) {
|
||||
if(intervals == null)
|
||||
throw new ReviewedStingException("Unable to create schedule from intervals; no intervals were provided.");
|
||||
shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals),genomeLocParser);
|
||||
return shardBalancer;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -67,6 +67,7 @@ public class SAMReaderID implements Comparable {
|
|||
* @param other The other identifier.
|
||||
* @return True iff the two readers point to the same file.
|
||||
*/
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if(other == null) return false;
|
||||
if(!(other instanceof SAMReaderID)) return false;
|
||||
|
|
@ -79,10 +80,20 @@ public class SAMReaderID implements Comparable {
|
|||
* Generate a hash code for this object.
|
||||
* @return A hash code, based solely on the file name at this point.
|
||||
*/
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return samFile.hashCode();
|
||||
}
|
||||
|
||||
/**
|
||||
* Best string representation for a SAM file reader is the path of the source file.
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
return getSamFilePath();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(Object other) {
|
||||
return this.samFile.getAbsolutePath().compareTo(((SAMReaderID)other).samFile.getAbsolutePath());
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,120 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mhanna
|
||||
* Date: 10/14/11
|
||||
* Time: 10:47 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
class SAMReaderPosition {
|
||||
private final SAMReaderID reader;
|
||||
private final BlockInputStream inputStream;
|
||||
|
||||
private final List<GATKChunk> positions;
|
||||
private PeekableIterator<GATKChunk> positionIterator;
|
||||
|
||||
/**
|
||||
* Stores the next block address to read, or -1 if no such block is available.
|
||||
*/
|
||||
private long nextBlockAddress;
|
||||
|
||||
|
||||
SAMReaderPosition(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) {
|
||||
this.reader = reader;
|
||||
this.inputStream = inputStream;
|
||||
|
||||
this.positions = fileSpan.getGATKChunks();
|
||||
initialize();
|
||||
}
|
||||
|
||||
public SAMReaderID getReader() {
|
||||
return reader;
|
||||
}
|
||||
|
||||
public BlockInputStream getInputStream() {
|
||||
return inputStream;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves the next block address to be read.
|
||||
* @return Next block address to be read.
|
||||
*/
|
||||
public long getBlockAddress() {
|
||||
return nextBlockAddress;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
initialize();
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the SAM reader position to its original state.
|
||||
*/
|
||||
private void initialize() {
|
||||
this.positionIterator = new PeekableIterator<GATKChunk>(positions.iterator());
|
||||
if(positionIterator.hasNext())
|
||||
nextBlockAddress = positionIterator.peek().getBlockStart();
|
||||
else
|
||||
nextBlockAddress = -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Advances the current position to the next block to read, given the current position in the file.
|
||||
* @param filePosition The current position within the file.
|
||||
*/
|
||||
void advancePosition(final long filePosition) {
|
||||
nextBlockAddress = filePosition;
|
||||
|
||||
// Check the current file position against the iterator; if the iterator is before the current file position,
|
||||
// draw the iterator forward. Remember when performing the check that coordinates are half-open!
|
||||
try {
|
||||
while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) {
|
||||
positionIterator.next();
|
||||
// Check to see if the iterator has more data available.
|
||||
if(positionIterator.hasNext() && filePosition < positionIterator.peek().getBlockStart()) {
|
||||
nextBlockAddress = positionIterator.peek().getBlockStart();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
catch(Exception ex) {
|
||||
throw new ReviewedStingException("");
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isFilePositionPastEndOfChunk(final long filePosition, final GATKChunk chunk) {
|
||||
return (filePosition > chunk.getBlockEnd() || (filePosition == chunk.getBlockEnd() && chunk.getBlockOffsetEnd() == 0));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Balances maximally granular file pointers into shards of reasonable size.
|
||||
*/
|
||||
public abstract class ShardBalancer implements Iterable<Shard> {
|
||||
protected SAMDataSource readsDataSource;
|
||||
protected PeekableIterator<FilePointer> filePointers;
|
||||
protected GenomeLocParser parser;
|
||||
|
||||
public void initialize(final SAMDataSource readsDataSource, final Iterator<FilePointer> filePointers, final GenomeLocParser parser) {
|
||||
this.readsDataSource = readsDataSource;
|
||||
this.filePointers = new PeekableIterator<FilePointer>(filePointers);
|
||||
this.parser = parser;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,31 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import java.util.Iterator;
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
* Date: Apr 10, 2009
|
||||
* Time: 4:55:37 PM
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* @version 1.0
|
||||
* @date Apr 10, 2009
|
||||
* <p/>
|
||||
* Interface ShardStrategy
|
||||
* <p/>
|
||||
* The base interface for the sharding strategy; before we had a base abstract
|
||||
* class, but not this will be an interface to accomidate read based sharding
|
||||
*/
|
||||
public interface ShardStrategy extends Iterator<Shard>, Iterable<Shard> {
|
||||
}
|
||||
|
|
@ -1,117 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
* Date: Apr 6, 2009
|
||||
* Time: 7:09:22 PM
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* @version 1.0
|
||||
* @date Apr 6, 2009
|
||||
* <p/>
|
||||
* Class ShardStrategyFactory
|
||||
* <p/>
|
||||
* The Shard Strategy Factory, use this class to create and transfer shard strategies
|
||||
* between different approaches.
|
||||
*/
|
||||
public class ShardStrategyFactory {
|
||||
public enum SHATTER_STRATEGY {
|
||||
MONOLITHIC, // Put all of the available data into one shard.
|
||||
LOCUS_EXPERIMENTAL,
|
||||
READS_EXPERIMENTAL
|
||||
}
|
||||
|
||||
/**
|
||||
* get a new shatter strategy
|
||||
*
|
||||
* @param readsDataSource File pointer to BAM.
|
||||
* @param referenceDataSource File pointer to reference.
|
||||
* @param strat what's our strategy - SHATTER_STRATEGY type
|
||||
* @param dic the seq dictionary
|
||||
* @param startingSize the starting size
|
||||
* @return a shard strategy capable of dividing input data into shards.
|
||||
*/
|
||||
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser) {
|
||||
return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, -1L);
|
||||
}
|
||||
|
||||
/**
|
||||
* get a new shatter strategy
|
||||
*
|
||||
* @param readsDataSource File pointer to BAM.
|
||||
* @param referenceDataSource File pointer to reference.
|
||||
* @param strat what's our strategy - SHATTER_STRATEGY type
|
||||
* @param dic the seq dictionary
|
||||
* @param startingSize the starting size
|
||||
* @return a shard strategy capable of dividing input data into shards.
|
||||
*/
|
||||
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, long limitByCount) {
|
||||
switch (strat) {
|
||||
case LOCUS_EXPERIMENTAL:
|
||||
return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,null);
|
||||
case READS_EXPERIMENTAL:
|
||||
return new ReadShardStrategy(genomeLocParser,readsDataSource,null);
|
||||
default:
|
||||
throw new ReviewedStingException("Strategy: " + strat + " isn't implemented for this type of shatter request");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* get a new shatter strategy
|
||||
*
|
||||
* @param readsDataSource File pointer to BAM.
|
||||
* @param referenceDataSource File pointer to reference.
|
||||
* @param strat what's our strategy - SHATTER_STRATEGY type
|
||||
* @param dic the seq dictionary
|
||||
* @param startingSize the starting size
|
||||
* @return a shard strategy capable of dividing input data into shards.
|
||||
*/
|
||||
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst) {
|
||||
return ShardStrategyFactory.shatter(readsDataSource, referenceDataSource, strat, dic, startingSize, genomeLocParser, lst, -1l);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* get a new shatter strategy
|
||||
*
|
||||
* @param readsDataSource The reads used to shatter this file.
|
||||
* @param referenceDataSource The reference used to shatter this file.
|
||||
* @param strat what's our strategy - SHATTER_STRATEGY type
|
||||
* @param dic the seq dictionary
|
||||
* @param startingSize the starting size
|
||||
* @return A strategy for shattering this data.
|
||||
*/
|
||||
static public ShardStrategy shatter(SAMDataSource readsDataSource, IndexedFastaSequenceFile referenceDataSource, SHATTER_STRATEGY strat, SAMSequenceDictionary dic, long startingSize, GenomeLocParser genomeLocParser, GenomeLocSortedSet lst, long limitDataCount) {
|
||||
switch (strat) {
|
||||
case LOCUS_EXPERIMENTAL:
|
||||
return new LocusShardStrategy(readsDataSource,referenceDataSource,genomeLocParser,lst);
|
||||
case READS_EXPERIMENTAL:
|
||||
return new ReadShardStrategy(genomeLocParser, readsDataSource,lst);
|
||||
default:
|
||||
throw new ReviewedStingException("Strategy: " + strat + " isn't implemented");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -30,10 +30,12 @@ import org.apache.log4j.Logger;
|
|||
import org.broadinstitute.sting.commandline.CommandLineProgram;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.BAMScheduler;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.LowMemoryIntervalSharder;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.IntervalSharder;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
|
||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
|
@ -92,7 +94,7 @@ public class FindLargeShards extends CommandLineProgram {
|
|||
|
||||
// initialize reads
|
||||
List<SAMReaderID> bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser);
|
||||
SAMDataSource dataSource = new SAMDataSource(bamReaders,genomeLocParser);
|
||||
SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser);
|
||||
|
||||
// intervals
|
||||
GenomeLocSortedSet intervalSortedSet = null;
|
||||
|
|
@ -106,7 +108,7 @@ public class FindLargeShards extends CommandLineProgram {
|
|||
|
||||
logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize"));
|
||||
|
||||
LowMemoryIntervalSharder sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet);
|
||||
IntervalSharder sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet);
|
||||
while(sharder.hasNext()) {
|
||||
FilePointer filePointer = sharder.next();
|
||||
|
||||
|
|
@ -135,7 +137,7 @@ public class FindLargeShards extends CommandLineProgram {
|
|||
logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize"));
|
||||
out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n");
|
||||
|
||||
sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet);
|
||||
sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet);
|
||||
while(sharder.hasNext()) {
|
||||
FilePointer filePointer = sharder.next();
|
||||
|
||||
|
|
|
|||
|
|
@ -29,6 +29,13 @@ import net.sf.picard.reference.FastaSequenceIndex;
|
|||
import net.sf.picard.reference.FastaSequenceIndexBuilder;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.sam.CreateSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.LocusShard;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
|
|
@ -36,13 +43,17 @@ import org.broadinstitute.sting.utils.file.FSLockWithShared;
|
|||
import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Loads reference data from fasta file
|
||||
* Looks for fai and dict files, and tries to create them if they don't exist
|
||||
*/
|
||||
public class ReferenceDataSource {
|
||||
private IndexedFastaSequenceFile index;
|
||||
private IndexedFastaSequenceFile reference;
|
||||
|
||||
/** our log, which we want to capture anything from this class */
|
||||
protected static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class);
|
||||
|
|
@ -173,7 +184,7 @@ public class ReferenceDataSource {
|
|||
logger.info("Treating existing index file as complete.");
|
||||
}
|
||||
|
||||
index = new CachingIndexedFastaSequenceFile(fastaFile);
|
||||
reference = new CachingIndexedFastaSequenceFile(fastaFile);
|
||||
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e);
|
||||
|
|
@ -192,6 +203,103 @@ public class ReferenceDataSource {
|
|||
* @return IndexedFastaSequenceFile that was created from file
|
||||
*/
|
||||
public IndexedFastaSequenceFile getReference() {
|
||||
return this.index;
|
||||
return this.reference;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an iterator for processing the entire reference.
|
||||
* @param readsDataSource the reads datasource to embed in the locus shard.
|
||||
* @param parser used to generate/regenerate intervals. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources.
|
||||
* @param maxShardSize The maximum shard size which can be used to create this list.
|
||||
* @return Creates a schedule for performing a traversal over the entire reference.
|
||||
*/
|
||||
public Iterable<Shard> createShardsOverEntireReference(final SAMDataSource readsDataSource, final GenomeLocParser parser, final int maxShardSize) {
|
||||
List<Shard> shards = new ArrayList<Shard>();
|
||||
for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) {
|
||||
for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) {
|
||||
final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength());
|
||||
shards.add(new LocusShard(parser,
|
||||
readsDataSource,
|
||||
Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)),
|
||||
null));
|
||||
}
|
||||
}
|
||||
return shards;
|
||||
}
|
||||
|
||||
|
||||
public Iterable<Shard> createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) {
|
||||
List<Shard> shards = new ArrayList<Shard>();
|
||||
|
||||
for(GenomeLoc interval: intervals) {
|
||||
while(interval.size() > maxShardSize) {
|
||||
shards.add(new LocusShard(intervals.getGenomeLocParser(),
|
||||
readsDataSource,
|
||||
Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)),
|
||||
null));
|
||||
interval = intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop());
|
||||
}
|
||||
shards.add(new LocusShard(intervals.getGenomeLocParser(),
|
||||
readsDataSource,
|
||||
Collections.singletonList(interval),
|
||||
null));
|
||||
}
|
||||
|
||||
return shards;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Creates an iterator for processing the entire reference.
|
||||
* @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources.
|
||||
* @param intervals the list of intervals to use when processing the reference.
|
||||
* @param targetShardSize the suggested - and maximum - shard size which can be used to create this list; we will merge intervals greedily so that we generate shards up to but not greater than the target size.
|
||||
* @return Creates a schedule for performing a traversal over the entire reference.
|
||||
*/
|
||||
/*
|
||||
public Iterable<Shard> createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int targetShardSize) {
|
||||
final List<Shard> shards = new ArrayList<Shard>();
|
||||
final GenomeLocParser parser = intervals.getGenomeLocParser();
|
||||
LinkedList<GenomeLoc> currentIntervals = new LinkedList<GenomeLoc>();
|
||||
|
||||
for(GenomeLoc interval: intervals) {
|
||||
// if the next interval is too big, we can safely shard currentInterval and then break down this one
|
||||
if (interval.size() > targetShardSize) {
|
||||
if (!currentIntervals.isEmpty())
|
||||
shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser));
|
||||
while(interval.size() > targetShardSize) {
|
||||
final GenomeLoc partialInterval = parser.createGenomeLoc(interval.getContig(), interval.getStart(), interval.getStart()+targetShardSize-1);
|
||||
shards.add(createShardFromInterval(Collections.singletonList(partialInterval), readsDataSource, parser));
|
||||
interval = parser.createGenomeLoc(interval.getContig(), interval.getStart() + targetShardSize, interval.getStop());
|
||||
}
|
||||
currentIntervals = new LinkedList<GenomeLoc>();
|
||||
currentIntervals.add(interval);
|
||||
}
|
||||
// otherwise, we need to check whether we can merge this interval with currentInterval (and either shard currentInterval or merge accordingly)
|
||||
else {
|
||||
if (currentIntervals.isEmpty()) {
|
||||
currentIntervals.add(interval);
|
||||
}
|
||||
else {
|
||||
if (currentIntervals.getLast().compareContigs(interval) != 0 || interval.getStop() - currentIntervals.getLast().getStart() + 1 > targetShardSize) {
|
||||
shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser));
|
||||
currentIntervals = new LinkedList<GenomeLoc>();
|
||||
}
|
||||
currentIntervals.add(interval);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!currentIntervals.isEmpty())
|
||||
shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser));
|
||||
return shards;
|
||||
}
|
||||
|
||||
private static Shard createShardFromInterval(final List<GenomeLoc> intervals, final SAMDataSource readsDataSource, final GenomeLocParser parser) {
|
||||
//logger.debug("Adding shard " + interval);
|
||||
return new LocusShard(parser,
|
||||
readsDataSource,
|
||||
intervals,
|
||||
null);
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.datasources.rmd;
|
|||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.sting.commandline.Tags;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
|
||||
|
|
@ -154,16 +153,6 @@ public class ReferenceOrderedDataSource {
|
|||
return (name.equals(fileDescriptor.getName()) && (type.getClass().isAssignableFrom(getType().getClass())));
|
||||
}
|
||||
|
||||
/**
|
||||
* Seek to the specified position and return an iterator through the data.
|
||||
* @param shard Shard that points to the selected position.
|
||||
* @return Iterator through the data.
|
||||
*/
|
||||
public LocationAwareSeekableRODIterator seek( Shard shard ) {
|
||||
DataStreamSegment dataStreamSegment = shard.getGenomeLocs().size() != 0 ? new MappedStreamSegment(shard.getGenomeLocs().get(0)) : new EntireStream();
|
||||
return iteratorPool.iterator(dataStreamSegment);
|
||||
}
|
||||
|
||||
/**
|
||||
* Seek to the specified position and return an iterator through the data.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@ import org.broad.tribble.TribbleException;
|
|||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||
import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker;
|
||||
|
|
@ -16,6 +15,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
|
|||
import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
|
|
@ -42,7 +42,6 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
|
|||
*/
|
||||
private ThreadLocalOutputTracker outputTracker = new ThreadLocalOutputTracker();
|
||||
|
||||
private final Queue<Shard> traverseTasks = new LinkedList<Shard>();
|
||||
private final Queue<TreeReduceTask> reduceTasks = new LinkedList<TreeReduceTask>();
|
||||
|
||||
/**
|
||||
|
|
@ -50,6 +49,11 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
|
|||
*/
|
||||
private Throwable error = null;
|
||||
|
||||
/**
|
||||
* Queue of incoming shards.
|
||||
*/
|
||||
private Iterator<Shard> traversalTasks;
|
||||
|
||||
/**
|
||||
* Keep a queue of shard traversals, and constantly monitor it to see what output
|
||||
* merge tasks remain.
|
||||
|
|
@ -57,9 +61,6 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
|
|||
*/
|
||||
private final Queue<ShardTraverser> outputMergeTasks = new LinkedList<ShardTraverser>();
|
||||
|
||||
/** How many total tasks were in the queue at the start of run. */
|
||||
private int totalTraversals = 0;
|
||||
|
||||
/** How many shard traversals have run to date? */
|
||||
private int totalCompletedTraversals = 0;
|
||||
|
||||
|
|
@ -88,18 +89,16 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
|
|||
this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
|
||||
}
|
||||
|
||||
public Object execute( Walker walker, ShardStrategy shardStrategy ) {
|
||||
public Object execute( Walker walker, Iterable<Shard> shardStrategy ) {
|
||||
// Fast fail for walkers not supporting TreeReducible interface.
|
||||
if (!( walker instanceof TreeReducible ))
|
||||
throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers");
|
||||
|
||||
this.traversalTasks = shardStrategy.iterator();
|
||||
|
||||
ReduceTree reduceTree = new ReduceTree(this);
|
||||
initializeWalker(walker);
|
||||
|
||||
for (Shard shard : shardStrategy)
|
||||
traverseTasks.add(shard);
|
||||
totalTraversals = traverseTasks.size();
|
||||
|
||||
while (isShardTraversePending() || isTreeReducePending()) {
|
||||
// Check for errors during execution.
|
||||
if(hasTraversalErrorOccurred())
|
||||
|
|
@ -191,7 +190,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
|
|||
* @return true if a shard traversal is waiting; false otherwise.
|
||||
*/
|
||||
protected boolean isShardTraversePending() {
|
||||
return traverseTasks.size() > 0;
|
||||
return traversalTasks.hasNext();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -284,10 +283,10 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
|
|||
* @param reduceTree Tree of reduces to which to add this shard traverse.
|
||||
*/
|
||||
protected void queueNextShardTraverse( Walker walker, ReduceTree reduceTree ) {
|
||||
if (traverseTasks.size() == 0)
|
||||
if (!traversalTasks.hasNext())
|
||||
throw new IllegalStateException("Cannot traverse; no pending traversals exist.");
|
||||
|
||||
Shard shard = traverseTasks.remove();
|
||||
Shard shard = traversalTasks.next();
|
||||
|
||||
// todo -- add ownership claim here
|
||||
|
||||
|
|
@ -399,16 +398,6 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
|
|||
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
public int getTotalNumberOfShards() {
|
||||
return totalTraversals;
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
public int getRemainingNumberOfShards() {
|
||||
return traverseTasks.size();
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
public int getNumberOfTasksInReduceQueue() {
|
||||
return reduceTasks.size();
|
||||
|
|
|
|||
|
|
@ -17,18 +17,6 @@ package org.broadinstitute.sting.gatk.executive;
|
|||
* microscheduler is behaving.
|
||||
*/
|
||||
public interface HierarchicalMicroSchedulerMBean extends MicroSchedulerMBean {
|
||||
/**
|
||||
* What is the total number of shards assigned to this microscheduler?
|
||||
* @return Total number of shards to process.
|
||||
*/
|
||||
public int getTotalNumberOfShards();
|
||||
|
||||
/**
|
||||
* How many shards are remaining for this microscheduler to process?
|
||||
* @return Remaining number of shards to process.
|
||||
*/
|
||||
public int getRemainingNumberOfShards();
|
||||
|
||||
/**
|
||||
* How many tree reduces are waiting in the tree reduce queue?
|
||||
* @return Total number of reduces waiting in the tree reduce queue?
|
||||
|
|
|
|||
|
|
@ -7,7 +7,6 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider
|
|||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
|
||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||
|
|
@ -44,7 +43,7 @@ public class LinearMicroScheduler extends MicroScheduler {
|
|||
* @param walker Computation to perform over dataset.
|
||||
* @param shardStrategy A strategy for sharding the data.
|
||||
*/
|
||||
public Object execute(Walker walker, ShardStrategy shardStrategy) {
|
||||
public Object execute(Walker walker, Iterable<Shard> shardStrategy) {
|
||||
walker.initialize();
|
||||
Accumulator accumulator = Accumulator.create(engine,walker);
|
||||
|
||||
|
|
|
|||
|
|
@ -30,11 +30,11 @@ import org.apache.log4j.Logger;
|
|||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.ShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||
import org.broadinstitute.sting.gatk.iterators.NullSAMIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||
import org.broadinstitute.sting.gatk.traversals.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -87,20 +87,20 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
|
|||
* @param reads the informations associated with the reads
|
||||
* @param reference the reference file
|
||||
* @param rods the rods to include in the traversal
|
||||
* @param nThreadsToUse Number of threads to utilize.
|
||||
* @param threadAllocation Number of threads to utilize.
|
||||
*
|
||||
* @return The best-fit microscheduler.
|
||||
*/
|
||||
public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, int nThreadsToUse) {
|
||||
if (walker instanceof TreeReducible && nThreadsToUse > 1) {
|
||||
public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection<ReferenceOrderedDataSource> rods, ThreadAllocation threadAllocation) {
|
||||
if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) {
|
||||
if(walker.isReduceByInterval())
|
||||
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
|
||||
if(walker instanceof ReadWalker)
|
||||
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
|
||||
logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",nThreadsToUse));
|
||||
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, nThreadsToUse);
|
||||
logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads()));
|
||||
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads());
|
||||
} else {
|
||||
if(nThreadsToUse > 1)
|
||||
if(threadAllocation.getNumCPUThreads() > 1)
|
||||
throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
|
||||
return new LinearMicroScheduler(engine, walker, reads, reference, rods);
|
||||
}
|
||||
|
|
@ -156,7 +156,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
|
|||
*
|
||||
* @return the return type of the walker
|
||||
*/
|
||||
public abstract Object execute(Walker walker, ShardStrategy shardStrategy);
|
||||
public abstract Object execute(Walker walker, Iterable<Shard> shardStrategy);
|
||||
|
||||
/**
|
||||
* Retrieves the object responsible for tracking and managing output.
|
||||
|
|
|
|||
|
|
@ -50,19 +50,20 @@ public class MalformedReadFilter extends ReadFilter {
|
|||
|
||||
public boolean filterOut(SAMRecord read) {
|
||||
// slowly changing the behavior to blow up first and filtering out if a parameter is explicitly provided
|
||||
if (!checkMismatchingBasesAndQuals(read)) {
|
||||
if (!filterMismatchingBaseAndQuals)
|
||||
throw new UserException.MalformedBAM(read, "BAM file has a read with mismatching number of bases and base qualities. Offender: " + read.getReadName() +" [" + read.getReadLength() + " bases] [" +read.getBaseQualities().length +"] quals");
|
||||
else
|
||||
return true;
|
||||
}
|
||||
|
||||
return !checkInvalidAlignmentStart(read) ||
|
||||
!checkInvalidAlignmentEnd(read) ||
|
||||
!checkAlignmentDisagreesWithHeader(this.header,read) ||
|
||||
!checkHasReadGroup(read) ||
|
||||
!checkMismatchingBasesAndQuals(read, filterMismatchingBaseAndQuals) ||
|
||||
!checkCigarDisagreesWithAlignment(read);
|
||||
}
|
||||
|
||||
private static boolean checkHasReadGroup(SAMRecord read) {
|
||||
if ( read.getReadGroup() == null )
|
||||
throw new UserException.ReadMissingReadGroup(read);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check for the case in which the alignment start is inconsistent with the read unmapped flag.
|
||||
* @param read The read to validate.
|
||||
|
|
@ -127,7 +128,15 @@ public class MalformedReadFilter extends ReadFilter {
|
|||
* @param read the read to validate
|
||||
* @return true if they have the same number. False otherwise.
|
||||
*/
|
||||
private static boolean checkMismatchingBasesAndQuals(SAMRecord read) {
|
||||
return (read.getReadLength() == read.getBaseQualities().length);
|
||||
private static boolean checkMismatchingBasesAndQuals(SAMRecord read, boolean filterMismatchingBaseAndQuals) {
|
||||
boolean result;
|
||||
if (read.getReadLength() == read.getBaseQualities().length)
|
||||
result = true;
|
||||
else if (filterMismatchingBaseAndQuals)
|
||||
result = false;
|
||||
else
|
||||
throw new UserException.MalformedBAM(read, String.format("BAM file has a read with mismatching number of bases and base qualities. Offender: %s [%d bases] [%d quals]", read.getReadName(), read.getReadLength(), read.getBaseQualities().length));
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,80 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Queue;
|
||||
|
||||
/**
|
||||
* Buffers access to a large stream of reads, replenishing the buffer only when the reads
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BufferingReadIterator implements CloseableIterator<SAMRecord> {
|
||||
private final CloseableIterator<SAMRecord> wrappedIterator;
|
||||
private final Queue<SAMRecord> buffer;
|
||||
private final int bufferSize;
|
||||
|
||||
public BufferingReadIterator(final CloseableIterator<SAMRecord> readIterator, final int bufferSize) {
|
||||
this.wrappedIterator = readIterator;
|
||||
this.buffer = new LinkedList<SAMRecord>();
|
||||
this.bufferSize = bufferSize;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
assureBufferFull();
|
||||
return !buffer.isEmpty();
|
||||
}
|
||||
|
||||
public SAMRecord next() {
|
||||
assureBufferFull();
|
||||
if(!hasNext()) throw new NoSuchElementException("No next element available");
|
||||
return buffer.remove();
|
||||
}
|
||||
|
||||
public void close() {
|
||||
wrappedIterator.close();
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new ReviewedStingException("Unable to remove from a BufferingReadIterator");
|
||||
}
|
||||
|
||||
/**
|
||||
* If the buffer is empty but there are more elements in the iterator,
|
||||
*/
|
||||
private void assureBufferFull() {
|
||||
if(!buffer.isEmpty())
|
||||
return;
|
||||
while(buffer.size() < bufferSize && wrappedIterator.hasNext())
|
||||
buffer.add(wrappedIterator.next());
|
||||
}
|
||||
}
|
||||
|
|
@ -39,7 +39,6 @@ import org.broadinstitute.sting.utils.GenomeLoc;
|
|||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.ReservoirDownsampler;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
|
|
@ -432,7 +431,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
while(iterator.hasNext()) {
|
||||
SAMRecordState state = iterator.next();
|
||||
if ( state.getCurrentCigarOperator() != CigarOperator.D && state.getCurrentCigarOperator() != CigarOperator.N ) {
|
||||
if ( filterBaseInRead(state.getRead(), location.getStart()) ) {
|
||||
if ( filterBaseInRead((GATKSAMRecord) state.getRead(), location.getStart()) ) {
|
||||
//discarded_bases++;
|
||||
//printStatus("Adaptor bases", discarded_adaptor_bases);
|
||||
continue;
|
||||
|
|
@ -481,8 +480,8 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
* @param pos
|
||||
* @return
|
||||
*/
|
||||
private static boolean filterBaseInRead(SAMRecord rec, long pos) {
|
||||
return ReadUtils.readPairBaseOverlapType(rec, pos) == ReadUtils.OverlapType.IN_ADAPTOR;
|
||||
private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) {
|
||||
return ReadUtils.isBaseInsideAdaptor(rec, pos);
|
||||
}
|
||||
|
||||
private void updateReadStates() {
|
||||
|
|
|
|||
|
|
@ -6,10 +6,9 @@ import org.broad.tribble.annotation.Strand;
|
|||
import org.broad.tribble.dbsnp.OldDbSNPFeature;
|
||||
import org.broad.tribble.gelitext.GeliTextFeature;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.codecs.hapmap.RawHapMapFeature;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -187,30 +186,23 @@ public class VariantContextAdaptors {
|
|||
}
|
||||
|
||||
Map<String, Object> attributes = new HashMap<String, Object>();
|
||||
attributes.put(VariantContext.ID_KEY, dbsnp.getRsID());
|
||||
|
||||
int index = dbsnp.getStart() - ref.getWindow().getStart() - 1;
|
||||
if ( index < 0 )
|
||||
return null; // we weren't given enough reference context to create the VariantContext
|
||||
Byte refBaseForIndel = new Byte(ref.getBases()[index]);
|
||||
|
||||
Map<String, Genotype> genotypes = null;
|
||||
VariantContext vc = new VariantContext(name, dbsnp.getChr(), dbsnp.getStart() - (sawNullAllele ? 1 : 0), dbsnp.getEnd() - (refAllele.isNull() ? 1 : 0), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attributes, refBaseForIndel);
|
||||
return vc;
|
||||
final VariantContextBuilder builder = new VariantContextBuilder();
|
||||
builder.source(name).id(dbsnp.getRsID());
|
||||
builder.loc(dbsnp.getChr(), dbsnp.getStart() - (sawNullAllele ? 1 : 0), dbsnp.getEnd() - (refAllele.isNull() ? 1 : 0));
|
||||
builder.alleles(alleles);
|
||||
builder.referenceBaseForIndel(refBaseForIndel);
|
||||
return builder.make();
|
||||
} else
|
||||
return null; // can't handle anything else
|
||||
}
|
||||
}
|
||||
|
||||
public static VCFHeader createVCFHeader(Set<VCFHeaderLine> hInfo, VariantContext vc) {
|
||||
HashSet<String> names = new LinkedHashSet<String>();
|
||||
for ( Genotype g : vc.getGenotypesSortedByName() ) {
|
||||
names.add(g.getSampleName());
|
||||
}
|
||||
|
||||
return new VCFHeader(hInfo == null ? new HashSet<VCFHeaderLine>() : hInfo, names);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// GELI to VariantContext
|
||||
|
|
@ -257,20 +249,15 @@ public class VariantContextAdaptors {
|
|||
else genotypeAlleles.add(refAllele);
|
||||
}
|
||||
|
||||
Map<String, String> attributes = new HashMap<String, String>();
|
||||
Map<String, Object> attributes = new HashMap<String, Object>();
|
||||
Collection<Genotype> genotypes = new ArrayList<Genotype>();
|
||||
MutableGenotype call = new MutableGenotype(name, genotypeAlleles);
|
||||
|
||||
// set the likelihoods, depth, and RMS mapping quality values
|
||||
//call.putAttribute(CalledGenotype.POSTERIORS_ATTRIBUTE_KEY,geli.getLikelihoods());
|
||||
//call.putAttribute(GeliTextWriter.MAXIMUM_MAPPING_QUALITY_ATTRIBUTE_KEY,geli.getMaximumMappingQual());
|
||||
//call.putAttribute(GeliTextWriter.READ_COUNT_ATTRIBUTE_KEY,geli.getDepthOfCoverage());
|
||||
Genotype call = new Genotype(name, genotypeAlleles);
|
||||
|
||||
// add the call to the genotype list, and then use this list to create a VariantContext
|
||||
genotypes.add(call);
|
||||
alleles.add(refAllele);
|
||||
VariantContext vc = VariantContextUtils.toVC(name, ref.getGenomeLocParser().createGenomeLoc(geli.getChr(),geli.getStart()), alleles, genotypes, geli.getLODBestToReference(), null, attributes);
|
||||
return vc;
|
||||
GenomeLoc loc = ref.getGenomeLocParser().createGenomeLoc(geli.getChr(),geli.getStart());
|
||||
return new VariantContextBuilder(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles).genotypes(genotypes).log10PError(-1 * geli.getLODBestToReference()).attributes(attributes).make();
|
||||
} else
|
||||
return null; // can't handle anything else
|
||||
}
|
||||
|
|
@ -329,7 +316,7 @@ public class VariantContextAdaptors {
|
|||
String[] samples = hapmap.getSampleIDs();
|
||||
String[] genotypeStrings = hapmap.getGenotypes();
|
||||
|
||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(samples.length);
|
||||
GenotypesContext genotypes = GenotypesContext.create(samples.length);
|
||||
for ( int i = 0; i < samples.length; i++ ) {
|
||||
// ignore bad genotypes
|
||||
if ( genotypeStrings[i].contains("N") )
|
||||
|
|
@ -358,16 +345,13 @@ public class VariantContextAdaptors {
|
|||
}
|
||||
|
||||
Genotype g = new Genotype(samples[i], myAlleles);
|
||||
genotypes.put(samples[i], g);
|
||||
genotypes.add(g);
|
||||
}
|
||||
|
||||
HashMap<String, Object> attrs = new HashMap<String, Object>(1);
|
||||
attrs.put(VariantContext.ID_KEY, hapmap.getName());
|
||||
|
||||
long end = hapmap.getEnd();
|
||||
if ( deletionLength > 0 )
|
||||
end += deletionLength;
|
||||
VariantContext vc = new VariantContext(name, hapmap.getChr(), hapmap.getStart(), end, alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attrs, refBaseForIndel);
|
||||
VariantContext vc = new VariantContextBuilder(name, hapmap.getChr(), hapmap.getStart(), end, alleles).id(hapmap.getName()).genotypes(genotypes).referenceBaseForIndel(refBaseForIndel).make();
|
||||
return vc;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,16 +30,12 @@ import org.broad.tribble.Feature;
|
|||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.NameAwareCodec;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
|
||||
import org.broadinstitute.sting.gatk.refdata.SelfScopingFeatureCodec;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.help.GATKDocUtils;
|
||||
import org.broadinstitute.sting.utils.help.HelpUtils;
|
||||
|
||||
import javax.mail.Header;
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -159,10 +155,8 @@ public class FeatureManager {
|
|||
public FeatureDescriptor getByFiletype(File file) {
|
||||
List<FeatureDescriptor> canParse = new ArrayList<FeatureDescriptor>();
|
||||
for ( FeatureDescriptor descriptor : featureDescriptors )
|
||||
if ( descriptor.getCodec() instanceof SelfScopingFeatureCodec ) {
|
||||
if ( ((SelfScopingFeatureCodec) descriptor.getCodec()).canDecode(file) ) {
|
||||
canParse.add(descriptor);
|
||||
}
|
||||
if ( descriptor.getCodec().canDecode(file.getPath()) ) {
|
||||
canParse.add(descriptor);
|
||||
}
|
||||
|
||||
if ( canParse.size() == 0 )
|
||||
|
|
|
|||
|
|
@ -1,28 +1,34 @@
|
|||
package org.broadinstitute.sting.gatk.report;
|
||||
|
||||
import java.util.TreeMap;
|
||||
import org.apache.commons.lang.math.NumberUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Holds values for a column in a GATK report table
|
||||
*/
|
||||
public class GATKReportColumn extends TreeMap<Object, Object> {
|
||||
private String columnName;
|
||||
private Object defaultValue;
|
||||
private boolean display;
|
||||
final private String columnName;
|
||||
final private Object defaultValue;
|
||||
final private String format;
|
||||
final private boolean display;
|
||||
|
||||
/**
|
||||
* Construct the column object, specifying the column name, default value, and whether or not the column should be displayed
|
||||
*
|
||||
* @param columnName the name of the column
|
||||
* @param defaultValue the default value of the column
|
||||
* @param display if true, the column will be displayed in the final output
|
||||
* @param display if true, the column will be displayed in the final output
|
||||
* @param format format string
|
||||
*/
|
||||
public GATKReportColumn(String columnName, Object defaultValue, boolean display) {
|
||||
public GATKReportColumn(String columnName, Object defaultValue, boolean display, String format) {
|
||||
this.columnName = columnName;
|
||||
this.defaultValue = defaultValue;
|
||||
this.display = display;
|
||||
this.format = format == null ? null : (format.equals("") ? null : format);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Initialize an element in the column with a default value
|
||||
*
|
||||
|
|
@ -55,7 +61,7 @@ public class GATKReportColumn extends TreeMap<Object, Object> {
|
|||
* @return the string value at the specified position in the column, or the default value if the element is not set
|
||||
*/
|
||||
public String getStringValue(Object primaryKey) {
|
||||
return toString(getWithoutSideEffects(primaryKey));
|
||||
return formatValue(getWithoutSideEffects(primaryKey));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -70,22 +76,47 @@ public class GATKReportColumn extends TreeMap<Object, Object> {
|
|||
|
||||
/**
|
||||
* Get the display width for this column. This allows the entire column to be displayed with the appropriate, fixed width.
|
||||
* @return the width of this column
|
||||
* @return the format string for this column
|
||||
*/
|
||||
public int getColumnWidth() {
|
||||
public GATKReportColumnFormat getColumnFormat() {
|
||||
int maxWidth = columnName.length();
|
||||
GATKReportColumnFormat.Alignment alignment = GATKReportColumnFormat.Alignment.RIGHT;
|
||||
|
||||
for (Object obj : this.values()) {
|
||||
if (obj != null) {
|
||||
int width = toString(obj).length();
|
||||
String formatted = formatValue(obj);
|
||||
|
||||
int width = formatted.length();
|
||||
if (width > maxWidth) {
|
||||
maxWidth = width;
|
||||
}
|
||||
|
||||
if (alignment == GATKReportColumnFormat.Alignment.RIGHT) {
|
||||
if (!isRightAlign(formatted)) {
|
||||
alignment = GATKReportColumnFormat.Alignment.LEFT;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return maxWidth;
|
||||
return new GATKReportColumnFormat(maxWidth, alignment);
|
||||
}
|
||||
|
||||
private static final Collection<String> RIGHT_ALIGN_STRINGS = Arrays.asList(
|
||||
"null",
|
||||
"NA",
|
||||
String.valueOf(Double.POSITIVE_INFINITY),
|
||||
String.valueOf(Double.NEGATIVE_INFINITY),
|
||||
String.valueOf(Double.NaN));
|
||||
|
||||
/**
|
||||
* Check if the value can be right aligned. Does not trim the values before checking if numeric since it assumes
|
||||
* the spaces mean that the value is already padded.
|
||||
* @param value to check
|
||||
* @return true if the value is a right alignable
|
||||
*/
|
||||
protected static boolean isRightAlign(String value) {
|
||||
return value == null || RIGHT_ALIGN_STRINGS.contains(value) || NumberUtils.isNumber(value);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -93,10 +124,12 @@ public class GATKReportColumn extends TreeMap<Object, Object> {
|
|||
* @param obj The object to convert to a string
|
||||
* @return The string representation of the column
|
||||
*/
|
||||
private static String toString(Object obj) {
|
||||
private String formatValue(Object obj) {
|
||||
String value;
|
||||
if (obj == null) {
|
||||
value = "null";
|
||||
} else if ( format != null ) {
|
||||
value = String.format(format, obj);
|
||||
} else if (obj instanceof Float) {
|
||||
value = String.format("%.8f", (Float) obj);
|
||||
} else if (obj instanceof Double) {
|
||||
|
|
|
|||
|
|
@ -22,27 +22,41 @@
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import java.io.File;
|
||||
package org.broadinstitute.sting.gatk.report;
|
||||
|
||||
/**
|
||||
* An interface marking that a given Tribble codec can look at the file and determine whether the
|
||||
* codec specifically parsing the contents of the file.
|
||||
* Column width and left/right alignment.
|
||||
*/
|
||||
public interface SelfScopingFeatureCodec {
|
||||
/**
|
||||
* This function returns true iff the File potentialInput can be parsed by this
|
||||
* codec.
|
||||
*
|
||||
* The GATK assumes that there's never a situation where two SelfScopingFeaetureCodecs
|
||||
* return true for the same file. If this occurs the GATK splits out an error.
|
||||
*
|
||||
* Note this function must never throw an error. All errors should be trapped
|
||||
* and false returned.
|
||||
*
|
||||
* @param potentialInput the file to test for parsiability with this codec
|
||||
* @return true if potentialInput can be parsed, false otherwise
|
||||
*/
|
||||
public boolean canDecode(final File potentialInput);
|
||||
public class GATKReportColumnFormat {
|
||||
public static enum Alignment { LEFT, RIGHT }
|
||||
public int width;
|
||||
public Alignment alignment;
|
||||
|
||||
public GATKReportColumnFormat(int width, Alignment alignment) {
|
||||
this.width = width;
|
||||
this.alignment = alignment;
|
||||
}
|
||||
|
||||
public int getWidth() {
|
||||
return width;
|
||||
}
|
||||
|
||||
public Alignment getAlignment() {
|
||||
return alignment;
|
||||
}
|
||||
|
||||
public String getNameFormat() {
|
||||
return "%-" + width + "s";
|
||||
}
|
||||
|
||||
public String getValueFormat() {
|
||||
switch (alignment) {
|
||||
case LEFT:
|
||||
return "%-" + width + "s";
|
||||
case RIGHT:
|
||||
return "%" + width + "s";
|
||||
default:
|
||||
throw new UnsupportedOperationException("Unknown alignment: " + alignment);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -250,13 +250,12 @@ public class GATKReportTable {
|
|||
* @param defaultValue the default value for the column
|
||||
*/
|
||||
public void addColumn(String columnName, Object defaultValue) {
|
||||
if (!isValidName(columnName)) {
|
||||
throw new ReviewedStingException("Attempted to set a GATKReportTable column name of '" + columnName + "'. GATKReportTable column names must be purely alphanumeric - no spaces or special characters are allowed.");
|
||||
}
|
||||
|
||||
addColumn(columnName, defaultValue, true);
|
||||
addColumn(columnName, defaultValue, null);
|
||||
}
|
||||
|
||||
public void addColumn(String columnName, Object defaultValue, String format) {
|
||||
addColumn(columnName, defaultValue, true, format);
|
||||
}
|
||||
/**
|
||||
* Add a column to the report, specify the default column value, and specify whether the column should be displayed in the final output (useful when intermediate columns are necessary for later calculations, but are not required to be in the output file.
|
||||
*
|
||||
|
|
@ -265,7 +264,14 @@ public class GATKReportTable {
|
|||
* @param display if true - the column will be displayed; if false - the column will be hidden
|
||||
*/
|
||||
public void addColumn(String columnName, Object defaultValue, boolean display) {
|
||||
columns.put(columnName, new GATKReportColumn(columnName, defaultValue, display));
|
||||
addColumn(columnName, defaultValue, display, null);
|
||||
}
|
||||
|
||||
public void addColumn(String columnName, Object defaultValue, boolean display, String format) {
|
||||
if (!isValidName(columnName)) {
|
||||
throw new ReviewedStingException("Attempted to set a GATKReportTable column name of '" + columnName + "'. GATKReportTable column names must be purely alphanumeric - no spaces or special characters are allowed.");
|
||||
}
|
||||
columns.put(columnName, new GATKReportColumn(columnName, defaultValue, display, format));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -602,12 +608,9 @@ public class GATKReportTable {
|
|||
*/
|
||||
public void write(PrintStream out) {
|
||||
// Get the column widths for everything
|
||||
HashMap<String, String> columnWidths = new HashMap<String, String>();
|
||||
HashMap<String, GATKReportColumnFormat> columnFormats = new HashMap<String, GATKReportColumnFormat>();
|
||||
for (String columnName : columns.keySet()) {
|
||||
int width = columns.get(columnName).getColumnWidth();
|
||||
String format = "%-" + String.valueOf(width) + "s";
|
||||
|
||||
columnWidths.put(columnName, format);
|
||||
columnFormats.put(columnName, columns.get(columnName).getColumnFormat());
|
||||
}
|
||||
String primaryKeyFormat = "%-" + getPrimaryKeyColumnWidth() + "s";
|
||||
|
||||
|
|
@ -624,7 +627,7 @@ public class GATKReportTable {
|
|||
for (String columnName : columns.keySet()) {
|
||||
if (columns.get(columnName).isDisplayable()) {
|
||||
if (needsPadding) { out.printf(" "); }
|
||||
out.printf(columnWidths.get(columnName), columnName);
|
||||
out.printf(columnFormats.get(columnName).getNameFormat(), columnName);
|
||||
|
||||
needsPadding = true;
|
||||
}
|
||||
|
|
@ -644,7 +647,7 @@ public class GATKReportTable {
|
|||
if (columns.get(columnName).isDisplayable()) {
|
||||
if (needsPadding) { out.printf(" "); }
|
||||
String value = columns.get(columnName).getStringValue(primaryKey);
|
||||
out.printf(columnWidths.get(columnName), value);
|
||||
out.printf(columnFormats.get(columnName).getValueFormat(), value);
|
||||
|
||||
needsPadding = true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,93 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.resourcemanagement;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
/**
|
||||
* Models how threads are distributed between various components of the GATK.
|
||||
*/
|
||||
public class ThreadAllocation {
|
||||
/**
|
||||
* The number of CPU threads to be used by the GATK.
|
||||
*/
|
||||
private final int numCPUThreads;
|
||||
|
||||
/**
|
||||
* Number of threads to devote exclusively to IO. Default is 0.
|
||||
*/
|
||||
private final int numIOThreads;
|
||||
|
||||
public int getNumCPUThreads() {
|
||||
return numCPUThreads;
|
||||
}
|
||||
|
||||
public int getNumIOThreads() {
|
||||
return numIOThreads;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct the default thread allocation.
|
||||
*/
|
||||
public ThreadAllocation() {
|
||||
this(1,null,null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set up the thread allocation. Default allocation is 1 CPU thread, 0 IO threads.
|
||||
* (0 IO threads means that no threads are devoted exclusively to IO; they're inline on the CPU thread).
|
||||
* @param totalThreads Complete number of threads to allocate.
|
||||
* @param numCPUThreads Total number of threads allocated to the traversal.
|
||||
* @param numIOThreads Total number of threads allocated exclusively to IO.
|
||||
*/
|
||||
public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads) {
|
||||
// If no allocation information is present, allocate all threads to CPU
|
||||
if(numCPUThreads == null && numIOThreads == null) {
|
||||
this.numCPUThreads = totalThreads;
|
||||
this.numIOThreads = 0;
|
||||
}
|
||||
// If only CPU threads are specified, allocate remainder to IO (minimum 0 dedicated IO threads).
|
||||
else if(numIOThreads == null) {
|
||||
if(numCPUThreads > totalThreads)
|
||||
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) is higher than the total threads",totalThreads,numCPUThreads));
|
||||
this.numCPUThreads = numCPUThreads;
|
||||
this.numIOThreads = totalThreads - numCPUThreads;
|
||||
}
|
||||
// If only IO threads are specified, allocate remainder to CPU (minimum 1 dedicated CPU thread).
|
||||
else if(numCPUThreads == null) {
|
||||
if(numIOThreads > totalThreads)
|
||||
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of io threads (%d) is higher than the total threads",totalThreads,numIOThreads));
|
||||
this.numCPUThreads = Math.max(1,totalThreads-numIOThreads);
|
||||
this.numIOThreads = numIOThreads;
|
||||
}
|
||||
else {
|
||||
if(numCPUThreads + numIOThreads != totalThreads)
|
||||
throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) + the count of io threads (%d) does not match",totalThreads,numCPUThreads,numIOThreads));
|
||||
this.numCPUThreads = numCPUThreads;
|
||||
this.numIOThreads = numIOThreads;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.samples;
|
|||
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
|
@ -110,6 +111,17 @@ public class Sample implements Comparable<Sample> { // implements java.io.Serial
|
|||
return infoDB.getSample(paternalID);
|
||||
}
|
||||
|
||||
public ArrayList<Sample> getParents(){
|
||||
ArrayList<Sample> parents = new ArrayList<Sample>(2);
|
||||
Sample parent = getMother();
|
||||
if(parent != null)
|
||||
parents.add(parent);
|
||||
parent = getFather();
|
||||
if(parent != null)
|
||||
parents.add(parent);
|
||||
return parents;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get gender of the sample
|
||||
* @return property of key "gender" - must be of type Gender
|
||||
|
|
|
|||
|
|
@ -142,20 +142,75 @@ public class SampleDB {
|
|||
* @return
|
||||
*/
|
||||
public final Map<String, Set<Sample>> getFamilies() {
|
||||
return getFamilies(null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a map from family ID -> set of family members for all samples in sampleIds with
|
||||
* non-null family ids
|
||||
*
|
||||
* @param sampleIds - all samples to include. If null is passed then all samples are returned.
|
||||
* @return
|
||||
*/
|
||||
public final Map<String, Set<Sample>> getFamilies(Collection<String> sampleIds) {
|
||||
final Map<String, Set<Sample>> families = new TreeMap<String, Set<Sample>>();
|
||||
|
||||
for ( final Sample sample : samples.values() ) {
|
||||
final String famID = sample.getFamilyID();
|
||||
if ( famID != null ) {
|
||||
if ( ! families.containsKey(famID) )
|
||||
families.put(famID, new TreeSet<Sample>());
|
||||
families.get(famID).add(sample);
|
||||
if(sampleIds == null || sampleIds.contains(sample.getID())){
|
||||
final String famID = sample.getFamilyID();
|
||||
if ( famID != null ) {
|
||||
if ( ! families.containsKey(famID) )
|
||||
families.put(famID, new TreeSet<Sample>());
|
||||
families.get(famID).add(sample);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return families;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the set of all children that have both of their parents.
|
||||
* Note that if a family is composed of more than 1 child, each child is
|
||||
* returned.
|
||||
* @return - all the children that have both of their parents
|
||||
*/
|
||||
public final Set<Sample> getChildrenWithParents(){
|
||||
return getChildrenWithParents(false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the set of all children that have both of their parents.
|
||||
* Note that if triosOnly = false, a family is composed of more than 1 child, each child is
|
||||
* returned.
|
||||
*
|
||||
* This method can be used wherever trios are needed
|
||||
*
|
||||
* @param triosOnly - if set to true, only strict trios are returned
|
||||
* @return - all the children that have both of their parents
|
||||
*/
|
||||
public final Set<Sample> getChildrenWithParents(boolean triosOnly) {
|
||||
|
||||
Map<String, Set<Sample>> families = getFamilies();
|
||||
final Set<Sample> childrenWithParents = new HashSet<Sample>();
|
||||
Iterator<Sample> sampleIterator;
|
||||
|
||||
for ( Set<Sample> familyMembers: families.values() ) {
|
||||
if(triosOnly && familyMembers.size() != 3)
|
||||
continue;
|
||||
|
||||
sampleIterator = familyMembers.iterator();
|
||||
Sample sample;
|
||||
while(sampleIterator.hasNext()){
|
||||
sample = sampleIterator.next();
|
||||
if(sample.getParents().size() == 2 && familyMembers.containsAll(sample.getParents()))
|
||||
childrenWithParents.add(sample);
|
||||
}
|
||||
|
||||
}
|
||||
return childrenWithParents;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return all samples with a given family ID
|
||||
* @param familyId
|
||||
|
|
|
|||
|
|
@ -121,7 +121,7 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
|
|||
private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000;
|
||||
private int printProgressCheckCounter = 0;
|
||||
private long lastProgressPrintTime = -1; // When was the last time we printed progress log?
|
||||
private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 120 * 1000; // in milliseconds
|
||||
private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds
|
||||
private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds
|
||||
private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0;
|
||||
private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0;
|
||||
|
|
|
|||
|
|
@ -38,12 +38,11 @@ import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
|||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.clipreads.ClippingOp;
|
||||
import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation;
|
||||
import org.broadinstitute.sting.utils.clipreads.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.clipping.ClippingOp;
|
||||
import org.broadinstitute.sting.utils.clipping.ClippingRepresentation;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
|
|
@ -299,9 +298,8 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipperWithD
|
|||
*/
|
||||
public ReadClipperWithData map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
if ( onlyDoRead == null || read.getReadName().equals(onlyDoRead) ) {
|
||||
if ( clippingRepresentation == ClippingRepresentation.HARDCLIP_BASES ) {
|
||||
read = ReadUtils.replaceSoftClipsWithMatches(read);
|
||||
}
|
||||
if ( clippingRepresentation == ClippingRepresentation.HARDCLIP_BASES )
|
||||
read = ReadClipper.revertSoftClippedBases(read);
|
||||
ReadClipperWithData clipper = new ReadClipperWithData(read, sequencesToClip);
|
||||
|
||||
//
|
||||
|
|
|
|||
|
|
@ -81,7 +81,7 @@ public class SplitSamFileWalker extends ReadWalker<SAMRecord, Map<String, SAMFil
|
|||
for ( SAMReadGroupRecord readGroup : this.getToolkit().getSAMFileHeader().getReadGroups()) {
|
||||
final String sample = readGroup.getSample();
|
||||
if ( ! headers.containsKey(sample) ) {
|
||||
SAMFileHeader header = ReadUtils.copySAMFileHeader(this.getToolkit().getSAMFileHeader());
|
||||
SAMFileHeader header = duplicateSAMFileHeader(this.getToolkit().getSAMFileHeader());
|
||||
logger.debug(String.format("Creating BAM header for sample %s", sample));
|
||||
ArrayList<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>();
|
||||
header.setReadGroups(readGroups);
|
||||
|
|
@ -121,4 +121,20 @@ public class SplitSamFileWalker extends ReadWalker<SAMRecord, Map<String, SAMFil
|
|||
|
||||
return outputs;
|
||||
}
|
||||
|
||||
public static SAMFileHeader duplicateSAMFileHeader(SAMFileHeader toCopy) {
|
||||
SAMFileHeader copy = new SAMFileHeader();
|
||||
|
||||
copy.setSortOrder(toCopy.getSortOrder());
|
||||
copy.setGroupOrder(toCopy.getGroupOrder());
|
||||
copy.setProgramRecords(toCopy.getProgramRecords());
|
||||
copy.setReadGroups(toCopy.getReadGroups());
|
||||
copy.setSequenceDictionary(toCopy.getSequenceDictionary());
|
||||
|
||||
for (Map.Entry<String, String> e : toCopy.getAttributes())
|
||||
copy.setAttribute(e.getKey(), e.getValue());
|
||||
|
||||
return copy;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -88,7 +88,7 @@ public abstract class Walker<MapType, ReduceType> {
|
|||
return getToolkit().getMasterSequenceDictionary();
|
||||
}
|
||||
|
||||
protected SampleDB getSampleDB() {
|
||||
public SampleDB getSampleDB() {
|
||||
return getToolkit().getSampleDB();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
|
@ -54,22 +55,22 @@ public class AlleleBalance extends InfoFieldAnnotation {
|
|||
|
||||
if ( !vc.isBiallelic() )
|
||||
return null;
|
||||
final Map<String, Genotype> genotypes = vc.getGenotypes();
|
||||
final GenotypesContext genotypes = vc.getGenotypes();
|
||||
if ( !vc.hasGenotypes() )
|
||||
return null;
|
||||
|
||||
double ratio = 0.0;
|
||||
double totalWeights = 0.0;
|
||||
for ( Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
|
||||
for ( Genotype genotype : genotypes ) {
|
||||
// we care only about het calls
|
||||
if ( !genotype.getValue().isHet() )
|
||||
if ( !genotype.isHet() )
|
||||
continue;
|
||||
|
||||
AlignmentContext context = stratifiedContexts.get(genotype.getKey());
|
||||
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context == null )
|
||||
continue;
|
||||
|
||||
if ( vc.isSNP() ) {
|
||||
if ( vc.isSNP() && context.hasBasePileup() ) {
|
||||
final String bases = new String(context.getBasePileup().getBases());
|
||||
if ( bases.length() == 0 )
|
||||
return null;
|
||||
|
|
@ -84,8 +85,8 @@ public class AlleleBalance extends InfoFieldAnnotation {
|
|||
continue;
|
||||
|
||||
// weight the allele balance by genotype quality so that e.g. mis-called homs don't affect the ratio too much
|
||||
ratio += genotype.getValue().getNegLog10PError() * ((double)refCount / (double)(refCount + altCount));
|
||||
totalWeights += genotype.getValue().getNegLog10PError();
|
||||
ratio += genotype.getLog10PError() * ((double)refCount / (double)(refCount + altCount));
|
||||
totalWeights += genotype.getLog10PError();
|
||||
} else if ( vc.isIndel() && context.hasExtendedEventPileup() ) {
|
||||
final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
|
||||
if ( indelPileup == null ) {
|
||||
|
|
|
|||
|
|
@ -51,6 +51,9 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim
|
|||
if ( altAlleles.size() == 0 )
|
||||
return null;
|
||||
|
||||
if ( !stratifiedContext.hasBasePileup() )
|
||||
return null;
|
||||
|
||||
final String bases = new String(stratifiedContext.getBasePileup().getBases());
|
||||
if ( bases.length() == 0 )
|
||||
return null;
|
||||
|
|
|
|||
|
|
@ -59,6 +59,8 @@ public class BaseCounts extends InfoFieldAnnotation {
|
|||
int[] counts = new int[4];
|
||||
|
||||
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
|
||||
if ( !sample.getValue().hasBasePileup() )
|
||||
continue;
|
||||
for (byte base : sample.getValue().getBasePileup().getBases() ) {
|
||||
int index = BaseUtils.simpleBaseToBaseIndex(base);
|
||||
if ( index != -1 )
|
||||
|
|
|
|||
|
|
@ -14,7 +14,8 @@ import java.util.List;
|
|||
|
||||
|
||||
/**
|
||||
* The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele)
|
||||
* The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele).
|
||||
* Note that the base quality rank sum test can not be calculated for homozygous sites.
|
||||
*/
|
||||
public class BaseQualityRankSumTest extends RankSumTest {
|
||||
public List<String> getKeyNames() { return Arrays.asList("BaseQRankSum"); }
|
||||
|
|
|
|||
|
|
@ -59,10 +59,8 @@ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnn
|
|||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
if ( ! vc.hasGenotypes() )
|
||||
return null;
|
||||
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
VariantContextUtils.calculateChromosomeCounts(vc, map, true);
|
||||
return map;
|
||||
|
||||
return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap<String, Object>(), true);
|
||||
}
|
||||
|
||||
public List<String> getKeyNames() {
|
||||
|
|
|
|||
|
|
@ -49,5 +49,5 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno
|
|||
|
||||
public List<String> getKeyNames() { return Arrays.asList(VCFConstants.DEPTH_KEY); }
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Filtered Depth")); }
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); }
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,7 +46,8 @@ import java.util.*;
|
|||
/**
|
||||
* Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation
|
||||
* being seen on only the forward or only the reverse strand) in the reads? More bias is
|
||||
* indicative of false positive calls.
|
||||
* indicative of false positive calls. Note that the fisher strand test may not be
|
||||
* calculated for certain complex indel cases or for multi-allelic sites.
|
||||
*/
|
||||
public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
private static final String FS = "FS";
|
||||
|
|
|
|||
|
|
@ -52,6 +52,8 @@ import java.util.*;
|
|||
/**
|
||||
* Consistency of the site with two (and only two) segregating haplotypes. Higher scores
|
||||
* are indicative of regions with bad alignments, often leading to artifactual SNP and indel calls.
|
||||
* Note that the Haplotype Score is only calculated for sites with read coverage; also, for SNPs, the
|
||||
* site must be bi-allelic.
|
||||
*/
|
||||
public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
private final static boolean DEBUG = false;
|
||||
|
|
@ -87,9 +89,8 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
|
||||
final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage();
|
||||
if (haplotypes != null) {
|
||||
final Set<Map.Entry<String, Genotype>> genotypes = vc.getGenotypes().entrySet();
|
||||
for ( final Map.Entry<String, Genotype> genotype : genotypes ) {
|
||||
final AlignmentContext thisContext = stratifiedContexts.get(genotype.getKey());
|
||||
for ( final Genotype genotype : vc.getGenotypes()) {
|
||||
final AlignmentContext thisContext = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( thisContext != null ) {
|
||||
final ReadBackedPileup thisPileup;
|
||||
if (thisContext.hasExtendedEventPileup())
|
||||
|
|
@ -180,12 +181,12 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
final Haplotype haplotype1 = consensusHaplotypeQueue.poll();
|
||||
|
||||
List<Haplotype>hlist = new ArrayList<Haplotype>();
|
||||
hlist.add(new Haplotype(haplotype1.getBasesAsBytes(), 60));
|
||||
hlist.add(new Haplotype(haplotype1.getBases(), 60));
|
||||
|
||||
for (int k=1; k < haplotypesToCompute; k++) {
|
||||
Haplotype haplotype2 = consensusHaplotypeQueue.poll();
|
||||
if(haplotype2 == null ) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found
|
||||
hlist.add(new Haplotype(haplotype2.getBasesAsBytes(), 20));
|
||||
hlist.add(new Haplotype(haplotype2.getBases(), 20));
|
||||
}
|
||||
return hlist;
|
||||
} else
|
||||
|
|
@ -229,8 +230,8 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
}
|
||||
|
||||
private Haplotype getConsensusHaplotype(final Haplotype haplotypeA, final Haplotype haplotypeB) {
|
||||
final byte[] a = haplotypeA.getBasesAsBytes();
|
||||
final byte[] b = haplotypeB.getBasesAsBytes();
|
||||
final byte[] a = haplotypeA.getBases();
|
||||
final byte[] b = haplotypeB.getBases();
|
||||
|
||||
if (a.length != b.length) {
|
||||
throw new ReviewedStingException("Haplotypes a and b must be of same length");
|
||||
|
|
@ -313,7 +314,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
|||
// actually be a miscall in a matching direction, which would happen at a e / 3 rate. If b != c, then
|
||||
// the chance that it is actually a mismatch is 1 - e, since any of the other 3 options would be a mismatch.
|
||||
// so the probability-weighted mismatch rate is sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1 ... n
|
||||
final byte[] haplotypeBases = haplotype.getBasesAsBytes();
|
||||
final byte[] haplotypeBases = haplotype.getBases();
|
||||
final SAMRecord read = p.getRead();
|
||||
byte[] readBases = read.getReadBases();
|
||||
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.QualityUtils;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
|
@ -26,20 +27,18 @@ public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgress
|
|||
|
||||
private static final int MIN_SAMPLES = 10;
|
||||
private static final int MIN_GENOTYPE_QUALITY = 10;
|
||||
private static final int MIN_NEG_LOG10_PERROR = MIN_GENOTYPE_QUALITY / 10;
|
||||
private static final int MIN_LOG10_PERROR = MIN_GENOTYPE_QUALITY / 10;
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
||||
final Map<String, Genotype> genotypes = vc.getGenotypes();
|
||||
final GenotypesContext genotypes = vc.getGenotypes();
|
||||
if ( genotypes == null || genotypes.size() < MIN_SAMPLES )
|
||||
return null;
|
||||
|
||||
int refCount = 0;
|
||||
int hetCount = 0;
|
||||
int homCount = 0;
|
||||
for ( Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
|
||||
Genotype g = genotype.getValue();
|
||||
|
||||
for ( final Genotype g : genotypes ) {
|
||||
if ( g.isNoCall() )
|
||||
continue;
|
||||
|
||||
|
|
@ -47,7 +46,7 @@ public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgress
|
|||
// Right now we just ignore genotypes that are not confident, but this throws off
|
||||
// our HW ratios. More analysis is needed to determine the right thing to do when
|
||||
// the genotyper cannot decide whether a given sample is het or hom var.
|
||||
if ( g.getNegLog10PError() < MIN_NEG_LOG10_PERROR )
|
||||
if ( g.getLog10PError() > MIN_LOG10_PERROR )
|
||||
continue;
|
||||
|
||||
if ( g.isHomRef() )
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ import org.broadinstitute.sting.utils.MathUtils;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
|
@ -23,7 +24,8 @@ import java.util.Map;
|
|||
*
|
||||
* A continuous generalization of the Hardy-Weinberg test for disequilibrium that works
|
||||
* well with limited coverage per sample. See the 1000 Genomes Phase I release for
|
||||
* more information.
|
||||
* more information. Note that the Inbreeding Coefficient will not be calculated for files
|
||||
* with fewer than a minimum (generally 10) number of samples.
|
||||
*/
|
||||
public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
|
||||
|
|
@ -31,7 +33,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
|
|||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
||||
final Map<String, Genotype> genotypes = vc.getGenotypes();
|
||||
final GenotypesContext genotypes = vc.getGenotypes();
|
||||
if ( genotypes == null || genotypes.size() < MIN_SAMPLES )
|
||||
return null;
|
||||
|
||||
|
|
@ -50,8 +52,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
|
|||
double hetCount = 0.0;
|
||||
double homCount = 0.0;
|
||||
int N = 0; // number of samples that have likelihoods
|
||||
for ( final Map.Entry<String, Genotype> genotypeMap : genotypes.entrySet() ) {
|
||||
Genotype g = genotypeMap.getValue();
|
||||
for ( final Genotype g : genotypes ) {
|
||||
if ( g.isNoCall() || !g.hasLikelihoods() )
|
||||
continue;
|
||||
|
||||
|
|
|
|||
|
|
@ -3,22 +3,18 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.samples.Sample;
|
||||
import org.broadinstitute.sting.gatk.samples.SampleDB;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.MendelianViolation;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFFilterHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -30,23 +26,26 @@ import java.util.Map;
|
|||
public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
|
||||
private MendelianViolation mendelianViolation = null;
|
||||
private String motherId;
|
||||
private String fatherId;
|
||||
private String childId;
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
if ( mendelianViolation == null ) {
|
||||
if ( walker instanceof VariantAnnotator && ((VariantAnnotator) walker).familyStr != null) {
|
||||
mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).familyStr, ((VariantAnnotator)walker).minGenotypeQualityP );
|
||||
if (checkAndSetSamples(((VariantAnnotator) walker).getSampleDB())) {
|
||||
mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP );
|
||||
}
|
||||
else {
|
||||
throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid Family String file (-family) on the command line.");
|
||||
throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line containing only 1 trio.");
|
||||
}
|
||||
}
|
||||
|
||||
Map<String,Object> toRet = new HashMap<String,Object>(1);
|
||||
boolean hasAppropriateGenotypes = vc.hasGenotype(mendelianViolation.getSampleChild()) && vc.getGenotype(mendelianViolation.getSampleChild()).hasLikelihoods() &&
|
||||
vc.hasGenotype(mendelianViolation.getSampleDad()) && vc.getGenotype(mendelianViolation.getSampleDad()).hasLikelihoods() &&
|
||||
vc.hasGenotype(mendelianViolation.getSampleMom()) && vc.getGenotype(mendelianViolation.getSampleMom()).hasLikelihoods();
|
||||
boolean hasAppropriateGenotypes = vc.hasGenotype(motherId) && vc.getGenotype(motherId).hasLikelihoods() &&
|
||||
vc.hasGenotype(fatherId) && vc.getGenotype(fatherId).hasLikelihoods() &&
|
||||
vc.hasGenotype(childId) && vc.getGenotype(childId).hasLikelihoods();
|
||||
if ( hasAppropriateGenotypes )
|
||||
toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc));
|
||||
toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc,motherId,fatherId,childId));
|
||||
|
||||
return toRet;
|
||||
}
|
||||
|
|
@ -55,4 +54,27 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment
|
|||
public List<String> getKeyNames() { return Arrays.asList("MVLR"); }
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MVLR", 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); }
|
||||
|
||||
private boolean checkAndSetSamples(SampleDB db){
|
||||
Set<String> families = db.getFamilyIDs();
|
||||
if(families.size() != 1)
|
||||
return false;
|
||||
|
||||
Set<Sample> family = db.getFamily(families.iterator().next());
|
||||
if(family.size() != 3)
|
||||
return false;
|
||||
|
||||
Iterator<Sample> sampleIter = family.iterator();
|
||||
Sample sample;
|
||||
for(sample = sampleIter.next();sampleIter.hasNext();sample=sampleIter.next()){
|
||||
if(sample.getParents().size()==2){
|
||||
motherId = sample.getMaternalID();
|
||||
fatherId = sample.getPaternalID();
|
||||
childId = sample.getID();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ import java.util.List;
|
|||
|
||||
/**
|
||||
* The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele)
|
||||
* Note that the mapping quality rank sum test can not be calculated for homozygous sites.
|
||||
*/
|
||||
public class MappingQualityRankSumTest extends RankSumTest {
|
||||
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnota
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
|
@ -19,7 +20,8 @@ import java.util.Map;
|
|||
/**
|
||||
* Variant confidence (given as (AB+BB)/AA from the PLs) / unfiltered depth.
|
||||
*
|
||||
* Low scores are indicative of false positive calls and artifacts.
|
||||
* Low scores are indicative of false positive calls and artifacts. Note that QualByDepth requires sequencing
|
||||
* reads associated with the samples with polymorphic genotypes.
|
||||
*/
|
||||
public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
|
||||
|
|
@ -27,19 +29,19 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
|
|||
if ( stratifiedContexts.size() == 0 )
|
||||
return null;
|
||||
|
||||
final Map<String, Genotype> genotypes = vc.getGenotypes();
|
||||
final GenotypesContext genotypes = vc.getGenotypes();
|
||||
if ( genotypes == null || genotypes.size() == 0 )
|
||||
return null;
|
||||
|
||||
int depth = 0;
|
||||
|
||||
for ( Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
|
||||
for ( final Genotype genotype : genotypes ) {
|
||||
|
||||
// we care only about variant calls with likelihoods
|
||||
if ( genotype.getValue().isHomRef() )
|
||||
if ( !genotype.isHet() && !genotype.isHomVar() )
|
||||
continue;
|
||||
|
||||
AlignmentContext context = stratifiedContexts.get(genotype.getKey());
|
||||
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context == null )
|
||||
continue;
|
||||
|
||||
|
|
@ -49,7 +51,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
|
|||
if ( depth == 0 )
|
||||
return null;
|
||||
|
||||
double QD = 10.0 * vc.getNegLog10PError() / (double)depth;
|
||||
double QD = -10.0 * vc.getLog10PError() / (double)depth;
|
||||
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), String.format("%.2f", QD));
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ import org.broadinstitute.sting.utils.collections.Pair;
|
|||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -32,7 +33,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
|
|||
if ( stratifiedContexts.size() == 0 )
|
||||
return null;
|
||||
|
||||
final Map<String, Genotype> genotypes = vc.getGenotypes();
|
||||
final GenotypesContext genotypes = vc.getGenotypes();
|
||||
if ( genotypes == null || genotypes.size() == 0 )
|
||||
return null;
|
||||
|
||||
|
|
@ -42,8 +43,8 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
|
|||
|
||||
if (vc.isSNP() && vc.isBiallelic()) {
|
||||
// todo - no current support for multiallelic snps
|
||||
for ( final Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
|
||||
final AlignmentContext context = stratifiedContexts.get(genotype.getKey());
|
||||
for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) {
|
||||
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context == null ) {
|
||||
continue;
|
||||
}
|
||||
|
|
@ -52,8 +53,8 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
|
|||
}
|
||||
else if (vc.isIndel() || vc.isMixed()) {
|
||||
|
||||
for ( final Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
|
||||
final AlignmentContext context = stratifiedContexts.get(genotype.getKey());
|
||||
for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) {
|
||||
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context == null ) {
|
||||
continue;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ import java.util.List;
|
|||
|
||||
/**
|
||||
* The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error).
|
||||
* Note that the read position rank sum test can not be calculated for homozygous sites.
|
||||
*/
|
||||
public class ReadPosRankSumTest extends RankSumTest {
|
||||
|
||||
|
|
|
|||
|
|
@ -47,11 +47,11 @@ import java.util.Map;
|
|||
public class SampleList extends InfoFieldAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
if ( vc.isMonomorphic() || !vc.hasGenotypes() )
|
||||
if ( vc.isMonomorphicInSamples() || !vc.hasGenotypes() )
|
||||
return null;
|
||||
|
||||
StringBuffer samples = new StringBuffer();
|
||||
for ( Genotype genotype : vc.getGenotypesSortedByName() ) {
|
||||
for ( Genotype genotype : vc.getGenotypesOrderedByName() ) {
|
||||
if ( genotype.isCalled() && !genotype.isHomRef() ){
|
||||
if ( samples.length() > 0 )
|
||||
samples.append(",");
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
|
||||
// We refuse to parse SnpEff output files generated by unsupported versions, or
|
||||
// lacking a SnpEff version number in the VCF header:
|
||||
public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" };
|
||||
public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.4" };
|
||||
public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion";
|
||||
public static final String SNPEFF_VCF_HEADER_COMMAND_LINE_KEY = "SnpEffCmd";
|
||||
|
||||
|
|
@ -77,13 +77,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
public enum InfoFieldKey {
|
||||
EFFECT_KEY ("SNPEFF_EFFECT", -1),
|
||||
IMPACT_KEY ("SNPEFF_IMPACT", 0),
|
||||
CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1),
|
||||
AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2),
|
||||
GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3),
|
||||
GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4),
|
||||
TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6),
|
||||
EXON_ID_KEY ("SNPEFF_EXON_ID", 7),
|
||||
FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", -1);
|
||||
FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", 1),
|
||||
CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 2),
|
||||
AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 3),
|
||||
GENE_NAME_KEY ("SNPEFF_GENE_NAME", 4),
|
||||
GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 5),
|
||||
TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 7),
|
||||
EXON_ID_KEY ("SNPEFF_EXON_ID", 8);
|
||||
|
||||
// Actual text of the key
|
||||
private final String keyName;
|
||||
|
|
@ -110,70 +110,53 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
// are validated against this list.
|
||||
public enum EffectType {
|
||||
// High-impact effects:
|
||||
FRAME_SHIFT (EffectFunctionalClass.NONE, false),
|
||||
STOP_GAINED (EffectFunctionalClass.NONSENSE, false),
|
||||
START_LOST (EffectFunctionalClass.NONE, false),
|
||||
SPLICE_SITE_ACCEPTOR (EffectFunctionalClass.NONE, false),
|
||||
SPLICE_SITE_DONOR (EffectFunctionalClass.NONE, false),
|
||||
EXON_DELETED (EffectFunctionalClass.NONE, false),
|
||||
STOP_LOST (EffectFunctionalClass.NONE, false),
|
||||
SPLICE_SITE_ACCEPTOR,
|
||||
SPLICE_SITE_DONOR,
|
||||
START_LOST,
|
||||
EXON_DELETED,
|
||||
FRAME_SHIFT,
|
||||
STOP_GAINED,
|
||||
STOP_LOST,
|
||||
|
||||
// Moderate-impact effects:
|
||||
NON_SYNONYMOUS_CODING (EffectFunctionalClass.MISSENSE, false),
|
||||
CODON_CHANGE (EffectFunctionalClass.NONE, false),
|
||||
CODON_INSERTION (EffectFunctionalClass.NONE, false),
|
||||
CODON_CHANGE_PLUS_CODON_INSERTION (EffectFunctionalClass.NONE, false),
|
||||
CODON_DELETION (EffectFunctionalClass.NONE, false),
|
||||
CODON_CHANGE_PLUS_CODON_DELETION (EffectFunctionalClass.NONE, false),
|
||||
UTR_5_DELETED (EffectFunctionalClass.NONE, false),
|
||||
UTR_3_DELETED (EffectFunctionalClass.NONE, false),
|
||||
NON_SYNONYMOUS_CODING,
|
||||
CODON_CHANGE,
|
||||
CODON_INSERTION,
|
||||
CODON_CHANGE_PLUS_CODON_INSERTION,
|
||||
CODON_DELETION,
|
||||
CODON_CHANGE_PLUS_CODON_DELETION,
|
||||
UTR_5_DELETED,
|
||||
UTR_3_DELETED,
|
||||
|
||||
// Low-impact effects:
|
||||
SYNONYMOUS_CODING (EffectFunctionalClass.SILENT, false),
|
||||
SYNONYMOUS_START (EffectFunctionalClass.SILENT, false),
|
||||
NON_SYNONYMOUS_START (EffectFunctionalClass.SILENT, false),
|
||||
SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false),
|
||||
NON_SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false),
|
||||
START_GAINED (EffectFunctionalClass.NONE, false),
|
||||
SYNONYMOUS_START,
|
||||
NON_SYNONYMOUS_START,
|
||||
START_GAINED,
|
||||
SYNONYMOUS_CODING,
|
||||
SYNONYMOUS_STOP,
|
||||
NON_SYNONYMOUS_STOP,
|
||||
|
||||
// Modifiers:
|
||||
NONE (EffectFunctionalClass.NONE, true),
|
||||
CHROMOSOME (EffectFunctionalClass.NONE, true),
|
||||
INTERGENIC (EffectFunctionalClass.NONE, true),
|
||||
UPSTREAM (EffectFunctionalClass.NONE, true),
|
||||
UTR_5_PRIME (EffectFunctionalClass.NONE, true),
|
||||
CDS (EffectFunctionalClass.NONE, true),
|
||||
GENE (EffectFunctionalClass.NONE, true),
|
||||
TRANSCRIPT (EffectFunctionalClass.NONE, true),
|
||||
EXON (EffectFunctionalClass.NONE, true),
|
||||
INTRON (EffectFunctionalClass.NONE, true),
|
||||
UTR_3_PRIME (EffectFunctionalClass.NONE, true),
|
||||
DOWNSTREAM (EffectFunctionalClass.NONE, true),
|
||||
INTRON_CONSERVED (EffectFunctionalClass.NONE, true),
|
||||
INTERGENIC_CONSERVED (EffectFunctionalClass.NONE, true),
|
||||
REGULATION (EffectFunctionalClass.NONE, true),
|
||||
CUSTOM (EffectFunctionalClass.NONE, true),
|
||||
WITHIN_NON_CODING_GENE (EffectFunctionalClass.NONE, true);
|
||||
|
||||
private final EffectFunctionalClass functionalClass;
|
||||
private final boolean isModifier;
|
||||
|
||||
EffectType ( EffectFunctionalClass functionalClass, boolean isModifier ) {
|
||||
this.functionalClass = functionalClass;
|
||||
this.isModifier = isModifier;
|
||||
}
|
||||
|
||||
public EffectFunctionalClass getFunctionalClass() {
|
||||
return functionalClass;
|
||||
}
|
||||
|
||||
public boolean isModifier() {
|
||||
return isModifier;
|
||||
}
|
||||
NONE,
|
||||
CHROMOSOME,
|
||||
CUSTOM,
|
||||
CDS,
|
||||
GENE,
|
||||
TRANSCRIPT,
|
||||
EXON,
|
||||
INTRON_CONSERVED,
|
||||
UTR_5_PRIME,
|
||||
UTR_3_PRIME,
|
||||
DOWNSTREAM,
|
||||
INTRAGENIC,
|
||||
INTERGENIC,
|
||||
INTERGENIC_CONSERVED,
|
||||
UPSTREAM,
|
||||
REGULATION,
|
||||
INTRON
|
||||
}
|
||||
|
||||
// SnpEff labels each effect as either LOW, MODERATE, or HIGH impact. We take the additional step of
|
||||
// classifying some of the LOW impact effects as MODIFIERs.
|
||||
// SnpEff labels each effect as either LOW, MODERATE, or HIGH impact, or as a MODIFIER.
|
||||
public enum EffectImpact {
|
||||
MODIFIER (0),
|
||||
LOW (1),
|
||||
|
|
@ -202,7 +185,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
UNKNOWN
|
||||
}
|
||||
|
||||
// We assign a functional class to each SnpEff effect.
|
||||
// SnpEff assigns a functional class to each effect.
|
||||
public enum EffectFunctionalClass {
|
||||
NONE (0),
|
||||
SILENT (1),
|
||||
|
|
@ -221,6 +204,11 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
}
|
||||
|
||||
public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ) {
|
||||
throw new UserException("SnpEff support is currently disabled in the GATK until SnpEff 2.0.4 is officially released " +
|
||||
"due to a serious issue with SnpEff versions prior to 2.0.4. Please see this page for more details: " +
|
||||
"http://www.broadinstitute.org/gsa/wiki/index.php/Adding_Genomic_Annotations_Using_SnpEff_and_VariantAnnotator");
|
||||
|
||||
/*
|
||||
// Make sure that we actually have a valid SnpEff rod binding (just in case the user specified -A SnpEff
|
||||
// without providing a SnpEff rod via --snpEffFile):
|
||||
validateRodBinding(walker.getSnpEffRodBinding());
|
||||
|
|
@ -240,6 +228,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
// mistaken in the future for a SnpEff output file:
|
||||
headerLines.add(new VCFHeaderLine(OUTPUT_VCF_HEADER_VERSION_LINE_KEY, snpEffVersionLine.getValue()));
|
||||
headerLines.add(new VCFHeaderLine(OUTPUT_VCF_HEADER_COMMAND_LINE_KEY, snpEffCommandLine.getValue()));
|
||||
*/
|
||||
}
|
||||
|
||||
public Map<String, Object> annotate ( RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc ) {
|
||||
|
|
@ -379,13 +368,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
public List<String> getKeyNames() {
|
||||
return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(),
|
||||
InfoFieldKey.IMPACT_KEY.getKeyName(),
|
||||
InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(),
|
||||
InfoFieldKey.CODON_CHANGE_KEY.getKeyName(),
|
||||
InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(),
|
||||
InfoFieldKey.GENE_NAME_KEY.getKeyName(),
|
||||
InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(),
|
||||
InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(),
|
||||
InfoFieldKey.EXON_ID_KEY.getKeyName(),
|
||||
InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName()
|
||||
InfoFieldKey.EXON_ID_KEY.getKeyName()
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -393,13 +382,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
return Arrays.asList(
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values())),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant (in HGVS style)"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values()))
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant")
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -409,6 +398,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
protected static class SnpEffEffect {
|
||||
private EffectType effect;
|
||||
private EffectImpact impact;
|
||||
private EffectFunctionalClass functionalClass;
|
||||
private String codonChange;
|
||||
private String aminoAcidChange;
|
||||
private String geneName;
|
||||
|
|
@ -420,16 +410,21 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
private String parseError = null;
|
||||
private boolean isWellFormed = true;
|
||||
|
||||
private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8;
|
||||
private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9;
|
||||
private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10;
|
||||
private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 9;
|
||||
private static final int NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR = 10;
|
||||
private static final int NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR = 11;
|
||||
|
||||
// Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header,
|
||||
// errors come after warnings, not vice versa:
|
||||
private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1;
|
||||
private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1;
|
||||
// If there is either a warning OR an error, it will be in the last field. If there is both
|
||||
// a warning AND an error, the warning will be in the second-to-last field, and the error will
|
||||
// be in the last field.
|
||||
private static final int SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR - 1;
|
||||
private static final int SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 2;
|
||||
private static final int SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 1;
|
||||
|
||||
private static final int SNPEFF_CODING_FIELD_INDEX = 5;
|
||||
// Position of the field indicating whether the effect is coding or non-coding. This field is used
|
||||
// in selecting the most significant effect, but is not included in the annotations we return
|
||||
// since it can be deduced from the SNPEFF_GENE_BIOTYPE field.
|
||||
private static final int SNPEFF_CODING_FIELD_INDEX = 6;
|
||||
|
||||
public SnpEffEffect ( String effectName, String[] effectMetadata ) {
|
||||
parseEffectName(effectName);
|
||||
|
|
@ -447,11 +442,14 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
|
||||
private void parseEffectMetadata ( String[] effectMetadata ) {
|
||||
if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) {
|
||||
if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) {
|
||||
parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX]));
|
||||
if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR ) {
|
||||
parseError(String.format("SnpEff issued the following warning or error: \"%s\"",
|
||||
effectMetadata[SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR]));
|
||||
}
|
||||
else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) {
|
||||
parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX]));
|
||||
else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR ) {
|
||||
parseError(String.format("SnpEff issued the following warning: \"%s\", and the following error: \"%s\"",
|
||||
effectMetadata[SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR],
|
||||
effectMetadata[SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR]));
|
||||
}
|
||||
else {
|
||||
parseError(String.format("Wrong number of effect metadata fields. Expected %d but found %d",
|
||||
|
|
@ -461,23 +459,33 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
return;
|
||||
}
|
||||
|
||||
if ( effect != null && effect.isModifier() ) {
|
||||
impact = EffectImpact.MODIFIER;
|
||||
// The impact field will never be empty, and should always contain one of the enumerated values:
|
||||
try {
|
||||
impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]);
|
||||
}
|
||||
else {
|
||||
catch ( IllegalArgumentException e ) {
|
||||
parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]));
|
||||
}
|
||||
|
||||
// The functional class field will be empty when the effect has no functional class associated with it:
|
||||
if ( effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()].trim().length() > 0 ) {
|
||||
try {
|
||||
impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]);
|
||||
functionalClass = EffectFunctionalClass.valueOf(effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()]);
|
||||
}
|
||||
catch ( IllegalArgumentException e ) {
|
||||
parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]));
|
||||
parseError(String.format("Unrecognized value for effect functional class: %s", effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()]));
|
||||
}
|
||||
}
|
||||
else {
|
||||
functionalClass = EffectFunctionalClass.NONE;
|
||||
}
|
||||
|
||||
codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()];
|
||||
aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()];
|
||||
geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()];
|
||||
geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()];
|
||||
|
||||
// The coding field will be empty when SnpEff has no coding info for the effect:
|
||||
if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) {
|
||||
try {
|
||||
coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]);
|
||||
|
|
@ -534,7 +542,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
return true;
|
||||
}
|
||||
else if ( impact.isSameImpactAs(other.impact) ) {
|
||||
return effect.getFunctionalClass().isHigherPriorityThan(other.effect.getFunctionalClass());
|
||||
return functionalClass.isHigherPriorityThan(other.functionalClass);
|
||||
}
|
||||
|
||||
return false;
|
||||
|
|
@ -545,13 +553,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio
|
|||
|
||||
addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString());
|
||||
addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString());
|
||||
addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), functionalClass.toString());
|
||||
addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange);
|
||||
addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange);
|
||||
addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName);
|
||||
addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype);
|
||||
addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID);
|
||||
addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID);
|
||||
addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), effect.getFunctionalClass().toString());
|
||||
|
||||
return annotations;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,90 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.samples.Sample;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.MendelianViolation;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 11/14/11
|
||||
*/
|
||||
|
||||
public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
|
||||
private Set<Sample> trios = null;
|
||||
private final static int REF = 0;
|
||||
private final static int HET = 1;
|
||||
private final static int HOM = 2;
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
if ( trios == null ) {
|
||||
if ( walker instanceof VariantAnnotator ) {
|
||||
trios = ((VariantAnnotator) walker).getSampleDB().getChildrenWithParents();
|
||||
} else {
|
||||
throw new UserException("Transmission disequilibrium test annotation can only be used from the Variant Annotator and requires a valid ped file be passed in.");
|
||||
}
|
||||
}
|
||||
|
||||
final Map<String,Object> toRet = new HashMap<String,Object>(1);
|
||||
final HashSet<Sample> triosToTest = new HashSet<Sample>();
|
||||
|
||||
for( final Sample child : trios) {
|
||||
final boolean hasAppropriateGenotypes = vc.hasGenotype(child.getID()) && vc.getGenotype(child.getID()).hasLikelihoods() &&
|
||||
vc.hasGenotype(child.getPaternalID()) && vc.getGenotype(child.getPaternalID()).hasLikelihoods() &&
|
||||
vc.hasGenotype(child.getMaternalID()) && vc.getGenotype(child.getMaternalID()).hasLikelihoods();
|
||||
if ( hasAppropriateGenotypes ) {
|
||||
triosToTest.add(child);
|
||||
}
|
||||
}
|
||||
|
||||
toRet.put("TDT", calculateTDT( vc, triosToTest ));
|
||||
|
||||
return toRet;
|
||||
}
|
||||
|
||||
// return the descriptions used for the VCF INFO meta field
|
||||
public List<String> getKeyNames() { return Arrays.asList("TDT"); }
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("TDT", 1, VCFHeaderLineType.Float, "Test statistic from Wittkowski transmission disequilibrium test.")); }
|
||||
|
||||
// Following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT
|
||||
private double calculateTDT( final VariantContext vc, final Set<Sample> triosToTest ) {
|
||||
|
||||
final double nABGivenABandBB = calculateNChildren(vc, triosToTest, HET, HET, HOM) + calculateNChildren(vc, triosToTest, HET, HOM, HET);
|
||||
final double nBBGivenABandBB = calculateNChildren(vc, triosToTest, HOM, HET, HOM) + calculateNChildren(vc, triosToTest, HOM, HOM, HET);
|
||||
final double nAAGivenABandAB = calculateNChildren(vc, triosToTest, REF, HET, HET);
|
||||
final double nBBGivenABandAB = calculateNChildren(vc, triosToTest, HOM, HET, HET);
|
||||
final double nAAGivenAAandAB = calculateNChildren(vc, triosToTest, REF, REF, HET) + calculateNChildren(vc, triosToTest, REF, HET, REF);
|
||||
final double nABGivenAAandAB = calculateNChildren(vc, triosToTest, HET, REF, HET) + calculateNChildren(vc, triosToTest, HET, HET, REF);
|
||||
|
||||
final double numer = (nABGivenABandBB - nBBGivenABandBB) + 2.0 * (nAAGivenABandAB - nBBGivenABandAB) + (nAAGivenAAandAB - nABGivenAAandAB);
|
||||
final double denom = (nABGivenABandBB + nBBGivenABandBB) + 4.0 * (nAAGivenABandAB + nBBGivenABandAB) + (nAAGivenAAandAB + nABGivenAAandAB);
|
||||
return (numer * numer) / denom;
|
||||
}
|
||||
|
||||
private double calculateNChildren( final VariantContext vc, final Set<Sample> triosToTest, final int childIdx, final int parent1Idx, final int parent2Idx ) {
|
||||
final double likelihoodVector[] = new double[triosToTest.size()];
|
||||
int iii = 0;
|
||||
for( final Sample child : triosToTest ) {
|
||||
final double[] momGL = vc.getGenotype(child.getMaternalID()).getLikelihoods().getAsVector();
|
||||
final double[] dadGL = vc.getGenotype(child.getPaternalID()).getLikelihoods().getAsVector();
|
||||
final double[] childGL = vc.getGenotype(child.getID()).getLikelihoods().getAsVector();
|
||||
likelihoodVector[iii++] = momGL[parent1Idx] + dadGL[parent2Idx] + childGL[childIdx];
|
||||
}
|
||||
|
||||
return MathUtils.sumLog10(likelihoodVector);
|
||||
}
|
||||
}
|
||||
|
|
@ -32,11 +32,9 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.samples.SampleDB;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationType;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
|
|
@ -71,8 +69,9 @@ import java.util.*;
|
|||
* -T VariantAnnotator \
|
||||
* -I input.bam \
|
||||
* -o output.vcf \
|
||||
* -A DepthOfCoverage
|
||||
* -A DepthOfCoverage \
|
||||
* --variant input.vcf \
|
||||
* -L input.vcf \
|
||||
* --dbsnp dbsnp.vcf
|
||||
* </pre>
|
||||
*
|
||||
|
|
@ -164,35 +163,32 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
@Argument(fullName="list", shortName="ls", doc="List the available annotations and exit")
|
||||
protected Boolean LIST = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
|
||||
protected String ASSUME_SINGLE_SAMPLE = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
|
||||
protected boolean indelsOnly = false;
|
||||
|
||||
@Argument(fullName="family_string",shortName="family",required=false,doc="A family string of the form mom+dad=child for use with the mendelian violation ratio annotation")
|
||||
public String familyStr = null;
|
||||
|
||||
@Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="The genotype quality treshold in order to annotate mendelian violation ratio")
|
||||
public double minGenotypeQualityP = 0.0;
|
||||
|
||||
@Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp tracks that exactly match both reference and alternate alleles will be counted as concordant", required=false)
|
||||
private boolean requireStrictAlleleMatch = false;
|
||||
|
||||
private VariantAnnotatorEngine engine;
|
||||
|
||||
private Collection<VariantContext> indelBufferContext;
|
||||
|
||||
|
||||
private void listAnnotationsAndExit() {
|
||||
System.out.println("\nStandard annotations in the list below are marked with a '*'.");
|
||||
List<Class<? extends InfoFieldAnnotation>> infoAnnotationClasses = new PluginManager<InfoFieldAnnotation>(InfoFieldAnnotation.class).getPlugins();
|
||||
System.out.println("\nAvailable annotations for the VCF INFO field:");
|
||||
for (int i = 0; i < infoAnnotationClasses.size(); i++)
|
||||
System.out.println("\t" + infoAnnotationClasses.get(i).getSimpleName());
|
||||
System.out.println("\t" + (StandardAnnotation.class.isAssignableFrom(infoAnnotationClasses.get(i)) ? "*" : "") + infoAnnotationClasses.get(i).getSimpleName());
|
||||
System.out.println();
|
||||
List<Class<? extends GenotypeAnnotation>> genotypeAnnotationClasses = new PluginManager<GenotypeAnnotation>(GenotypeAnnotation.class).getPlugins();
|
||||
System.out.println("\nAvailable annotations for the VCF FORMAT field:");
|
||||
for (int i = 0; i < genotypeAnnotationClasses.size(); i++)
|
||||
System.out.println("\t" + genotypeAnnotationClasses.get(i).getSimpleName());
|
||||
System.out.println("\t" + (StandardAnnotation.class.isAssignableFrom(genotypeAnnotationClasses.get(i)) ? "*" : "") + genotypeAnnotationClasses.get(i).getSimpleName());
|
||||
System.out.println();
|
||||
System.out.println("\nAvailable classes/groups of annotations:");
|
||||
for ( Class c : new PluginManager<AnnotationType>(AnnotationType.class).getInterfaces() )
|
||||
|
|
@ -213,16 +209,12 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
List<String> rodName = Arrays.asList(variantCollection.variants.getName());
|
||||
Set<String> samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName);
|
||||
|
||||
// if there are no valid samples, warn the user
|
||||
if ( samples.size() == 0 ) {
|
||||
logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired.");
|
||||
}
|
||||
|
||||
if ( USE_ALL_ANNOTATIONS )
|
||||
engine = new VariantAnnotatorEngine(annotationsToExclude, this, getToolkit());
|
||||
else
|
||||
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
engine.initializeExpressions(expressionsToUse);
|
||||
engine.setRequireStrictAlleleMatch(requireStrictAlleleMatch);
|
||||
|
||||
// setup the header fields
|
||||
// note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones
|
||||
|
|
@ -232,8 +224,33 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
if ( isUniqueHeaderLine(line, hInfo) )
|
||||
hInfo.add(line);
|
||||
}
|
||||
for ( String expression : expressionsToUse )
|
||||
hInfo.add(new VCFInfoHeaderLine(expression, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Value transferred from another external VCF resource"));
|
||||
// for the expressions, pull the info header line from the header of the resource rod
|
||||
for ( VariantAnnotatorEngine.VAExpression expression : engine.getRequestedExpressions() ) {
|
||||
// special case the ID field
|
||||
if ( expression.fieldName.equals("ID") ) {
|
||||
hInfo.add(new VCFInfoHeaderLine(expression.fullName, 1, VCFHeaderLineType.String, "ID field transferred from external VCF resource"));
|
||||
continue;
|
||||
}
|
||||
VCFInfoHeaderLine targetHeaderLine = null;
|
||||
for ( VCFHeaderLine line : VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(expression.binding.getName())) ) {
|
||||
if ( line instanceof VCFInfoHeaderLine ) {
|
||||
VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line;
|
||||
if ( infoline.getName().equals(expression.fieldName) ) {
|
||||
targetHeaderLine = infoline;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( targetHeaderLine != null ) {
|
||||
if ( targetHeaderLine.getCountType() == VCFHeaderLineCount.INTEGER )
|
||||
hInfo.add(new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCount(), targetHeaderLine.getType(), targetHeaderLine.getDescription()));
|
||||
else
|
||||
hInfo.add(new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCountType(), targetHeaderLine.getType(), targetHeaderLine.getDescription()));
|
||||
} else {
|
||||
hInfo.add(new VCFInfoHeaderLine(expression.fullName, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Value transferred from another external VCF resource"));
|
||||
}
|
||||
}
|
||||
|
||||
engine.invokeAnnotationInitializationMethods(hInfo);
|
||||
|
||||
|
|
@ -301,9 +318,9 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
Map<String, AlignmentContext> stratifiedContexts;
|
||||
if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
|
||||
if ( ! context.hasExtendedEventPileup() ) {
|
||||
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup(), ASSUME_SINGLE_SAMPLE);
|
||||
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup());
|
||||
} else {
|
||||
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getExtendedEventPileup(), ASSUME_SINGLE_SAMPLE);
|
||||
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getExtendedEventPileup());
|
||||
}
|
||||
if ( stratifiedContexts != null ) {
|
||||
annotatedVCs = new ArrayList<VariantContext>(VCs.size());
|
||||
|
|
|
|||
|
|
@ -34,7 +34,9 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -45,24 +47,26 @@ public class VariantAnnotatorEngine {
|
|||
private List<GenotypeAnnotation> requestedGenotypeAnnotations;
|
||||
private List<VAExpression> requestedExpressions = new ArrayList<VAExpression>();
|
||||
|
||||
private HashMap<RodBinding<VariantContext>, String> dbAnnotations = new HashMap<RodBinding<VariantContext>, String>();
|
||||
private AnnotatorCompatibleWalker walker;
|
||||
private GenomeAnalysisEngine toolkit;
|
||||
private final HashMap<RodBinding<VariantContext>, String> dbAnnotations = new HashMap<RodBinding<VariantContext>, String>();
|
||||
private final AnnotatorCompatibleWalker walker;
|
||||
private final GenomeAnalysisEngine toolkit;
|
||||
|
||||
private static class VAExpression {
|
||||
private boolean requireStrictAlleleMatch = false;
|
||||
|
||||
protected static class VAExpression {
|
||||
|
||||
public String fullName, fieldName;
|
||||
public RodBinding<VariantContext> binding;
|
||||
|
||||
public VAExpression(String fullEpression, List<RodBinding<VariantContext>> bindings) {
|
||||
int indexOfDot = fullEpression.lastIndexOf(".");
|
||||
public VAExpression(String fullExpression, List<RodBinding<VariantContext>> bindings) {
|
||||
int indexOfDot = fullExpression.lastIndexOf(".");
|
||||
if ( indexOfDot == -1 )
|
||||
throw new UserException.BadArgumentValue(fullEpression, "it should be in rodname.value format");
|
||||
throw new UserException.BadArgumentValue(fullExpression, "it should be in rodname.value format");
|
||||
|
||||
fullName = fullEpression;
|
||||
fieldName = fullEpression.substring(indexOfDot+1);
|
||||
fullName = fullExpression;
|
||||
fieldName = fullExpression.substring(indexOfDot+1);
|
||||
|
||||
String bindingName = fullEpression.substring(0, indexOfDot);
|
||||
String bindingName = fullExpression.substring(0, indexOfDot);
|
||||
for ( RodBinding<VariantContext> rod : bindings ) {
|
||||
if ( rod.getName().equals(bindingName) ) {
|
||||
binding = rod;
|
||||
|
|
@ -97,6 +101,8 @@ public class VariantAnnotatorEngine {
|
|||
requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings()));
|
||||
}
|
||||
|
||||
protected List<VAExpression> getRequestedExpressions() { return requestedExpressions; }
|
||||
|
||||
private void initializeAnnotations(List<String> annotationGroupsToUse, List<String> annotationsToUse, List<String> annotationsToExclude) {
|
||||
AnnotationInterfaceManager.validateAnnotations(annotationGroupsToUse, annotationsToUse);
|
||||
requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(annotationGroupsToUse, annotationsToUse);
|
||||
|
|
@ -159,12 +165,15 @@ public class VariantAnnotatorEngine {
|
|||
return descriptions;
|
||||
}
|
||||
|
||||
public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
public void setRequireStrictAlleleMatch( final boolean requireStrictAlleleMatch ) {
|
||||
this.requireStrictAlleleMatch = requireStrictAlleleMatch;
|
||||
}
|
||||
|
||||
public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
Map<String, Object> infoAnnotations = new LinkedHashMap<String, Object>(vc.getAttributes());
|
||||
|
||||
// annotate db occurrences
|
||||
annotateDBs(tracker, ref, vc, infoAnnotations);
|
||||
vc = annotateDBs(tracker, ref, vc, infoAnnotations);
|
||||
|
||||
// annotate expressions where available
|
||||
annotateExpressions(tracker, ref, infoAnnotations);
|
||||
|
|
@ -177,24 +186,24 @@ public class VariantAnnotatorEngine {
|
|||
}
|
||||
|
||||
// generate a new annotated VC
|
||||
final VariantContext annotatedVC = VariantContext.modifyAttributes(vc, infoAnnotations);
|
||||
VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations);
|
||||
|
||||
// annotate genotypes, creating another new VC in the process
|
||||
return VariantContext.modifyGenotypes(annotatedVC, annotateGenotypes(tracker, ref, stratifiedContexts, vc));
|
||||
return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc)).make();
|
||||
}
|
||||
|
||||
private void annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) {
|
||||
private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) {
|
||||
for ( Map.Entry<RodBinding<VariantContext>, String> dbSet : dbAnnotations.entrySet() ) {
|
||||
if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) {
|
||||
String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());
|
||||
infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null);
|
||||
// annotate dbsnp id if available and not already there
|
||||
if ( rsID != null && (!vc.hasID() || vc.getID().equals(VCFConstants.EMPTY_ID_FIELD)) )
|
||||
infoAnnotations.put(VariantContext.ID_KEY, rsID);
|
||||
if ( rsID != null && vc.emptyID() )
|
||||
vc = new VariantContextBuilder(vc).id(rsID).make();
|
||||
} else {
|
||||
boolean overlapsComp = false;
|
||||
for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) {
|
||||
if ( !comp.isFiltered() ) {
|
||||
if ( !comp.isFiltered() && ( !requireStrictAlleleMatch || comp.getAlleles().equals(vc.getAlleles()) ) ) {
|
||||
overlapsComp = true;
|
||||
break;
|
||||
}
|
||||
|
|
@ -202,6 +211,8 @@ public class VariantAnnotatorEngine {
|
|||
infoAnnotations.put(dbSet.getValue(), overlapsComp);
|
||||
}
|
||||
}
|
||||
|
||||
return vc;
|
||||
}
|
||||
|
||||
private void annotateExpressions(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, Object> infoAnnotations) {
|
||||
|
|
@ -211,21 +222,25 @@ public class VariantAnnotatorEngine {
|
|||
continue;
|
||||
|
||||
VariantContext vc = VCs.iterator().next();
|
||||
if ( vc.hasAttribute(expression.fieldName) )
|
||||
// special-case the ID field
|
||||
if ( expression.fieldName.equals("ID") ) {
|
||||
if ( vc.hasID() )
|
||||
infoAnnotations.put(expression.fullName, vc.getID());
|
||||
} else if ( vc.hasAttribute(expression.fieldName) ) {
|
||||
infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Map<String, Genotype> annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
private GenotypesContext annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
if ( requestedGenotypeAnnotations.size() == 0 )
|
||||
return vc.getGenotypes();
|
||||
|
||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(vc.getNSamples());
|
||||
for ( Map.Entry<String, Genotype> g : vc.getGenotypes().entrySet() ) {
|
||||
Genotype genotype = g.getValue();
|
||||
AlignmentContext context = stratifiedContexts.get(g.getKey());
|
||||
GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples());
|
||||
for ( final Genotype genotype : vc.getGenotypes() ) {
|
||||
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context == null ) {
|
||||
genotypes.put(g.getKey(), genotype);
|
||||
genotypes.add(genotype);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -235,7 +250,7 @@ public class VariantAnnotatorEngine {
|
|||
if ( result != null )
|
||||
genotypeAnnotations.putAll(result);
|
||||
}
|
||||
genotypes.put(g.getKey(), new Genotype(g.getKey(), genotype.getAlleles(), genotype.getNegLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.isPhased()));
|
||||
genotypes.add(new Genotype(genotype.getSampleName(), genotype.getAlleles(), genotype.getLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.isPhased()));
|
||||
}
|
||||
|
||||
return genotypes;
|
||||
|
|
|
|||
|
|
@ -36,10 +36,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -125,7 +122,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
protected static String line = null;
|
||||
|
||||
private final double MIN_PROB_ERROR = 0.000001;
|
||||
private final double MAX_GENOTYPE_QUALITY = 6.0;
|
||||
private final double MAX_GENOTYPE_QUALITY = -6.0;
|
||||
|
||||
public void initialize() {
|
||||
|
||||
|
|
@ -181,8 +178,8 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
// ignore places where we don't have a variant
|
||||
if ( beagleR2Feature == null || beagleProbsFeature == null || beaglePhasedFeature == null)
|
||||
{
|
||||
vcfWriter.add(vc_input);
|
||||
return 1;
|
||||
vcfWriter.add(vc_input);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -190,8 +187,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
byte refByte = ref.getBase();
|
||||
|
||||
// make new Genotypes based on Beagle results
|
||||
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(vc_input.getGenotypes().size());
|
||||
|
||||
GenotypesContext genotypes = GenotypesContext.create(vc_input.getGenotypes().size());
|
||||
|
||||
// for each genotype, create a new object with Beagle information on it
|
||||
|
||||
|
|
@ -200,15 +196,13 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
Double alleleFrequencyH = 0.0;
|
||||
int beagleVarCounts = 0;
|
||||
|
||||
Map<String,Genotype> hapmapGenotypes = null;
|
||||
GenotypesContext hapmapGenotypes = null;
|
||||
|
||||
if (vc_comp != null) {
|
||||
hapmapGenotypes = vc_comp.getGenotypes();
|
||||
}
|
||||
|
||||
for ( Map.Entry<String, Genotype> originalGenotypes : vc_input.getGenotypes().entrySet() ) {
|
||||
|
||||
Genotype g = originalGenotypes.getValue();
|
||||
for ( final Genotype g : vc_input.getGenotypes() ) {
|
||||
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
|
||||
|
||||
boolean genotypeIsPhased = true;
|
||||
|
|
@ -218,7 +212,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
// use sample as key into genotypes structure
|
||||
if (vc_comp != null) {
|
||||
|
||||
if (vc_input.getGenotypes().containsKey(sample) && hapmapGenotypes.containsKey(sample)) {
|
||||
if (vc_input.getGenotypes().containsSample(sample) && hapmapGenotypes.containsSample(sample)) {
|
||||
|
||||
Genotype hapmapGenotype = hapmapGenotypes.get(sample);
|
||||
if (hapmapGenotype.isCalled()){
|
||||
|
|
@ -255,9 +249,9 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
Allele bglAlleleA, bglAlleleB;
|
||||
|
||||
if (alleleA.matches(refString))
|
||||
bglAlleleA = Allele.create(alleleA,true);
|
||||
bglAlleleA = Allele.create(alleleA,true);
|
||||
else
|
||||
bglAlleleA = Allele.create(alleleA,false);
|
||||
bglAlleleA = Allele.create(alleleA,false);
|
||||
|
||||
if (alleleB.matches(refString))
|
||||
bglAlleleB = Allele.create(alleleB,true);
|
||||
|
|
@ -286,7 +280,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
// deal with numerical errors coming from limited formatting value on Beagle output files
|
||||
if (probWrongGenotype > 1 - MIN_PROB_ERROR)
|
||||
probWrongGenotype = 1 - MIN_PROB_ERROR;
|
||||
|
||||
|
||||
if (1-probWrongGenotype < noCallThreshold) {
|
||||
// quality is bad: don't call genotype
|
||||
alleles.clear();
|
||||
|
|
@ -298,7 +292,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
if (probWrongGenotype < MIN_PROB_ERROR)
|
||||
genotypeQuality = MAX_GENOTYPE_QUALITY;
|
||||
else
|
||||
genotypeQuality = -log10(probWrongGenotype);
|
||||
genotypeQuality = log10(probWrongGenotype);
|
||||
|
||||
HashMap<String,Object> originalAttributes = new HashMap<String,Object>(g.getAttributes());
|
||||
|
||||
|
|
@ -329,47 +323,40 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
else {
|
||||
originalAttributes.put("OG",".");
|
||||
}
|
||||
Genotype imputedGenotype = new Genotype(originalGenotypes.getKey(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased);
|
||||
Genotype imputedGenotype = new Genotype(g.getSampleName(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased);
|
||||
if ( imputedGenotype.isHet() || imputedGenotype.isHomVar() ) {
|
||||
beagleVarCounts++;
|
||||
}
|
||||
|
||||
genotypes.put(originalGenotypes.getKey(), imputedGenotype);
|
||||
|
||||
genotypes.add(imputedGenotype);
|
||||
}
|
||||
|
||||
VariantContext filteredVC;
|
||||
if ( beagleVarCounts > 0 || DONT_FILTER_MONOMORPHIC_SITES )
|
||||
filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes());
|
||||
else {
|
||||
final VariantContextBuilder builder = new VariantContextBuilder(vc_input).source("outputvcf").genotypes(genotypes);
|
||||
if ( ! ( beagleVarCounts > 0 || DONT_FILTER_MONOMORPHIC_SITES ) ) {
|
||||
Set<String> removedFilters = vc_input.filtersWereApplied() ? new HashSet<String>(vc_input.getFilters()) : new HashSet<String>(1);
|
||||
removedFilters.add(String.format("BGL_RM_WAS_%s",vc_input.getAlternateAllele(0)));
|
||||
filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), new HashSet<Allele>(Arrays.asList(vc_input.getReference())), genotypes, vc_input.getNegLog10PError(), removedFilters, vc_input.getAttributes());
|
||||
builder.alleles(new HashSet<Allele>(Arrays.asList(vc_input.getReference()))).filters(removedFilters);
|
||||
}
|
||||
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>(filteredVC.getAttributes());
|
||||
// re-compute chromosome counts
|
||||
VariantContextUtils.calculateChromosomeCounts(filteredVC, attributes, false);
|
||||
VariantContextUtils.calculateChromosomeCounts(builder, false);
|
||||
|
||||
// Get Hapmap AC and AF
|
||||
if (vc_comp != null) {
|
||||
attributes.put("ACH", alleleCountH.toString() );
|
||||
attributes.put("ANH", chrCountH.toString() );
|
||||
attributes.put("AFH", String.format("%4.2f", (double)alleleCountH/chrCountH) );
|
||||
builder.attribute("ACH", alleleCountH.toString() );
|
||||
builder.attribute("ANH", chrCountH.toString() );
|
||||
builder.attribute("AFH", String.format("%4.2f", (double)alleleCountH/chrCountH) );
|
||||
|
||||
}
|
||||
|
||||
attributes.put("NumGenotypesChanged", numGenotypesChangedByBeagle );
|
||||
builder.attribute("NumGenotypesChanged", numGenotypesChangedByBeagle );
|
||||
if( !beagleR2Feature.getR2value().equals(Double.NaN) ) {
|
||||
attributes.put("R2", beagleR2Feature.getR2value().toString() );
|
||||
builder.attribute("R2", beagleR2Feature.getR2value().toString() );
|
||||
}
|
||||
|
||||
|
||||
vcfWriter.add(VariantContext.modifyAttributes(filteredVC,attributes));
|
||||
|
||||
vcfWriter.add(builder.make());
|
||||
|
||||
return 1;
|
||||
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
|
|
|
|||
|
|
@ -39,10 +39,7 @@ import org.broadinstitute.sting.utils.MathUtils;
|
|||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
|
|
@ -204,7 +201,7 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1));
|
||||
if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) {
|
||||
if ( bootstrapVCFOutput != null ) {
|
||||
bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER));
|
||||
bootstrapVCFOutput.add(new VariantContextBuilder(validation).filters(BOOTSTRAP_FILTER).make());
|
||||
}
|
||||
bootstrapSetSize++;
|
||||
return true;
|
||||
|
|
@ -245,18 +242,18 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
if ( markers != null ) markers.append("\n");
|
||||
|
||||
Map<String,Genotype> preferredGenotypes = preferredVC.getGenotypes();
|
||||
Map<String,Genotype> otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
|
||||
GenotypesContext preferredGenotypes = preferredVC.getGenotypes();
|
||||
GenotypesContext otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
|
||||
for ( String sample : samples ) {
|
||||
boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE;
|
||||
|
||||
Genotype genotype;
|
||||
boolean isValidation;
|
||||
// use sample as key into genotypes structure
|
||||
if ( preferredGenotypes.keySet().contains(sample) ) {
|
||||
if ( preferredGenotypes.containsSample(sample) ) {
|
||||
genotype = preferredGenotypes.get(sample);
|
||||
isValidation = isValidationSite;
|
||||
} else if ( otherGenotypes != null && otherGenotypes.keySet().contains(sample) ) {
|
||||
} else if ( otherGenotypes != null && otherGenotypes.containsSample(sample) ) {
|
||||
genotype = otherGenotypes.get(sample);
|
||||
isValidation = ! isValidationSite;
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -31,7 +31,6 @@ import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
|||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -68,7 +67,8 @@ public class GATKReportDiffableReader implements DiffableReader {
|
|||
for ( GATKReportColumn column : table.getColumns().values() ) {
|
||||
DiffNode columnRoot = DiffNode.empty(column.getColumnName(), tableRoot);
|
||||
|
||||
columnRoot.add("Width", column.getColumnWidth());
|
||||
columnRoot.add("Width", column.getColumnFormat().getWidth());
|
||||
// NOTE: as the values are trimmed during parsing left/right alignment is not currently preserved
|
||||
columnRoot.add("Displayable", column.isDisplayable());
|
||||
|
||||
int n = 1;
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.diffengine;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
|
|
@ -32,7 +33,6 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
|
|
@ -46,6 +46,8 @@ import java.util.Map;
|
|||
* Class implementing diffnode reader for VCF
|
||||
*/
|
||||
public class VCFDiffableReader implements DiffableReader {
|
||||
private static Logger logger = Logger.getLogger(VCFDiffableReader.class);
|
||||
|
||||
@Override
|
||||
public String getName() { return "VCF"; }
|
||||
|
||||
|
|
@ -68,7 +70,10 @@ public class VCFDiffableReader implements DiffableReader {
|
|||
String key = headerLine.getKey();
|
||||
if ( headerLine instanceof VCFNamedHeaderLine )
|
||||
key += "_" + ((VCFNamedHeaderLine) headerLine).getName();
|
||||
root.add(key, headerLine.toString());
|
||||
if ( root.hasElement(key) )
|
||||
logger.warn("Skipping duplicate header line: file=" + file + " line=" + headerLine.toString());
|
||||
else
|
||||
root.add(key, headerLine.toString());
|
||||
}
|
||||
|
||||
String line = lineReader.readLine();
|
||||
|
|
@ -90,22 +95,22 @@ public class VCFDiffableReader implements DiffableReader {
|
|||
// add fields
|
||||
vcRoot.add("CHROM", vc.getChr());
|
||||
vcRoot.add("POS", vc.getStart());
|
||||
vcRoot.add("ID", vc.hasID() ? vc.getID() : VCFConstants.MISSING_VALUE_v4);
|
||||
vcRoot.add("ID", vc.getID());
|
||||
vcRoot.add("REF", vc.getReference());
|
||||
vcRoot.add("ALT", vc.getAlternateAlleles());
|
||||
vcRoot.add("QUAL", vc.hasNegLog10PError() ? vc.getNegLog10PError() * 10 : VCFConstants.MISSING_VALUE_v4);
|
||||
vcRoot.add("QUAL", vc.hasLog10PError() ? vc.getLog10PError() * -10 : VCFConstants.MISSING_VALUE_v4);
|
||||
vcRoot.add("FILTER", vc.getFilters());
|
||||
|
||||
// add info fields
|
||||
for (Map.Entry<String, Object> attribute : vc.getAttributes().entrySet()) {
|
||||
if ( ! attribute.getKey().startsWith("_") && ! attribute.getKey().equals(VariantContext.ID_KEY))
|
||||
if ( ! attribute.getKey().startsWith("_") )
|
||||
vcRoot.add(attribute.getKey(), attribute.getValue());
|
||||
}
|
||||
|
||||
for (Genotype g : vc.getGenotypes().values() ) {
|
||||
for (Genotype g : vc.getGenotypes() ) {
|
||||
DiffNode gRoot = DiffNode.empty(g.getSampleName(), vcRoot);
|
||||
gRoot.add("GT", g.getGenotypeString());
|
||||
gRoot.add("GQ", g.hasNegLog10PError() ? g.getNegLog10PError() * 10 : VCFConstants.MISSING_VALUE_v4 );
|
||||
gRoot.add("GQ", g.hasLog10PError() ? g.getLog10PError() * -10 : VCFConstants.MISSING_VALUE_v4 );
|
||||
|
||||
for (Map.Entry<String, Object> attribute : g.getAttributes().entrySet()) {
|
||||
if ( ! attribute.getKey().startsWith("_") )
|
||||
|
|
@ -129,6 +134,6 @@ public class VCFDiffableReader implements DiffableReader {
|
|||
|
||||
@Override
|
||||
public boolean canRead(File file) {
|
||||
return AbstractVCFCodec.canDecodeFile(file, VCFCodec.VCF4_MAGIC_HEADER);
|
||||
return AbstractVCFCodec.canDecodeFile(file.getPath(), VCFCodec.VCF4_MAGIC_HEADER);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -36,9 +36,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
|
|||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -224,7 +222,7 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
|||
(vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied
|
||||
Set<String> filters = new LinkedHashSet<String>(vc.getFilters());
|
||||
filters.add(MASK_NAME);
|
||||
vc = VariantContext.modifyFilters(vc, filters);
|
||||
vc = new VariantContextBuilder(vc).filters(filters).make();
|
||||
}
|
||||
|
||||
FiltrationContext varContext = new FiltrationContext(ref, vc);
|
||||
|
|
@ -267,7 +265,7 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
|||
(vc.getFilters() == null || !vc.getFilters().contains(MASK_NAME)) ) { // the filter hasn't already been applied
|
||||
Set<String> filters = new LinkedHashSet<String>(vc.getFilters());
|
||||
filters.add(MASK_NAME);
|
||||
vc = VariantContext.modifyFilters(vc, filters);
|
||||
vc = new VariantContextBuilder(vc).filters(filters).make();
|
||||
}
|
||||
|
||||
return vc;
|
||||
|
|
@ -279,20 +277,15 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
|||
if ( context == null )
|
||||
return;
|
||||
|
||||
VariantContext vc = context.getVariantContext();
|
||||
final VariantContext vc = context.getVariantContext();
|
||||
final VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
|
||||
// make new Genotypes based on filters
|
||||
Map<String, Genotype> genotypes;
|
||||
if ( genotypeFilterExps.size() == 0 ) {
|
||||
genotypes = null;
|
||||
} else {
|
||||
genotypes = new HashMap<String, Genotype>(vc.getGenotypes().size());
|
||||
if ( genotypeFilterExps.size() > 0 ) {
|
||||
GenotypesContext genotypes = GenotypesContext.create(vc.getGenotypes().size());
|
||||
|
||||
// for each genotype, check filters then create a new object
|
||||
for ( Map.Entry<String, Genotype> genotype : vc.getGenotypes().entrySet() ) {
|
||||
|
||||
Genotype g = genotype.getValue();
|
||||
|
||||
for ( final Genotype g : vc.getGenotypes() ) {
|
||||
if ( g.isCalled() ) {
|
||||
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
|
||||
|
||||
|
|
@ -300,11 +293,13 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
|||
if ( VariantContextUtils.match(vc, g, exp) )
|
||||
filters.add(exp.name);
|
||||
}
|
||||
genotypes.put(genotype.getKey(), new Genotype(genotype.getKey(), g.getAlleles(), g.getNegLog10PError(), filters, g.getAttributes(), g.isPhased()));
|
||||
genotypes.add(new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), filters, g.getAttributes(), g.isPhased()));
|
||||
} else {
|
||||
genotypes.put(genotype.getKey(), g);
|
||||
genotypes.add(g);
|
||||
}
|
||||
}
|
||||
|
||||
builder.genotypes(genotypes);
|
||||
}
|
||||
|
||||
// make a new variant context based on filters
|
||||
|
|
@ -324,14 +319,9 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
|||
filters.add(exp.name);
|
||||
}
|
||||
}
|
||||
builder.filters(filters);
|
||||
|
||||
VariantContext filteredVC;
|
||||
if ( genotypes == null )
|
||||
filteredVC = VariantContext.modifyFilters(vc, filters);
|
||||
else
|
||||
filteredVC = new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), filters, vc.getAttributes());
|
||||
|
||||
writer.add(filteredVC);
|
||||
writer.add(builder.make());
|
||||
}
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
|
|
|
|||
|
|
@ -26,16 +26,11 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -47,8 +42,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
|||
public enum Model {
|
||||
/** The default model with the best performance in all cases */
|
||||
EXACT,
|
||||
/** For posterity we have kept around the older GRID_SEARCH model, but this gives inferior results and shouldn't be used. */
|
||||
GRID_SEARCH
|
||||
}
|
||||
|
||||
protected int N;
|
||||
|
|
@ -58,7 +51,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
|||
|
||||
protected enum GenotypeType { AA, AB, BB }
|
||||
|
||||
protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE;
|
||||
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
|
||||
|
||||
protected AlleleFrequencyCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
this.N = N;
|
||||
|
|
@ -68,24 +61,12 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
|||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
* @param GLs genotype likelihoods
|
||||
* @param Alleles Alleles corresponding to GLs
|
||||
* @param log10AlleleFrequencyPriors priors
|
||||
* @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results
|
||||
* @param GLs genotype likelihoods
|
||||
* @param Alleles Alleles corresponding to GLs
|
||||
* @param log10AlleleFrequencyPriors priors
|
||||
* @param result (pre-allocated) object to store likelihoods results
|
||||
*/
|
||||
protected abstract void getLog10PNonRef(Map<String, Genotype> GLs, List<Allele> Alleles,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors);
|
||||
|
||||
/**
|
||||
* Can be overridden by concrete subclasses
|
||||
* @param vc variant context with genotype likelihoods
|
||||
* @param log10AlleleFrequencyPosteriors allele frequency results
|
||||
* @param AFofMaxLikelihood allele frequency of max likelihood
|
||||
*
|
||||
* @return calls
|
||||
*/
|
||||
protected abstract Map<String, Genotype> assignGenotypes(VariantContext vc,
|
||||
double[] log10AlleleFrequencyPosteriors,
|
||||
int AFofMaxLikelihood);
|
||||
protected abstract void getLog10PNonRef(GenotypesContext GLs, List<Allele> Alleles,
|
||||
double[][] log10AlleleFrequencyPriors,
|
||||
AlleleFrequencyCalculationResult result);
|
||||
}
|
||||
|
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* Date: Dec 14, 2011
|
||||
*
|
||||
* Useful helper class to communicate the results of the allele frequency calculation
|
||||
*/
|
||||
public class AlleleFrequencyCalculationResult {
|
||||
|
||||
// IMPORTANT NOTE:
|
||||
// These 2 arrays are intended to contain the likelihoods/posterior probabilities for each alternate allele over each possible frequency (from 0 to 2N).
|
||||
// For any given alternate allele and frequency, the likelihoods are marginalized over values for all other alternate alleles. What this means is that
|
||||
// the likelihoods at cell index zero (AF=0) in the array is actually that of the site's being polymorphic (because although this alternate allele may
|
||||
// be at AF=0, it is marginalized over all other alternate alleles which are not necessarily at AF=0).
|
||||
// In the bi-allelic case (where there are no other alternate alleles over which to marginalize),
|
||||
// the value at cell index zero will be equal to AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED.
|
||||
final double[][] log10AlleleFrequencyLikelihoods;
|
||||
final double[][] log10AlleleFrequencyPosteriors;
|
||||
|
||||
// These 2 variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
|
||||
double log10LikelihoodOfAFzero = 0.0;
|
||||
double log10PosteriorOfAFzero = 0.0;
|
||||
|
||||
public AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) {
|
||||
log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1];
|
||||
log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1];
|
||||
}
|
||||
|
||||
public double getLog10LikelihoodOfAFzero() {
|
||||
return log10LikelihoodOfAFzero;
|
||||
}
|
||||
|
||||
public double getLog10PosteriorOfAFzero() {
|
||||
return log10PosteriorOfAFzero;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,94 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
public class BiallelicGenotypeLikelihoods {
|
||||
|
||||
private String sample;
|
||||
private double[] GLs;
|
||||
private Allele A, B;
|
||||
private int depth;
|
||||
|
||||
/**
|
||||
* Create a new object for sample with given alleles and genotype likelihoods
|
||||
*
|
||||
* @param sample sample name
|
||||
* @param A allele A
|
||||
* @param B allele B
|
||||
* @param log10AALikelihoods AA likelihoods
|
||||
* @param log10ABLikelihoods AB likelihoods
|
||||
* @param log10BBLikelihoods BB likelihoods
|
||||
* @param depth the read depth used in creating the likelihoods
|
||||
*/
|
||||
public BiallelicGenotypeLikelihoods(String sample,
|
||||
Allele A,
|
||||
Allele B,
|
||||
double log10AALikelihoods,
|
||||
double log10ABLikelihoods,
|
||||
double log10BBLikelihoods,
|
||||
int depth) {
|
||||
this.sample = sample;
|
||||
this.A = A;
|
||||
this.B = B;
|
||||
this.GLs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods};
|
||||
this.depth = depth;
|
||||
}
|
||||
|
||||
public String getSample() {
|
||||
return sample;
|
||||
}
|
||||
|
||||
public double getAALikelihoods() {
|
||||
return GLs[0];
|
||||
}
|
||||
|
||||
public double getABLikelihoods() {
|
||||
return GLs[1];
|
||||
}
|
||||
|
||||
public double getBBLikelihoods() {
|
||||
return GLs[2];
|
||||
}
|
||||
|
||||
public double[] getLikelihoods() {
|
||||
return GLs;
|
||||
}
|
||||
|
||||
public Allele getAlleleA() {
|
||||
return A;
|
||||
}
|
||||
|
||||
public Allele getAlleleB() {
|
||||
return B;
|
||||
}
|
||||
|
||||
public int getDepth() {
|
||||
return depth;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -27,13 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: Aug 4, 2009
|
||||
* Time: 6:46:09 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public enum DiploidGenotype {
|
||||
AA ('A', 'A'),
|
||||
AC ('A', 'C'),
|
||||
|
|
@ -110,6 +103,20 @@ public enum DiploidGenotype {
|
|||
return conversionMatrix[index1][index2];
|
||||
}
|
||||
|
||||
/**
|
||||
* create a diploid genotype, given 2 base indexes which may not necessarily be ordered correctly
|
||||
* @param baseIndex1 base1
|
||||
* @param baseIndex2 base2
|
||||
* @return the diploid genotype
|
||||
*/
|
||||
public static DiploidGenotype createDiploidGenotype(int baseIndex1, int baseIndex2) {
|
||||
if ( baseIndex1 == -1 )
|
||||
throw new IllegalArgumentException(baseIndex1 + " does not represent a valid base character");
|
||||
if ( baseIndex2 == -1 )
|
||||
throw new IllegalArgumentException(baseIndex2 + " does not represent a valid base character");
|
||||
return conversionMatrix[baseIndex1][baseIndex2];
|
||||
}
|
||||
|
||||
private static final DiploidGenotype[][] conversionMatrix = {
|
||||
{ DiploidGenotype.AA, DiploidGenotype.AC, DiploidGenotype.AG, DiploidGenotype.AT },
|
||||
{ DiploidGenotype.AC, DiploidGenotype.CC, DiploidGenotype.CG, DiploidGenotype.CT },
|
||||
|
|
|
|||
|
|
@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
import net.sf.samtools.SAMUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -275,19 +274,20 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
|||
|
||||
public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
|
||||
byte obsBase = elt.getBase();
|
||||
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
|
||||
|
||||
if ( elt.isReducedRead() ) {
|
||||
// reduced read representation
|
||||
byte qual = elt.getQual();
|
||||
if ( BaseUtils.isRegularBase( elt.getBase() )) {
|
||||
if ( BaseUtils.isRegularBase( obsBase )) {
|
||||
add(obsBase, qual, (byte)0, (byte)0, elt.getRepresentativeCount()); // fast calculation of n identical likelihoods
|
||||
return elt.getRepresentativeCount(); // we added nObs bases here
|
||||
} else // odd bases or deletions => don't use them
|
||||
return 0;
|
||||
} else {
|
||||
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
|
||||
return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;
|
||||
}
|
||||
|
||||
// odd bases or deletions => don't use them
|
||||
return 0;
|
||||
}
|
||||
|
||||
return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0;
|
||||
}
|
||||
|
||||
public int add(List<PileupElement> overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
|
||||
|
|
@ -511,20 +511,19 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
|||
* @return
|
||||
*/
|
||||
private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
|
||||
if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) ) {
|
||||
if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) )
|
||||
return 0;
|
||||
} else {
|
||||
byte qual = p.getQual();
|
||||
|
||||
if ( qual > SAMUtils.MAX_PHRED_SCORE )
|
||||
throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
|
||||
if ( capBaseQualsAtMappingQual )
|
||||
qual = (byte)Math.min((int)p.getQual(), p.getMappingQual());
|
||||
if ( (int)qual < minBaseQual )
|
||||
qual = (byte)0;
|
||||
byte qual = p.getQual();
|
||||
|
||||
return qual;
|
||||
}
|
||||
if ( qual > SAMUtils.MAX_PHRED_SCORE )
|
||||
throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
|
||||
if ( capBaseQualsAtMappingQual )
|
||||
qual = (byte)Math.min((int)p.getQual(), p.getMappingQual());
|
||||
if ( (int)qual < minBaseQual )
|
||||
qual = (byte)0;
|
||||
|
||||
return qual;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -26,84 +26,43 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||
//
|
||||
// code for testing purposes
|
||||
//
|
||||
|
||||
private final static boolean DEBUG = false;
|
||||
|
||||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
private final boolean SIMPLE_GREEDY_GENOTYPER = false;
|
||||
private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
|
||||
|
||||
|
||||
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(UAC, N, logger, verboseWriter);
|
||||
}
|
||||
|
||||
public void getLog10PNonRef(Map<String, Genotype> GLs, List<Allele> alleles,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors) {
|
||||
public void getLog10PNonRef(final GenotypesContext GLs,
|
||||
final List<Allele> alleles,
|
||||
final double[][] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
final int numAlleles = alleles.size();
|
||||
final double[][] posteriorCache = numAlleles > 2 ? new double[numAlleles-1][] : null;
|
||||
final double[] bestAFguess = numAlleles > 2 ? new double[numAlleles-1] : null;
|
||||
|
||||
int idxDiag = numAlleles;
|
||||
int incr = numAlleles - 1;
|
||||
for (int k=1; k < numAlleles; k++) {
|
||||
// multi-allelic approximation, part 1: Ideally
|
||||
// for each alt allele compute marginal (suboptimal) posteriors -
|
||||
// compute indices for AA,AB,BB for current allele - genotype likelihoods are a linear vector that can be thought of
|
||||
// as a row-wise upper triangular matrix of likelihoods.
|
||||
// So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC.
|
||||
// 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD
|
||||
|
||||
final int idxAA = 0;
|
||||
final int idxAB = k;
|
||||
// yy is always element on the diagonal.
|
||||
// 2 alleles: BBelement 2
|
||||
// 3 alleles: BB element 3. CC element 5
|
||||
// 4 alleles:
|
||||
final int idxBB = idxDiag;
|
||||
idxDiag += incr--;
|
||||
|
||||
final int lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
|
||||
|
||||
if (numAlleles > 2) {
|
||||
posteriorCache[k-1] = log10AlleleFrequencyPosteriors.clone();
|
||||
bestAFguess[k-1] = (double)MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors);
|
||||
}
|
||||
}
|
||||
|
||||
if (numAlleles > 2) {
|
||||
// multiallelic approximation, part 2:
|
||||
// report posteriors for allele that has highest estimated AC
|
||||
int mostLikelyAlleleIdx = MathUtils.maxElementIndex(bestAFguess);
|
||||
for (int k=0; k < log10AlleleFrequencyPosteriors.length-1; k++)
|
||||
log10AlleleFrequencyPosteriors[k] = (posteriorCache[mostLikelyAlleleIdx][k]);
|
||||
|
||||
}
|
||||
//linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
|
||||
linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, result, false);
|
||||
}
|
||||
|
||||
private static final ArrayList<double[]> getGLs(Map<String, Genotype> GLs) {
|
||||
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>();
|
||||
private static final ArrayList<double[]> getGLs(GenotypesContext GLs) {
|
||||
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>(GLs.size());
|
||||
|
||||
genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
|
||||
for ( Genotype sample : GLs.values() ) {
|
||||
for ( Genotype sample : GLs.iterateInSampleNameOrder() ) {
|
||||
if ( sample.hasLikelihoods() ) {
|
||||
double[] gls = sample.getLikelihoods().getAsVector();
|
||||
|
||||
if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL)
|
||||
if ( MathUtils.sum(gls) < UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL )
|
||||
genotypeLikelihoods.add(gls);
|
||||
}
|
||||
}
|
||||
|
|
@ -112,9 +71,397 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
}
|
||||
|
||||
|
||||
final static double approximateLog10SumLog10(double[] vals) {
|
||||
if ( vals.length < 2 )
|
||||
throw new ReviewedStingException("Passing array with fewer than 2 values when computing approximateLog10SumLog10");
|
||||
|
||||
double approx = approximateLog10SumLog10(vals[0], vals[1]);
|
||||
for ( int i = 2; i < vals.length; i++ )
|
||||
approx = approximateLog10SumLog10(approx, vals[i]);
|
||||
return approx;
|
||||
}
|
||||
|
||||
final static double approximateLog10SumLog10(double small, double big) {
|
||||
// make sure small is really the smaller value
|
||||
if ( small > big ) {
|
||||
final double t = big;
|
||||
big = small;
|
||||
small = t;
|
||||
}
|
||||
|
||||
if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY )
|
||||
return big;
|
||||
|
||||
if (big >= small + MathUtils.MAX_JACOBIAN_TOLERANCE)
|
||||
return big;
|
||||
|
||||
// OK, so |y-x| < tol: we use the following identity then:
|
||||
// we need to compute log10(10^x + 10^y)
|
||||
// By Jacobian logarithm identity, this is equal to
|
||||
// max(x,y) + log10(1+10^-abs(x-y))
|
||||
// we compute the second term as a table lookup
|
||||
// with integer quantization
|
||||
// we have pre-stored correction for 0,0.1,0.2,... 10.0
|
||||
//final int ind = (int)(((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding
|
||||
int ind = (int)(Math.round((big-small)/MathUtils.JACOBIAN_LOG_TABLE_STEP)); // hard rounding
|
||||
|
||||
//double z =Math.log10(1+Math.pow(10.0,-diff));
|
||||
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
|
||||
return big + MathUtils.jacobianLogTable[ind];
|
||||
}
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Linearized, ~O(N), implementation.
|
||||
// Multi-allelic implementation.
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
private static final int HOM_REF_INDEX = 0; // AA likelihoods are always first
|
||||
|
||||
// a wrapper around the int array so that we can make it hashable
|
||||
private static final class ExactACcounts {
|
||||
|
||||
private final int[] counts;
|
||||
private int hashcode = -1;
|
||||
|
||||
public ExactACcounts(final int[] counts) {
|
||||
this.counts = counts;
|
||||
}
|
||||
|
||||
public int[] getCounts() {
|
||||
return counts;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
return (obj instanceof ExactACcounts) ? Arrays.equals(counts, ((ExactACcounts)obj).counts) : false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
if ( hashcode == -1 )
|
||||
hashcode = Arrays.hashCode(counts);
|
||||
return hashcode;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
sb.append(counts[0]);
|
||||
for ( int i = 1; i < counts.length; i++ ) {
|
||||
sb.append("/");
|
||||
sb.append(counts[i]);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
// This class represents a column in the Exact AC calculation matrix
|
||||
private static final class ExactACset {
|
||||
|
||||
// the counts of the various alternate alleles which this column represents
|
||||
final ExactACcounts ACcounts;
|
||||
|
||||
// the column of the matrix
|
||||
final double[] log10Likelihoods;
|
||||
|
||||
// mapping of column index for those columns upon which this one depends to the index into the PLs which is used as the transition to this column;
|
||||
// for example, in the biallelic case, the transition from k=0 to k=1 would be AB while the transition to k=2 would be BB.
|
||||
final HashMap<ExactACcounts, Integer> ACsetIndexToPLIndex = new HashMap<ExactACcounts, Integer>();
|
||||
|
||||
// to minimize memory consumption, we know we can delete any sets in this list because no further sets will depend on them
|
||||
final ArrayList<ExactACcounts> dependentACsetsToDelete = new ArrayList<ExactACcounts>();
|
||||
|
||||
|
||||
public ExactACset(final int size, final ExactACcounts ACcounts) {
|
||||
this.ACcounts = ACcounts;
|
||||
log10Likelihoods = new double[size];
|
||||
}
|
||||
|
||||
// sum of all the non-reference alleles
|
||||
public int getACsum() {
|
||||
int sum = 0;
|
||||
for ( int count : ACcounts.getCounts() )
|
||||
sum += count;
|
||||
return sum;
|
||||
}
|
||||
|
||||
public boolean equals(Object obj) {
|
||||
return (obj instanceof ExactACset) ? ACcounts.equals(((ExactACset)obj).ACcounts) : false;
|
||||
}
|
||||
}
|
||||
|
||||
public static void linearExactMultiAllelic(final GenotypesContext GLs,
|
||||
final int numAlternateAlleles,
|
||||
final double[][] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result,
|
||||
final boolean preserveData) {
|
||||
|
||||
// make sure the PL cache has been initialized
|
||||
if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null )
|
||||
UnifiedGenotyperEngine.calculatePLcache(5);
|
||||
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
||||
final int numSamples = genotypeLikelihoods.size()-1;
|
||||
final int numChr = 2*numSamples;
|
||||
|
||||
// queue of AC conformations to process
|
||||
final Queue<ExactACset> ACqueue = new LinkedList<ExactACset>();
|
||||
|
||||
// mapping of ExactACset indexes to the objects
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(numChr+1);
|
||||
|
||||
// add AC=0 to the queue
|
||||
int[] zeroCounts = new int[numAlternateAlleles];
|
||||
ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts));
|
||||
ACqueue.add(zeroSet);
|
||||
indexesToACset.put(zeroSet.ACcounts, zeroSet);
|
||||
|
||||
// keep processing while we have AC conformations that need to be calculated
|
||||
double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
while ( !ACqueue.isEmpty() ) {
|
||||
// compute log10Likelihoods
|
||||
final ExactACset set = ACqueue.remove();
|
||||
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result);
|
||||
|
||||
// adjust max likelihood seen if needed
|
||||
maxLog10L = Math.max(maxLog10L, log10LofKs);
|
||||
}
|
||||
}
|
||||
|
||||
private static double calculateAlleleCountConformation(final ExactACset set,
|
||||
final ArrayList<double[]> genotypeLikelihoods,
|
||||
final double maxLog10L,
|
||||
final int numChr,
|
||||
final boolean preserveData,
|
||||
final Queue<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||
final double[][] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
if ( DEBUG )
|
||||
System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
|
||||
|
||||
// compute the log10Likelihoods
|
||||
computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, result);
|
||||
|
||||
// clean up memory
|
||||
if ( !preserveData ) {
|
||||
for ( ExactACcounts index : set.dependentACsetsToDelete ) {
|
||||
indexesToACset.put(index, null);
|
||||
if ( DEBUG )
|
||||
System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts);
|
||||
}
|
||||
}
|
||||
|
||||
final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
|
||||
|
||||
// can we abort early because the log10Likelihoods are so small?
|
||||
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||
if ( DEBUG )
|
||||
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
|
||||
|
||||
// no reason to keep this data around because nothing depends on it
|
||||
if ( !preserveData )
|
||||
indexesToACset.put(set.ACcounts, null);
|
||||
|
||||
return log10LofK;
|
||||
}
|
||||
|
||||
// iterate over higher frequencies if possible
|
||||
final int ACwiggle = numChr - set.getACsum();
|
||||
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
|
||||
return log10LofK;
|
||||
|
||||
ExactACset lastSet = null; // keep track of the last set placed in the queue so that we can tell it to clean us up when done processing
|
||||
final int numAltAlleles = set.ACcounts.getCounts().length;
|
||||
|
||||
// genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods.
|
||||
// so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD.
|
||||
|
||||
// add conformations for the k+1 case
|
||||
int PLindex = 0;
|
||||
for ( int allele = 0; allele < numAltAlleles; allele++ ) {
|
||||
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
||||
ACcountsClone[allele]++;
|
||||
lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset);
|
||||
}
|
||||
|
||||
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
|
||||
if ( ACwiggle > 1 ) {
|
||||
for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
|
||||
for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
|
||||
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
||||
ACcountsClone[allele_i]++;
|
||||
ACcountsClone[allele_j]++;
|
||||
lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex , ACqueue, indexesToACset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if the last dependent set was not at the back of the queue (i.e. not just added), then we need to iterate
|
||||
// over all the dependent sets to find the last one in the queue (otherwise it will be cleaned up too early)
|
||||
if ( !preserveData && lastSet == null ) {
|
||||
if ( DEBUG )
|
||||
System.out.printf(" *** iterating over dependent sets for set=%s%n", set.ACcounts);
|
||||
lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue);
|
||||
}
|
||||
if ( lastSet != null )
|
||||
lastSet.dependentACsetsToDelete.add(set.ACcounts);
|
||||
|
||||
return log10LofK;
|
||||
}
|
||||
|
||||
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
|
||||
// also adds it as a dependency to the given callingSetIndex.
|
||||
// returns the ExactACset if that set was not already in the queue and null otherwise.
|
||||
private static ExactACset updateACset(final int[] ACcounts,
|
||||
final int numChr,
|
||||
final ExactACset callingSet,
|
||||
final int PLsetIndex,
|
||||
final Queue<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
|
||||
final ExactACcounts index = new ExactACcounts(ACcounts);
|
||||
boolean wasInQueue = true;
|
||||
if ( !indexesToACset.containsKey(index) ) {
|
||||
ExactACset set = new ExactACset(numChr/2 +1, index);
|
||||
indexesToACset.put(index, set);
|
||||
ACqueue.add(set);
|
||||
wasInQueue = false;
|
||||
}
|
||||
|
||||
// add the given dependency to the set
|
||||
final ExactACset set = indexesToACset.get(index);
|
||||
set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex);
|
||||
return wasInQueue ? null : set;
|
||||
}
|
||||
|
||||
private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final Queue<ExactACset> ACqueue) {
|
||||
ExactACset set = null;
|
||||
for ( ExactACset queued : ACqueue ) {
|
||||
if ( queued.dependentACsetsToDelete.contains(callingSetIndex) )
|
||||
set = queued;
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
private static void computeLofK(final ExactACset set,
|
||||
final ArrayList<double[]> genotypeLikelihoods,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||
final double[][] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
set.log10Likelihoods[0] = 0.0; // the zero case
|
||||
final int totalK = set.getACsum();
|
||||
|
||||
// special case for k = 0 over all k
|
||||
if ( totalK == 0 ) {
|
||||
for ( int j = 1; j < set.log10Likelihoods.length; j++ )
|
||||
set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX];
|
||||
}
|
||||
// k > 0 for at least one k
|
||||
else {
|
||||
// all possible likelihoods for a given cell from which to choose the max
|
||||
final int numPaths = set.ACsetIndexToPLIndex.size() + 1;
|
||||
final double[] log10ConformationLikelihoods = new double[numPaths]; // TODO can be created just once, since you initialize it
|
||||
|
||||
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||
|
||||
// initialize
|
||||
for ( int i = 0; i < numPaths; i++ )
|
||||
// TODO -- Arrays.fill?
|
||||
// todo -- is this even necessary? Why not have as else below?
|
||||
log10ConformationLikelihoods[i] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
// deal with the AA case first
|
||||
if ( totalK < 2*j-1 )
|
||||
log10ConformationLikelihoods[0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
|
||||
|
||||
// deal with the other possible conformations now
|
||||
if ( totalK <= 2*j ) { // skip impossible conformations
|
||||
int conformationIndex = 1;
|
||||
for ( Map.Entry<ExactACcounts, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() ) {
|
||||
if ( DEBUG )
|
||||
System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey());
|
||||
log10ConformationLikelihoods[conformationIndex++] =
|
||||
determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + indexesToACset.get(mapping.getKey()).log10Likelihoods[j-1] + gl[mapping.getValue()];
|
||||
}
|
||||
}
|
||||
|
||||
final double log10Max = approximateLog10SumLog10(log10ConformationLikelihoods);
|
||||
|
||||
// finally, update the L(j,k) value
|
||||
set.log10Likelihoods[j] = log10Max - logDenominator;
|
||||
}
|
||||
}
|
||||
|
||||
final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
|
||||
|
||||
// determine the power of theta to use
|
||||
int nonRefAlleles = 0;
|
||||
for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) {
|
||||
if ( set.ACcounts.getCounts()[i] > 0 )
|
||||
nonRefAlleles++;
|
||||
}
|
||||
|
||||
// for k=0, we don't want to put that value into the likelihoods/posteriors matrix, but instead want to set the value in the results object
|
||||
if ( nonRefAlleles == 0 ) {
|
||||
result.log10LikelihoodOfAFzero = log10LofK;
|
||||
result.log10PosteriorOfAFzero = log10LofK + log10AlleleFrequencyPriors[0][0];
|
||||
} else {
|
||||
// update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs
|
||||
for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) {
|
||||
int AC = set.ACcounts.getCounts()[i];
|
||||
result.log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK);
|
||||
|
||||
final double prior = log10AlleleFrequencyPriors[nonRefAlleles-1][AC];
|
||||
result.log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
|
||||
|
||||
// the closed form representation generalized for multiple alleles is as follows:
|
||||
// AA: (2j - totalK) * (2j - totalK - 1)
|
||||
// AB: 2k_b * (2j - totalK)
|
||||
// AC: 2k_c * (2j - totalK)
|
||||
// BB: k_b * (k_b - 1)
|
||||
// BC: 2 * k_b * k_c
|
||||
// CC: k_c * (k_c - 1)
|
||||
|
||||
final int numAltAlleles = ACcounts.length;
|
||||
|
||||
// the AX het case
|
||||
if ( PLindex <= numAltAlleles )
|
||||
return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK];
|
||||
|
||||
// find the 2 alternate alleles that are represented by this PL index
|
||||
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numAltAlleles][PLindex];
|
||||
|
||||
final int k_i = ACcounts[alleles[0]-1]; // subtract one because ACcounts doesn't consider the reference allele
|
||||
|
||||
// the hom var case (e.g. BB, CC, DD)
|
||||
final double coeff;
|
||||
if ( alleles[0] == alleles[1] ) {
|
||||
coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
|
||||
}
|
||||
// the het non-ref case (e.g. BC, BD, CD)
|
||||
else {
|
||||
final int k_j = ACcounts[alleles[1]-1];
|
||||
coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
|
||||
}
|
||||
|
||||
return coeff;
|
||||
}
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Deprecated bi-allelic ~O(N) implementation. Kept here for posterity.
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
|
|
@ -122,6 +469,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
* A simple data structure that holds the current, prev, and prev->prev likelihoods vectors
|
||||
* for the exact model calculation
|
||||
*/
|
||||
/*
|
||||
private final static class ExactACCache {
|
||||
double[] kMinus2, kMinus1, kMinus0;
|
||||
|
||||
|
|
@ -155,9 +503,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
}
|
||||
}
|
||||
|
||||
public int linearExact(Map<String, Genotype> GLs,
|
||||
public int linearExact(GenotypesContext GLs,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
|
||||
double[][] log10AlleleFrequencyLikelihoods,
|
||||
double[][] log10AlleleFrequencyPosteriors) {
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
||||
final int numSamples = genotypeLikelihoods.size()-1;
|
||||
final int numChr = 2*numSamples;
|
||||
|
|
@ -174,7 +523,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
if ( k == 0 ) { // special case for k = 0
|
||||
for ( int j=1; j <= numSamples; j++ ) {
|
||||
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[idxAA];
|
||||
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0];
|
||||
}
|
||||
} else { // k > 0
|
||||
final double[] kMinus1 = logY.getkMinus1();
|
||||
|
|
@ -187,14 +536,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
double aa = Double.NEGATIVE_INFINITY;
|
||||
double ab = Double.NEGATIVE_INFINITY;
|
||||
if (k < 2*j-1)
|
||||
aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[idxAA];
|
||||
aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0];
|
||||
|
||||
if (k < 2*j)
|
||||
ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[idxAB];
|
||||
ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1];
|
||||
|
||||
double log10Max;
|
||||
if (k > 1) {
|
||||
final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[idxBB];
|
||||
final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2];
|
||||
log10Max = approximateLog10SumLog10(aa, ab, bb);
|
||||
} else {
|
||||
// we know we aren't considering the BB case, so we can use an optimized log10 function
|
||||
|
|
@ -208,7 +557,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
// update the posteriors vector
|
||||
final double log10LofK = kMinus0[numSamples];
|
||||
log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k];
|
||||
log10AlleleFrequencyLikelihoods[0][k] = log10LofK;
|
||||
log10AlleleFrequencyPosteriors[0][k] = log10LofK + log10AlleleFrequencyPriors[k];
|
||||
|
||||
// can we abort early?
|
||||
lastK = k;
|
||||
|
|
@ -225,229 +575,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
}
|
||||
|
||||
final static double approximateLog10SumLog10(double a, double b, double c) {
|
||||
//return softMax(new double[]{a, b, c});
|
||||
return approximateLog10SumLog10(approximateLog10SumLog10(a, b), c);
|
||||
}
|
||||
*/
|
||||
|
||||
final static double approximateLog10SumLog10(double small, double big) {
|
||||
// make sure small is really the smaller value
|
||||
if ( small > big ) {
|
||||
final double t = big;
|
||||
big = small;
|
||||
small = t;
|
||||
}
|
||||
|
||||
if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY )
|
||||
return big;
|
||||
|
||||
if (big >= small + MathUtils.MAX_JACOBIAN_TOLERANCE)
|
||||
return big;
|
||||
|
||||
// OK, so |y-x| < tol: we use the following identity then:
|
||||
// we need to compute log10(10^x + 10^y)
|
||||
// By Jacobian logarithm identity, this is equal to
|
||||
// max(x,y) + log10(1+10^-abs(x-y))
|
||||
// we compute the second term as a table lookup
|
||||
// with integer quantization
|
||||
// we have pre-stored correction for 0,0.1,0.2,... 10.0
|
||||
//final int ind = (int)(((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding
|
||||
int ind = (int)(Math.round((big-small)/MathUtils.JACOBIAN_LOG_TABLE_STEP)); // hard rounding
|
||||
|
||||
//double z =Math.log10(1+Math.pow(10.0,-diff));
|
||||
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
|
||||
return big + MathUtils.jacobianLogTable[ind];
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Can be overridden by concrete subclasses
|
||||
* @param vc variant context with genotype likelihoods
|
||||
* @param log10AlleleFrequencyPosteriors allele frequency results
|
||||
* @param AFofMaxLikelihood allele frequency of max likelihood
|
||||
*
|
||||
* @return calls
|
||||
*/
|
||||
public Map<String, Genotype> assignGenotypes(VariantContext vc,
|
||||
double[] log10AlleleFrequencyPosteriors,
|
||||
int AFofMaxLikelihood) {
|
||||
if ( !vc.isVariant() )
|
||||
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
|
||||
|
||||
|
||||
Map<String, Genotype> GLs = vc.getGenotypes();
|
||||
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
|
||||
int[][] tracebackArray = new int[GLs.size()+1][AFofMaxLikelihood+1];
|
||||
|
||||
ArrayList<String> sampleIndices = new ArrayList<String>();
|
||||
int sampleIdx = 0;
|
||||
|
||||
// todo - optimize initialization
|
||||
for (int k=0; k <= AFofMaxLikelihood; k++)
|
||||
for (int j=0; j <= GLs.size(); j++)
|
||||
pathMetricArray[j][k] = -1e30;
|
||||
|
||||
pathMetricArray[0][0] = 0.0;
|
||||
|
||||
// todo = can't deal with optimal dynamic programming solution with multiallelic records
|
||||
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
|
||||
sampleIndices.addAll(GLs.keySet());
|
||||
sampleIdx = GLs.size();
|
||||
}
|
||||
else {
|
||||
|
||||
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
|
||||
if ( !sample.getValue().hasLikelihoods() )
|
||||
continue;
|
||||
|
||||
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
|
||||
|
||||
if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) {
|
||||
//System.out.print(sample.getKey()+":");
|
||||
//for (int k=0; k < likelihoods.length; k++)
|
||||
// System.out.format("%4.2f ",likelihoods[k]);
|
||||
//System.out.println();
|
||||
// all likelihoods are essentially the same: skip this sample and will later on force no call.
|
||||
//sampleIdx++;
|
||||
continue;
|
||||
}
|
||||
|
||||
sampleIndices.add(sample.getKey());
|
||||
|
||||
for (int k=0; k <= AFofMaxLikelihood; k++) {
|
||||
|
||||
double bestMetric = pathMetricArray[sampleIdx][k] + likelihoods[0];
|
||||
int bestIndex = k;
|
||||
|
||||
if (k>0) {
|
||||
double m2 = pathMetricArray[sampleIdx][k-1] + likelihoods[1];
|
||||
if (m2 > bestMetric) {
|
||||
bestMetric = m2;
|
||||
bestIndex = k-1;
|
||||
}
|
||||
}
|
||||
|
||||
if (k>1) {
|
||||
double m2 = pathMetricArray[sampleIdx][k-2] + likelihoods[2];
|
||||
if (m2 > bestMetric) {
|
||||
bestMetric = m2;
|
||||
bestIndex = k-2;
|
||||
}
|
||||
}
|
||||
|
||||
pathMetricArray[sampleIdx+1][k] = bestMetric;
|
||||
tracebackArray[sampleIdx+1][k] = bestIndex;
|
||||
}
|
||||
sampleIdx++;
|
||||
}
|
||||
}
|
||||
|
||||
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
|
||||
|
||||
int startIdx = AFofMaxLikelihood;
|
||||
for (int k = sampleIdx; k > 0; k--) {
|
||||
int bestGTguess;
|
||||
String sample = sampleIndices.get(k-1);
|
||||
Genotype g = GLs.get(sample);
|
||||
if ( !g.hasLikelihoods() )
|
||||
continue;
|
||||
// if all likelihoods are essentially the same: we want to force no-call. In this case, we skip this sample for now,
|
||||
// and will add no-call genotype to GL's in a second pass
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||
|
||||
double qual = Double.NEGATIVE_INFINITY;
|
||||
double[] likelihoods = g.getLikelihoods().getAsVector();
|
||||
|
||||
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
|
||||
bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
|
||||
}
|
||||
else {
|
||||
int newIdx = tracebackArray[k][startIdx];;
|
||||
bestGTguess = startIdx - newIdx;
|
||||
startIdx = newIdx;
|
||||
}
|
||||
|
||||
/* System.out.format("Sample: %s GL:",sample);
|
||||
for (int i=0; i < likelihoods.length; i++)
|
||||
System.out.format("%1.4f, ",likelihoods[i]);
|
||||
*/
|
||||
|
||||
for (int i=0; i < likelihoods.length; i++) {
|
||||
if (i==bestGTguess)
|
||||
continue;
|
||||
if (likelihoods[i] >= qual)
|
||||
qual = likelihoods[i];
|
||||
}
|
||||
// qual contains now max(likelihoods[k]) for all k != bestGTguess
|
||||
qual = likelihoods[bestGTguess] - qual;
|
||||
|
||||
// likelihoods are stored row-wise in lower triangular matrix. IE
|
||||
// for 2 alleles they have ordering AA,AB,BB
|
||||
// for 3 alleles they are ordered AA,AB,BB,AC,BC,CC
|
||||
// Get now alleles corresponding to best index
|
||||
int kk=0;
|
||||
boolean done = false;
|
||||
for (int j=0; j < vc.getNAlleles(); j++) {
|
||||
for (int i=0; i <= j; i++){
|
||||
if (kk++ == bestGTguess) {
|
||||
if (i==0)
|
||||
myAlleles.add(vc.getReference());
|
||||
else
|
||||
myAlleles.add(vc.getAlternateAllele(i-1));
|
||||
|
||||
if (j==0)
|
||||
myAlleles.add(vc.getReference());
|
||||
else
|
||||
myAlleles.add(vc.getAlternateAllele(j-1));
|
||||
done = true;
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
if (done)
|
||||
break;
|
||||
}
|
||||
|
||||
if (qual < 0) {
|
||||
// QUAL can be negative if the chosen genotype is not the most likely one individually.
|
||||
// In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen on
|
||||
double[] normalized = MathUtils.normalizeFromLog10(likelihoods);
|
||||
double chosenGenotype = normalized[bestGTguess];
|
||||
qual = -1.0 * Math.log10(1.0 - chosenGenotype);
|
||||
}
|
||||
//System.out.println(myAlleles.toString());
|
||||
calls.put(sample, new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false));
|
||||
|
||||
}
|
||||
|
||||
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
|
||||
|
||||
if ( !sample.getValue().hasLikelihoods() )
|
||||
continue;
|
||||
Genotype g = GLs.get(sample.getKey());
|
||||
|
||||
double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
|
||||
|
||||
if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL)
|
||||
continue; // regular likelihoods
|
||||
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||
|
||||
double qual = Genotype.NO_NEG_LOG_10PERROR;
|
||||
myAlleles.add(Allele.NO_CALL);
|
||||
myAlleles.add(Allele.NO_CALL);
|
||||
//System.out.println(myAlleles.toString());
|
||||
calls.put(sample.getKey(), new Genotype(sample.getKey(), myAlleles, qual, null, g.getAttributes(), false));
|
||||
}
|
||||
return calls;
|
||||
}
|
||||
|
||||
private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) {
|
||||
int j = logYMatrix.length - 1;
|
||||
System.out.printf("-----------------------------------%n");
|
||||
for (int k=0; k <= numChr; k++) {
|
||||
double posterior = logYMatrix[j][k] + log10AlleleFrequencyPriors[k];
|
||||
System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,7 +26,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -81,25 +80,23 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
|
|||
* @param contexts stratified alignment contexts
|
||||
* @param contextType stratified context type
|
||||
* @param priors priors to use for GLs
|
||||
* @param GLs hash of sample->GL to fill in
|
||||
* @param alternateAlleleToUse the alternate allele to use, null if not set
|
||||
*
|
||||
* @param useBAQedPileup
|
||||
* @return genotype likelihoods per sample for AA, AB, BB
|
||||
* @param useBAQedPileup should we use the BAQed pileup or the raw one?
|
||||
* @return variant context where genotypes are no-called but with GLs
|
||||
*/
|
||||
public abstract Allele getLikelihoods(RefMetaDataTracker tracker,
|
||||
ReferenceContext ref,
|
||||
Map<String, AlignmentContext> contexts,
|
||||
AlignmentContextUtils.ReadOrientation contextType,
|
||||
GenotypePriors priors,
|
||||
Map<String, MultiallelicGenotypeLikelihoods> GLs,
|
||||
Allele alternateAlleleToUse, boolean useBAQedPileup);
|
||||
public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker,
|
||||
ReferenceContext ref,
|
||||
Map<String, AlignmentContext> contexts,
|
||||
AlignmentContextUtils.ReadOrientation contextType,
|
||||
GenotypePriors priors,
|
||||
Allele alternateAlleleToUse,
|
||||
boolean useBAQedPileup);
|
||||
|
||||
protected int getFilteredDepth(ReadBackedPileup pileup) {
|
||||
int count = 0;
|
||||
for ( PileupElement p : pileup ) {
|
||||
if ( BaseUtils.isRegularBase( p.getBase() ) )
|
||||
count++;
|
||||
count += p.getRepresentativeCount();
|
||||
}
|
||||
|
||||
return count;
|
||||
|
|
|
|||
|
|
@ -1,271 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel {
|
||||
|
||||
// for use in optimizing the P(D|AF) calculations:
|
||||
// how much off from the max likelihoods do we need to be before we can quit calculating?
|
||||
protected static final double LOG10_OPTIMIZATION_EPSILON = 8.0;
|
||||
|
||||
private AlleleFrequencyMatrix AFMatrix;
|
||||
|
||||
protected GridSearchAFEstimation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(UAC, N, logger, verboseWriter);
|
||||
AFMatrix = new AlleleFrequencyMatrix(N);
|
||||
}
|
||||
|
||||
protected void getLog10PNonRef(Map<String, Genotype> GLs, List<Allele> alleles,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors) {
|
||||
initializeAFMatrix(GLs);
|
||||
|
||||
// first, calculate for AF=0 (no change to matrix)
|
||||
log10AlleleFrequencyPosteriors[0] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[0];
|
||||
double maxLikelihoodSeen = log10AlleleFrequencyPosteriors[0];
|
||||
|
||||
int maxAlleleFrequencyToTest = AFMatrix.getSamples().size() * 2;
|
||||
|
||||
// for each minor allele frequency, calculate log10PofDgivenAFi
|
||||
for (int i = 1; i <= maxAlleleFrequencyToTest; i++) {
|
||||
// add one more alternate allele
|
||||
AFMatrix.incrementFrequency();
|
||||
|
||||
// calculate new likelihoods
|
||||
log10AlleleFrequencyPosteriors[i] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[i];
|
||||
|
||||
// an optimization to speed up the calculation: if we are beyond the local maximum such
|
||||
// that subsequent likelihoods won't factor into the confidence score, just quit
|
||||
if ( maxLikelihoodSeen - log10AlleleFrequencyPosteriors[i] > LOG10_OPTIMIZATION_EPSILON )
|
||||
return;
|
||||
|
||||
if ( log10AlleleFrequencyPosteriors[i] > maxLikelihoodSeen )
|
||||
maxLikelihoodSeen = log10AlleleFrequencyPosteriors[i];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Overrides the super class
|
||||
* @param vc variant context with genotype likelihoods
|
||||
* @param log10AlleleFrequencyPosteriors allele frequency results
|
||||
* @param AFofMaxLikelihood allele frequency of max likelihood
|
||||
*
|
||||
* @return calls
|
||||
*/
|
||||
protected Map<String, Genotype> assignGenotypes(VariantContext vc,
|
||||
double[] log10AlleleFrequencyPosteriors,
|
||||
int AFofMaxLikelihood) {
|
||||
if ( !vc.isVariant() )
|
||||
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
|
||||
|
||||
Allele refAllele = vc.getReference();
|
||||
Allele altAllele = vc.getAlternateAllele(0);
|
||||
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
|
||||
|
||||
// first, the potential alt calls
|
||||
for ( String sample : AFMatrix.getSamples() ) {
|
||||
Genotype g = vc.getGenotype(sample);
|
||||
|
||||
// set the genotype and confidence
|
||||
Pair<Integer, Double> AFbasedGenotype = AFMatrix.getGenotype(AFofMaxLikelihood, sample);
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||
if ( AFbasedGenotype.first == GenotypeType.AA.ordinal() ) {
|
||||
myAlleles.add(refAllele);
|
||||
myAlleles.add(refAllele);
|
||||
} else if ( AFbasedGenotype.first == GenotypeType.AB.ordinal() ) {
|
||||
myAlleles.add(refAllele);
|
||||
myAlleles.add(altAllele);
|
||||
} else { // ( AFbasedGenotype.first == GenotypeType.BB.ordinal() )
|
||||
myAlleles.add(altAllele);
|
||||
myAlleles.add(altAllele);
|
||||
}
|
||||
|
||||
calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, g.getAttributes(), false));
|
||||
}
|
||||
|
||||
return calls;
|
||||
}
|
||||
|
||||
private void initializeAFMatrix(Map<String, Genotype> GLs) {
|
||||
AFMatrix.clear();
|
||||
|
||||
for ( Genotype g : GLs.values() ) {
|
||||
if ( g.hasLikelihoods() )
|
||||
AFMatrix.setLikelihoods(g.getLikelihoods().getAsVector(), g.getSampleName());
|
||||
}
|
||||
}
|
||||
|
||||
protected static class AlleleFrequencyMatrix {
|
||||
|
||||
private double[][] matrix; // allele frequency matrix
|
||||
private int[] indexes; // matrix to maintain which genotype is active
|
||||
private int maxN; // total possible frequencies in data
|
||||
private int frequency; // current frequency
|
||||
|
||||
// data structures necessary to maintain a list of the best genotypes and their scores
|
||||
private ArrayList<String> samples = new ArrayList<String>();
|
||||
private HashMap<Integer, HashMap<String, Pair<Integer, Double>>> samplesToGenotypesPerAF = new HashMap<Integer, HashMap<String, Pair<Integer, Double>>>();
|
||||
|
||||
public AlleleFrequencyMatrix(int N) {
|
||||
maxN = N;
|
||||
matrix = new double[N][3];
|
||||
indexes = new int[N];
|
||||
clear();
|
||||
}
|
||||
|
||||
public List<String> getSamples() { return samples; }
|
||||
|
||||
public void clear() {
|
||||
frequency = 0;
|
||||
for (int i = 0; i < maxN; i++)
|
||||
indexes[i] = 0;
|
||||
samples.clear();
|
||||
samplesToGenotypesPerAF.clear();
|
||||
}
|
||||
|
||||
public void setLikelihoods(double[] GLs, String sample) {
|
||||
int index = samples.size();
|
||||
samples.add(sample);
|
||||
matrix[index][GenotypeType.AA.ordinal()] = GLs[0];
|
||||
matrix[index][GenotypeType.AB.ordinal()] = GLs[1];
|
||||
matrix[index][GenotypeType.BB.ordinal()] = GLs[2];
|
||||
}
|
||||
|
||||
public void incrementFrequency() {
|
||||
int N = samples.size();
|
||||
if ( frequency == 2 * N )
|
||||
throw new ReviewedStingException("Frequency was incremented past N; how is this possible?");
|
||||
frequency++;
|
||||
|
||||
double greedy = VALUE_NOT_CALCULATED;
|
||||
int greedyIndex = -1;
|
||||
for (int i = 0; i < N; i++) {
|
||||
|
||||
if ( indexes[i] == GenotypeType.AB.ordinal() ) {
|
||||
if ( matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()] > greedy ) {
|
||||
greedy = matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()];
|
||||
greedyIndex = i;
|
||||
}
|
||||
}
|
||||
else if ( indexes[i] == GenotypeType.AA.ordinal() ) {
|
||||
if ( matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()] > greedy ) {
|
||||
greedy = matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()];
|
||||
greedyIndex = i;
|
||||
}
|
||||
// note that we currently don't bother with breaking ties between samples
|
||||
// (which would be done by looking at the HOM_VAR value) because it's highly
|
||||
// unlikely that a collision will both occur and that the difference will
|
||||
// be significant at HOM_VAR...
|
||||
}
|
||||
// if this person is already hom var, he can't add another alternate allele
|
||||
// so we can ignore that case
|
||||
}
|
||||
if ( greedyIndex == -1 )
|
||||
throw new ReviewedStingException("There is no best choice for a new alternate allele; how is this possible?");
|
||||
|
||||
if ( indexes[greedyIndex] == GenotypeType.AB.ordinal() )
|
||||
indexes[greedyIndex] = GenotypeType.BB.ordinal();
|
||||
else
|
||||
indexes[greedyIndex] = GenotypeType.AB.ordinal();
|
||||
}
|
||||
|
||||
public double getLikelihoodsOfFrequency() {
|
||||
double likelihoods = 0.0;
|
||||
int N = samples.size();
|
||||
for (int i = 0; i < N; i++)
|
||||
likelihoods += matrix[i][indexes[i]];
|
||||
|
||||
/*
|
||||
System.out.println(frequency);
|
||||
for (int i = 0; i < N; i++) {
|
||||
System.out.print(samples.get(i));
|
||||
for (int j=0; j < 3; j++) {
|
||||
System.out.print(String.valueOf(matrix[i][j]));
|
||||
System.out.print(indexes[i] == j ? "* " : " ");
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
System.out.println(likelihoods);
|
||||
System.out.println();
|
||||
*/
|
||||
|
||||
recordGenotypes();
|
||||
|
||||
return likelihoods;
|
||||
}
|
||||
|
||||
public Pair<Integer, Double> getGenotype(int frequency, String sample) {
|
||||
return samplesToGenotypesPerAF.get(frequency).get(sample);
|
||||
}
|
||||
|
||||
private void recordGenotypes() {
|
||||
HashMap<String, Pair<Integer, Double>> samplesToGenotypes = new HashMap<String, Pair<Integer, Double>>();
|
||||
|
||||
int index = 0;
|
||||
for ( String sample : samples ) {
|
||||
int genotype = indexes[index];
|
||||
|
||||
double score;
|
||||
|
||||
int maxEntry = MathUtils.maxElementIndex(matrix[index]);
|
||||
// if the max value is for the most likely genotype, we can compute next vs. next best
|
||||
if ( genotype == maxEntry ) {
|
||||
if ( genotype == GenotypeType.AA.ordinal() )
|
||||
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AB.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
|
||||
else if ( genotype == GenotypeType.AB.ordinal() )
|
||||
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
|
||||
else // ( genotype == GenotypeType.HOM.ordinal() )
|
||||
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.AB.ordinal()]);
|
||||
}
|
||||
// otherwise, we need to calculate the probability of the genotype
|
||||
else {
|
||||
double[] normalized = MathUtils.normalizeFromLog10(matrix[index]);
|
||||
double chosenGenotype = normalized[genotype];
|
||||
score = -1.0 * Math.log10(1.0 - chosenGenotype);
|
||||
}
|
||||
|
||||
samplesToGenotypes.put(sample, new Pair<Integer, Double>(genotype, Math.abs(score)));
|
||||
index++;
|
||||
}
|
||||
|
||||
samplesToGenotypesPerAF.put(frequency, samplesToGenotypes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -34,6 +34,8 @@ import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
|||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
|
|
@ -41,8 +43,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
|||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -125,7 +126,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
|
||||
for ( ExtendedEventPileupElement p : indelPileup.toExtendedIterable() ) {
|
||||
//SAMRecord read = p.getRead();
|
||||
GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead());
|
||||
GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
|
||||
if (read == null)
|
||||
continue;
|
||||
if(ReadUtils.is454Read(read)) {
|
||||
|
|
@ -243,7 +244,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
// get deletion length
|
||||
int dLen = Integer.valueOf(bestAltAllele.substring(1));
|
||||
// get ref bases of accurate deletion
|
||||
int startIdxInReference = (int)(1+loc.getStart()-ref.getWindow().getStart());
|
||||
int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart();
|
||||
|
||||
//System.out.println(new String(ref.getBases()));
|
||||
byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen);
|
||||
|
|
@ -270,19 +271,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
|
||||
private final static EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED);
|
||||
|
||||
public Allele getLikelihoods(RefMetaDataTracker tracker,
|
||||
ReferenceContext ref,
|
||||
Map<String, AlignmentContext> contexts,
|
||||
AlignmentContextUtils.ReadOrientation contextType,
|
||||
GenotypePriors priors,
|
||||
Map<String, MultiallelicGenotypeLikelihoods> GLs,
|
||||
Allele alternateAlleleToUse,
|
||||
boolean useBAQedPileup) {
|
||||
public VariantContext getLikelihoods(RefMetaDataTracker tracker,
|
||||
ReferenceContext ref,
|
||||
Map<String, AlignmentContext> contexts,
|
||||
AlignmentContextUtils.ReadOrientation contextType,
|
||||
GenotypePriors priors,
|
||||
Allele alternateAlleleToUse,
|
||||
boolean useBAQedPileup) {
|
||||
|
||||
if ( tracker == null )
|
||||
return null;
|
||||
|
||||
|
||||
GenomeLoc loc = ref.getLocus();
|
||||
Allele refAllele, altAllele;
|
||||
VariantContext vc = null;
|
||||
|
|
@ -368,10 +367,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(),
|
||||
ref, hsize, numPrefBases);
|
||||
|
||||
// start making the VariantContext
|
||||
final int endLoc = calculateEndPos(alleleList, refAllele, loc);
|
||||
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleleList).referenceBaseForIndel(ref.getBase());
|
||||
|
||||
// create the genotypes; no-call everyone for now
|
||||
GenotypesContext genotypes = GenotypesContext.create();
|
||||
final List<Allele> noCall = new ArrayList<Allele>();
|
||||
noCall.add(Allele.NO_CALL);
|
||||
|
||||
// For each sample, get genotype likelihoods based on pileup
|
||||
// compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them.
|
||||
// initialize the GenotypeLikelihoods
|
||||
GLs.clear();
|
||||
|
||||
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
|
||||
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
|
||||
|
|
@ -384,11 +390,12 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
|
||||
if (pileup != null ) {
|
||||
final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
|
||||
GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods);
|
||||
|
||||
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
|
||||
alleleList,
|
||||
genotypeLikelihoods,
|
||||
getFilteredDepth(pileup)));
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup));
|
||||
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
|
||||
genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.format("Sample:%s Alleles:%s GL:",sample.getKey(), alleleList.toString());
|
||||
|
|
@ -399,9 +406,25 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
}
|
||||
}
|
||||
|
||||
return refAllele;
|
||||
return builder.genotypes(genotypes).make();
|
||||
}
|
||||
|
||||
private int calculateEndPos(Collection<Allele> alleles, Allele refAllele, GenomeLoc loc) {
|
||||
// for indels, stop location is one more than ref allele length
|
||||
boolean hasNullAltAllele = false;
|
||||
for ( Allele a : alleles ) {
|
||||
if ( a.isNull() ) {
|
||||
hasNullAltAllele = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
int endLoc = loc.getStart() + refAllele.length();
|
||||
if( !hasNullAltAllele )
|
||||
endLoc--;
|
||||
|
||||
return endLoc;
|
||||
}
|
||||
|
||||
public static HashMap<PileupElement,LinkedHashMap<Allele,Double>> getIndelLikelihoodMap() {
|
||||
return indelLikelihoodMap.get();
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue