Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
b02ef95bcf
|
|
@ -613,7 +613,7 @@ public class GenomeAnalysisEngine {
|
|||
*/
|
||||
protected GenomeLocSortedSet loadIntervals( List<IntervalBinding<Feature>> argList, IntervalSetRule rule ) {
|
||||
|
||||
List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>(0);
|
||||
List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>();
|
||||
for ( IntervalBinding intervalBinding : argList ) {
|
||||
List<GenomeLoc> intervals = intervalBinding.getIntervals(this);
|
||||
|
||||
|
|
|
|||
|
|
@ -40,17 +40,26 @@ public class BadCigarFilter extends ReadFilter {
|
|||
|
||||
public boolean filterOut(final SAMRecord rec) {
|
||||
Cigar c = rec.getCigar();
|
||||
boolean lastElementWasIndel = false;
|
||||
for ( CigarElement ce : c.getCigarElements() ) {
|
||||
if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) {
|
||||
if ( lastElementWasIndel )
|
||||
return true;
|
||||
lastElementWasIndel = true;
|
||||
} else {
|
||||
lastElementWasIndel = false;
|
||||
boolean previousElementWasIndel = false;
|
||||
CigarOperator lastOp = c.getCigarElement(0).getOperator();
|
||||
|
||||
if (lastOp == CigarOperator.D) // filter out reads starting with deletion
|
||||
return true;
|
||||
|
||||
for (CigarElement ce : c.getCigarElements()) {
|
||||
CigarOperator op = ce.getOperator();
|
||||
if (op == CigarOperator.D || op == CigarOperator.I) {
|
||||
if (previousElementWasIndel)
|
||||
return true; // filter out reads with adjacent I/D
|
||||
|
||||
previousElementWasIndel = true;
|
||||
}
|
||||
else // this is a regular base (match/mismatch/hard or soft clip)
|
||||
previousElementWasIndel = false; // reset the previous element
|
||||
|
||||
lastOp = op;
|
||||
}
|
||||
|
||||
return false;
|
||||
return lastOp == CigarOperator.D;
|
||||
}
|
||||
}
|
||||
|
|
@ -179,6 +179,11 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement );
|
||||
}
|
||||
|
||||
public CigarElement peekBackwardOnGenome() {
|
||||
return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement );
|
||||
}
|
||||
|
||||
|
||||
public CigarOperator stepForwardOnGenome() {
|
||||
// we enter this method with readOffset = index of the last processed base on the read
|
||||
// (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion
|
||||
|
|
@ -194,7 +199,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
return stepForwardOnGenome();
|
||||
} else {
|
||||
if (curElement != null && curElement.getOperator() == CigarOperator.D)
|
||||
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString());
|
||||
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads ending in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar");
|
||||
|
||||
// Reads that contain indels model the genomeOffset as the following base in the reference. Because
|
||||
// we fall into this else block only when indels end the read, increment genomeOffset such that the
|
||||
|
|
@ -231,7 +236,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
// we see insertions only once, when we step right onto them; the position on the read is scrolled
|
||||
// past the insertion right after that
|
||||
if (eventDelayedFlag > 1)
|
||||
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
|
||||
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
|
||||
insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength());
|
||||
eventLength = curElement.getLength();
|
||||
eventStart = readOffset;
|
||||
|
|
@ -244,13 +249,13 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
break;
|
||||
case D: // deletion w.r.t. the reference
|
||||
if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
|
||||
throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString());
|
||||
throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads starting in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar");
|
||||
if (generateExtendedEvents) {
|
||||
if (cigarElementCounter == 1) {
|
||||
// generate an extended event only if we just stepped into the deletion (i.e. don't
|
||||
// generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!)
|
||||
if (eventDelayedFlag > 1)
|
||||
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
|
||||
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
|
||||
eventLength = curElement.getLength();
|
||||
eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only
|
||||
eventStart = readOffset;
|
||||
|
|
@ -401,24 +406,24 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
|
||||
while (iterator.hasNext()) {
|
||||
final SAMRecordState state = iterator.next();
|
||||
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
|
||||
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
|
||||
final int readOffset = state.getReadOffset(); // the base offset on this read
|
||||
final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began.
|
||||
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
|
||||
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
|
||||
final int readOffset = state.getReadOffset(); // the base offset on this read
|
||||
final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began.
|
||||
final int eventLength = state.getEventLength();
|
||||
|
||||
if (op == CigarOperator.N) // N's are never added to any pileup
|
||||
if (op == CigarOperator.N) // N's are never added to any pileup
|
||||
continue;
|
||||
|
||||
if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref
|
||||
if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref
|
||||
size++;
|
||||
ExtendedEventPileupElement pileupElement;
|
||||
if (state.getEventBases() == null) { // Deletion event
|
||||
if (state.getEventBases() == null) { // Deletion event
|
||||
nDeletions++;
|
||||
maxDeletionLength = Math.max(maxDeletionLength, state.getEventLength());
|
||||
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength);
|
||||
}
|
||||
else { // Insertion event
|
||||
else { // Insertion event
|
||||
nInsertions++;
|
||||
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength, state.getEventBases());
|
||||
}
|
||||
|
|
@ -442,10 +447,10 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
if (indelPile.size() != 0)
|
||||
fullExtendedEventPileup.put(sample, new ReadBackedExtendedEventPileupImpl(loc, indelPile, size, maxDeletionLength, nInsertions, nDeletions, nMQ0Reads));
|
||||
}
|
||||
hasExtendedEvents = false; // we are done with extended events prior to current ref base
|
||||
hasExtendedEvents = false; // we are done with extended events prior to current ref base
|
||||
nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled);
|
||||
}
|
||||
else { // this is a regular event pileup (not extended)
|
||||
else { // this is a regular event pileup (not extended)
|
||||
GenomeLoc location = getLocation();
|
||||
Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
|
||||
boolean hasBeenSampled = false;
|
||||
|
|
@ -454,27 +459,34 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
|
||||
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
|
||||
|
||||
size = 0; // number of elements in this sample's pileup
|
||||
nDeletions = 0; // number of deletions in this sample's pileup
|
||||
nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
|
||||
size = 0; // number of elements in this sample's pileup
|
||||
nDeletions = 0; // number of deletions in this sample's pileup
|
||||
nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
|
||||
|
||||
while (iterator.hasNext()) {
|
||||
final SAMRecordState state = iterator.next(); // state object with the read/offset information
|
||||
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
|
||||
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
|
||||
final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
|
||||
final CigarOperator nextOp = nextElement.getOperator();
|
||||
final int readOffset = state.getReadOffset(); // the base offset on this read
|
||||
|
||||
final SAMRecordState state = iterator.next(); // state object with the read/offset information
|
||||
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
|
||||
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
|
||||
final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
|
||||
final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element
|
||||
final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator
|
||||
final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator
|
||||
final int readOffset = state.getReadOffset(); // the base offset on this read
|
||||
|
||||
final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION;
|
||||
final boolean isAfterDeletion = lastOp == CigarOperator.DELETION;
|
||||
final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION;
|
||||
final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION;
|
||||
final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart());
|
||||
|
||||
int nextElementLength = nextElement.getLength();
|
||||
|
||||
if (op == CigarOperator.N) // N's are never added to any pileup
|
||||
if (op == CigarOperator.N) // N's are never added to any pileup
|
||||
continue;
|
||||
|
||||
if (op == CigarOperator.D) {
|
||||
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
|
||||
pile.add(new PileupElement(read, readOffset, true, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()),
|
||||
null,nextOp == CigarOperator.D? nextElementLength:-1));
|
||||
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
|
||||
pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
|
||||
size++;
|
||||
nDeletions++;
|
||||
if (read.getMappingQuality() == 0)
|
||||
|
|
@ -484,11 +496,10 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
else {
|
||||
if (!filterBaseInRead(read, location.getStart())) {
|
||||
String insertedBaseString = null;
|
||||
if (nextOp == CigarOperator.I) {
|
||||
if (nextOp == CigarOperator.I)
|
||||
insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength()));
|
||||
}
|
||||
pile.add(new PileupElement(read, readOffset, false, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()),
|
||||
insertedBaseString,nextElementLength));
|
||||
|
||||
pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength));
|
||||
size++;
|
||||
if (read.getMappingQuality() == 0)
|
||||
nMQ0Reads++;
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import net.sf.samtools.SAMFormatException;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -23,7 +22,7 @@ public class MalformedBAMErrorReformatingIterator implements CloseableIterator<S
|
|||
public boolean hasNext() {
|
||||
try {
|
||||
return this.it.hasNext();
|
||||
} catch ( SAMFormatException e ) {
|
||||
} catch ( RuntimeException e ) { // we need to catch RuntimeExceptions here because the Picard code is throwing them (among SAMFormatExceptions) sometimes
|
||||
throw new UserException.MalformedBAM(source, e.getMessage());
|
||||
}
|
||||
}
|
||||
|
|
@ -31,7 +30,7 @@ public class MalformedBAMErrorReformatingIterator implements CloseableIterator<S
|
|||
public SAMRecord next() {
|
||||
try {
|
||||
return it.next();
|
||||
} catch ( SAMFormatException e ) {
|
||||
} catch ( RuntimeException e ) { // we need to catch RuntimeExceptions here because the Picard code is throwing them (among SAMFormatExceptions) sometimes
|
||||
throw new UserException.MalformedBAM(source, e.getMessage());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -418,7 +418,7 @@ public class RefMetaDataTracker {
|
|||
* with the current site as a RODRecordList List object. If no data track with specified name is available,
|
||||
* returns defaultValue wrapped as RODRecordList object. NOTE: if defaultValue is null, it will be wrapped up
|
||||
* with track name set to 'name' and location set to null; otherwise the wrapper object will have name and
|
||||
* location set to defaultValue.getName() and defaultValue.getLocation(), respectively (use caution,
|
||||
* location set to defaultValue.getID() and defaultValue.getLocation(), respectively (use caution,
|
||||
* defaultValue.getLocation() may be not equal to what RODRecordList's location would be expected to be otherwise:
|
||||
* for instance, on locus traversal, location is usually expected to be a single base we are currently looking at,
|
||||
* regardless of the presence of "extended" RODs overlapping with that location).
|
||||
|
|
|
|||
|
|
@ -132,7 +132,7 @@ public class FeatureManager {
|
|||
}
|
||||
|
||||
/**
|
||||
* Return the FeatureDescriptor with getName().equals(name)
|
||||
* Return the FeatureDescriptor with getID().equals(name)
|
||||
*
|
||||
* @param name
|
||||
* @return A FeatureDescriptor or null if none is found
|
||||
|
|
|
|||
|
|
@ -41,10 +41,10 @@ import java.util.TreeMap;
|
|||
public class GATKReport {
|
||||
public static final String GATKREPORT_HEADER_PREFIX = "#:GATKReport.";
|
||||
public static final GATKReportVersion LATEST_REPORT_VERSION = GATKReportVersion.V1_0;
|
||||
public static final String SEPARATOR = ":";
|
||||
private static final String SEPARATOR = ":";
|
||||
private GATKReportVersion version = LATEST_REPORT_VERSION;
|
||||
|
||||
private TreeMap<String, GATKReportTable> tables = new TreeMap<String, GATKReportTable>();
|
||||
private final TreeMap<String, GATKReportTable> tables = new TreeMap<String, GATKReportTable>();
|
||||
|
||||
/**
|
||||
* Create a new, empty GATKReport.
|
||||
|
|
@ -70,6 +70,15 @@ public class GATKReport {
|
|||
loadReport(file);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new GATK report from GATK report tables
|
||||
* @param tables Any number of tables that you want to add to the report
|
||||
*/
|
||||
public GATKReport(GATKReportTable... tables) {
|
||||
for( GATKReportTable table: tables)
|
||||
addTable(table);
|
||||
}
|
||||
|
||||
/**
|
||||
* Load a GATKReport file from disk
|
||||
*
|
||||
|
|
@ -202,10 +211,6 @@ public class GATKReport {
|
|||
return version;
|
||||
}
|
||||
|
||||
public void setVersion(GATKReportVersion version) {
|
||||
this.version = version;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether or not the two reports have the same format, from columns, to tables, to reports, and everything
|
||||
* in between. This does not check if the data inside is the same. This is the check to see if the two reports are
|
||||
|
|
|
|||
|
|
@ -199,7 +199,7 @@ public class GATKReportColumn extends TreeMap<Object, Object> {
|
|||
defaultValue.equals(that.defaultValue) );
|
||||
}
|
||||
|
||||
protected boolean equals(GATKReportColumn that) {
|
||||
boolean equals(GATKReportColumn that) {
|
||||
if ( !this.keySet().equals(that.keySet()) ) {
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
|
|
@ -29,8 +29,8 @@ package org.broadinstitute.sting.gatk.report;
|
|||
*/
|
||||
public class GATKReportColumnFormat {
|
||||
public static enum Alignment { LEFT, RIGHT }
|
||||
public int width;
|
||||
public Alignment alignment;
|
||||
private final int width;
|
||||
private final Alignment alignment;
|
||||
|
||||
public GATKReportColumnFormat(int width, Alignment alignment) {
|
||||
this.width = width;
|
||||
|
|
|
|||
|
|
@ -24,13 +24,15 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.report;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Tracks a linked list of GATKReportColumn in order by name.
|
||||
*/
|
||||
public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> implements Iterable<GATKReportColumn> {
|
||||
private List<String> columnNames = new ArrayList<String>();
|
||||
private final List<String> columnNames = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
* Returns the column by index
|
||||
|
|
@ -43,9 +45,12 @@ public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> i
|
|||
}
|
||||
|
||||
@Override
|
||||
public GATKReportColumn remove(Object key) {
|
||||
columnNames.remove(key);
|
||||
return super.remove(key);
|
||||
public GATKReportColumn remove(Object columnName) {
|
||||
if ( !(columnName instanceof String) ) {
|
||||
throw new ReviewedStingException("The column name must be a String!");
|
||||
}
|
||||
columnNames.remove(columnName.toString());
|
||||
return super.remove(columnName);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -85,7 +90,7 @@ public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> i
|
|||
return true;
|
||||
}
|
||||
|
||||
protected boolean equals(GATKReportColumns that) {
|
||||
boolean equals(GATKReportColumns that) {
|
||||
for (Map.Entry<String, GATKReportColumn> pair : entrySet()) {
|
||||
// Make sure that every column is the same, we know that the # of columns
|
||||
// is the same from isSameFormat()
|
||||
|
|
|
|||
|
|
@ -67,7 +67,7 @@ public enum GATKReportDataType {
|
|||
*/
|
||||
String("%[Ss]");
|
||||
|
||||
public final String dataTypeString;
|
||||
private final String dataTypeString;
|
||||
|
||||
private GATKReportDataType(String dataTypeString) {
|
||||
this.dataTypeString = dataTypeString;
|
||||
|
|
@ -189,7 +189,7 @@ public enum GATKReportDataType {
|
|||
* @param obj The input string
|
||||
* @return an object that matches the data type.
|
||||
*/
|
||||
protected Object Parse(Object obj) {
|
||||
Object Parse(Object obj) {
|
||||
if (obj instanceof String) {
|
||||
String str = obj.toString();
|
||||
switch (this) {
|
||||
|
|
|
|||
|
|
@ -1,3 +1,27 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.report;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Gatherer;
|
||||
|
|
@ -8,13 +32,6 @@ import java.io.FileNotFoundException;
|
|||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: roger
|
||||
* Date: 1/9/12
|
||||
* Time: 11:17 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class GATKReportGatherer extends Gatherer {
|
||||
@Override
|
||||
public void gather(List<File> inputs, File output) {
|
||||
|
|
|
|||
|
|
@ -34,97 +34,14 @@ import java.util.*;
|
|||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* A data structure that allows data to be collected over the course of a walker's computation, then have that data
|
||||
* written to a PrintStream such that it's human-readable, AWK-able, and R-friendly (given that you load it using the
|
||||
* GATKReport loader module).
|
||||
* <p/>
|
||||
* The goal of this object is to use the same data structure for both accumulating data during a walker's computation
|
||||
* and emitting that data to a file for easy analysis in R (or any other program/language that can take in a table of
|
||||
* results). Thus, all of the infrastructure below is designed simply to make printing the following as easy as
|
||||
* possible:
|
||||
* <p/>
|
||||
* ##:GATKReport.v0.1 ErrorRatePerCycle : The error rate per sequenced position in the reads
|
||||
* cycle errorrate.61PA8.7 qualavg.61PA8.7
|
||||
* 0 0.007451835696110506 25.474613284804366
|
||||
* 1 0.002362777171937477 29.844949954504095
|
||||
* 2 9.087604507451836E-4 32.87590975254731
|
||||
* 3 5.452562704471102E-4 34.498999090081895
|
||||
* 4 9.087604507451836E-4 35.14831665150137
|
||||
* 5 5.452562704471102E-4 36.07223435225619
|
||||
* 6 5.452562704471102E-4 36.1217248908297
|
||||
* 7 5.452562704471102E-4 36.1910480349345
|
||||
* 8 5.452562704471102E-4 36.00345705967977
|
||||
* <p/>
|
||||
* Here, we have a GATKReport table - a well-formatted, easy to read representation of some tabular data. Every single
|
||||
* table has this same GATKReport.v0.1 header, which permits multiple files from different sources to be cat-ed
|
||||
* together, which makes it very easy to pull tables from different programs into R via a single file.
|
||||
* <p/>
|
||||
* ------------
|
||||
* Definitions:
|
||||
* <p/>
|
||||
* Table info:
|
||||
* The first line, structured as
|
||||
* ##:<report version> <table name> : <table description>
|
||||
* <p/>
|
||||
* Table header:
|
||||
* The second line, specifying a unique name for each column in the table.
|
||||
* <p/>
|
||||
* The first column mentioned in the table header is the "primary key" column - a column that provides the unique
|
||||
* identifier for each row in the table. Once this column is created, any element in the table can be referenced by
|
||||
* the row-column coordinate, i.e. "primary key"-"column name" coordinate.
|
||||
* <p/>
|
||||
* When a column is added to a table, a default value must be specified (usually 0). This is the initial value for
|
||||
* an element in a column. This permits operations like increment() and decrement() to work properly on columns that
|
||||
* are effectively counters for a particular event.
|
||||
* <p/>
|
||||
* Finally, the display property for each column can be set during column creation. This is useful when a given
|
||||
* column stores an intermediate result that will be used later on, perhaps to calculate the value of another column.
|
||||
* In these cases, it's obviously necessary to store the value required for further computation, but it's not
|
||||
* necessary to actually print the intermediate column.
|
||||
* <p/>
|
||||
* Table body:
|
||||
* The values of the table itself.
|
||||
* <p/>
|
||||
* ---------------
|
||||
* Implementation:
|
||||
* <p/>
|
||||
* The implementation of this table has two components:
|
||||
* 1. A TreeSet<Object> that stores all the values ever specified for the primary key. Any get() operation that
|
||||
* refers to an element where the primary key object does not exist will result in its implicit creation. I
|
||||
* haven't yet decided if this is a good idea...
|
||||
* <p/>
|
||||
* 2. A HashMap<String, GATKReportColumn> that stores a mapping from column name to column contents. Each
|
||||
* GATKReportColumn is effectively a map (in fact, GATKReportColumn extends TreeMap<Object, Object>) between
|
||||
* primary key and the column value. This means that, given N columns, the primary key information is stored
|
||||
* N+1 times. This is obviously wasteful and can likely be handled much more elegantly in future implementations.
|
||||
* <p/>
|
||||
* ------------------------------
|
||||
* Element and column operations:
|
||||
* <p/>
|
||||
* In addition to simply getting and setting values, this object also permits some simple operations to be applied to
|
||||
* individual elements or to whole columns. For instance, an element can be easily incremented without the hassle of
|
||||
* calling get(), incrementing the obtained value by 1, and then calling set() with the new value. Also, some vector
|
||||
* operations are supported. For instance, two whole columns can be divided and have the result be set to a third
|
||||
* column. This is especially useful when aggregating counts in two intermediate columns that will eventually need to
|
||||
* be manipulated row-by-row to compute the final column.
|
||||
* <p/>
|
||||
* Note: I've made no attempt whatsoever to make these operations efficient. Right now, some of the methods check the
|
||||
* type of the stored object using an instanceof call and attempt to do the right thing. Others cast the contents of
|
||||
* the cell to a Number, call the Number.toDouble() method and compute a result. This is clearly not the ideal design,
|
||||
* but at least the prototype contained herein works.
|
||||
*
|
||||
* @author Kiran Garimella
|
||||
* @author Khalid Shakir
|
||||
*/
|
||||
public class GATKReportTable {
|
||||
/**
|
||||
* REGEX that matches any table with an invalid name
|
||||
*/
|
||||
public static final String INVALID_TABLE_NAME_REGEX = "[^a-zA-Z0-9_\\-\\.]";
|
||||
public static final String GATKTABLE_HEADER_PREFIX = "#:GATKTable";
|
||||
public static final String SEPARATOR = ":";
|
||||
public static final String ENDLINE = ":;";
|
||||
private static final String GATKTABLE_HEADER_PREFIX = "#:GATKTable";
|
||||
private static final String SEPARATOR = ":";
|
||||
private static final String ENDLINE = ":;";
|
||||
|
||||
private String tableName;
|
||||
private String tableDescription;
|
||||
|
|
@ -418,8 +335,8 @@ public class GATKReportTable {
|
|||
* output file), and the format string used to display the data.
|
||||
*
|
||||
* @param columnName the name of the column
|
||||
* @param defaultValue if true - the column will be displayed; if false - the column will be hidden
|
||||
* @param display
|
||||
* @param defaultValue the default value of a blank cell
|
||||
* @param display if true - the column will be displayed; if false - the column will be hidden
|
||||
* @param format the format string used to display data
|
||||
*/
|
||||
public void addColumn(String columnName, Object defaultValue, boolean display, String format) {
|
||||
|
|
@ -429,12 +346,6 @@ public class GATKReportTable {
|
|||
columns.put(columnName, new GATKReportColumn(columnName, defaultValue, display, format));
|
||||
}
|
||||
|
||||
|
||||
public GATKReportVersion getVersion() {
|
||||
return GATKReport.LATEST_REPORT_VERSION;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Check if the requested element exists, and if not, create it.
|
||||
*
|
||||
|
|
@ -508,8 +419,7 @@ public class GATKReportTable {
|
|||
value = newValue;
|
||||
|
||||
if (column.getDataType().equals(GATKReportDataType.fromObject(value)) ||
|
||||
column.getDataType().equals(GATKReportDataType.Unknown) ||
|
||||
value == null)
|
||||
column.getDataType().equals(GATKReportDataType.Unknown) )
|
||||
columns.get(columnName).put(primaryKey, value);
|
||||
else
|
||||
throw new ReviewedStingException(String.format("Tried to add an object of type: %s to a column of type: %s",
|
||||
|
|
@ -795,7 +705,7 @@ public class GATKReportTable {
|
|||
*
|
||||
* @return the width of the primary key column
|
||||
*/
|
||||
public int getPrimaryKeyColumnWidth() {
|
||||
int getPrimaryKeyColumnWidth() {
|
||||
int maxWidth = getPrimaryKeyName().length();
|
||||
|
||||
for (Object primaryKey : primaryKeyColumn) {
|
||||
|
|
@ -814,7 +724,7 @@ public class GATKReportTable {
|
|||
*
|
||||
* @param out the PrintStream to which the table should be written
|
||||
*/
|
||||
public void write(PrintStream out) {
|
||||
void write(PrintStream out) {
|
||||
|
||||
/*
|
||||
* Table header:
|
||||
|
|
@ -912,7 +822,7 @@ public class GATKReportTable {
|
|||
*
|
||||
* @param input Another GATK table
|
||||
*/
|
||||
protected void combineWith(GATKReportTable input) {
|
||||
void combineWith(GATKReportTable input) {
|
||||
/*
|
||||
* This function is different from addRowsFrom because we will add the ability to sum,average, etc rows
|
||||
* TODO: Add other combining algorithms
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
|
|
@ -50,7 +50,7 @@ public enum GATKReportVersion {
|
|||
*/
|
||||
V1_0("v1.0");
|
||||
|
||||
public final String versionString;
|
||||
private final String versionString;
|
||||
|
||||
private GATKReportVersion(String versionString) {
|
||||
this.versionString = versionString;
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.WalkerManager;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -10,6 +11,7 @@ import org.broadinstitute.sting.gatk.walkers.*;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
|
|
@ -42,38 +44,31 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));
|
||||
|
||||
final LocusView locusView = getLocusView( walker, dataProvider );
|
||||
final GenomeLocSortedSet initialIntervals = engine.getIntervals(); // BUGBUG: unfortunate inefficiency that needs to be removed
|
||||
final GenomeLocSortedSet initialIntervals = engine.getIntervals();
|
||||
|
||||
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
|
||||
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
|
||||
|
||||
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
|
||||
|
||||
int minStart = Integer.MAX_VALUE;
|
||||
final ArrayList<Double> isActiveList = new ArrayList<Double>();
|
||||
GenomeLoc firstIsActiveStart = null;
|
||||
ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
|
||||
|
||||
//ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
|
||||
ReferenceOrderedView referenceOrderedDataView = null;
|
||||
if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
|
||||
referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider );
|
||||
else
|
||||
referenceOrderedDataView = (RodLocusView)locusView;
|
||||
ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
|
||||
|
||||
// We keep processing while the next reference location is within the interval
|
||||
GenomeLoc prevLoc = null;
|
||||
while( locusView.hasNext() ) {
|
||||
final AlignmentContext locus = locusView.next();
|
||||
GenomeLoc location = locus.getLocation();
|
||||
|
||||
if(prevLoc != null) {
|
||||
for(int iii = prevLoc.getStart() + 1; iii < location.getStart(); iii++ ) {
|
||||
// fill in the active / inactive labels from the stop of the previous location to the start of this location
|
||||
// TODO refactor to separate function
|
||||
for(int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++ ) {
|
||||
final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii);
|
||||
if( initialIntervals == null || initialIntervals.overlaps( fakeLoc ) ) {
|
||||
final double isActiveProb = ( walker.presetActiveRegions == null ? 0.0 : ( walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 ) );
|
||||
isActiveList.add( isActiveProb );
|
||||
if( firstIsActiveStart == null ) {
|
||||
firstIsActiveStart = fakeLoc;
|
||||
}
|
||||
final double isActiveProb = ( walker.hasPresetActiveRegions() && walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 );
|
||||
profile.add(fakeLoc, isActiveProb);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -89,12 +84,8 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
|
||||
// Call the walkers isActive function for this locus and add them to the list to be integrated later
|
||||
if( initialIntervals == null || initialIntervals.overlaps( location ) ) {
|
||||
final double isActiveProb = ( walker.presetActiveRegions == null ? walker.isActive( tracker, refContext, locus )
|
||||
: ( walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0 ) );
|
||||
isActiveList.add( isActiveProb );
|
||||
if( firstIsActiveStart == null ) {
|
||||
firstIsActiveStart = location;
|
||||
}
|
||||
final double isActiveProb = walkerActiveProb(walker, tracker, refContext, locus, location);
|
||||
profile.add(location, isActiveProb);
|
||||
}
|
||||
|
||||
// Grab all the previously unseen reads from this pileup and add them to the massive read list
|
||||
|
|
@ -103,52 +94,100 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
if( !myReads.contains(read) ) {
|
||||
myReads.add(read);
|
||||
}
|
||||
|
||||
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
|
||||
// which active regions in the work queue are now safe to process
|
||||
minStart = Math.min(minStart, read.getAlignmentStart());
|
||||
}
|
||||
|
||||
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
|
||||
// which active regions in the work queue are now safe to process
|
||||
if( !locusView.hasNext() ) {
|
||||
for( final PileupElement p : locus.getBasePileup() ) {
|
||||
final GATKSAMRecord read = p.getRead();
|
||||
if( !myReads.contains(read) ) {
|
||||
myReads.add(read);
|
||||
}
|
||||
if( read.getAlignmentStart() < minStart ) { minStart = read.getAlignmentStart(); }
|
||||
}
|
||||
}
|
||||
prevLoc = location;
|
||||
|
||||
printProgress(dataProvider.getShard(), locus.getLocation());
|
||||
}
|
||||
|
||||
// Take the individual isActive calls and integrate them into contiguous active regions and
|
||||
// add these blocks of work to the work queue
|
||||
final ArrayList<ActiveRegion> activeRegions = integrateActiveList( isActiveList, firstIsActiveStart, activeRegionExtension, walker.presetActiveRegions != null );
|
||||
logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
||||
if( walker.activeRegionOutStream == null ) {
|
||||
workQueue.addAll( activeRegions );
|
||||
} else { // Just want to output the active regions to a file, not actually process them
|
||||
for( final ActiveRegion activeRegion : activeRegions ) {
|
||||
if( activeRegion.isActive ) {
|
||||
walker.activeRegionOutStream.println( activeRegion.getLocation() );
|
||||
}
|
||||
}
|
||||
}
|
||||
// band-pass filter the list of isActive probabilities and turn into active regions
|
||||
final ActivityProfile bandPassFiltered = profile.bandPassFilter();
|
||||
final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension );
|
||||
|
||||
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
|
||||
while( workQueue.peek() != null && (workQueue.peek().getExtendedLoc().getStop() < minStart || !workQueue.peek().getExtendedLoc().getContig().equals(dataProvider.getLocus().getContig())) ) {
|
||||
final ActiveRegion activeRegion = workQueue.remove();
|
||||
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
|
||||
}
|
||||
// add active regions to queue of regions to process
|
||||
workQueue.addAll( activeRegions );
|
||||
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
||||
|
||||
// now go and process all of the active regions
|
||||
sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
// Special function called in LinearMicroScheduler to empty out the work queue. Ugly for now but will be cleaned up when we push this functionality more into the engine
|
||||
public T endTraversal( final Walker<M,T> walker, T sum) {
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// simple utility functions
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private final double walkerActiveProb(final ActiveRegionWalker<M,T> walker,
|
||||
final RefMetaDataTracker tracker, final ReferenceContext refContext,
|
||||
final AlignmentContext locus, final GenomeLoc location) {
|
||||
if ( walker.hasPresetActiveRegions() ) {
|
||||
return walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0;
|
||||
} else {
|
||||
return walker.isActive( tracker, refContext, locus );
|
||||
}
|
||||
}
|
||||
|
||||
private ReferenceOrderedView getReferenceOrderedView( final ActiveRegionWalker<M,T> walker,
|
||||
final LocusShardDataProvider dataProvider,
|
||||
final LocusView locusView) {
|
||||
if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
|
||||
return new ManagingReferenceOrderedView( dataProvider );
|
||||
else
|
||||
return (RodLocusView)locusView;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// code to handle processing active regions
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private T processActiveRegions( final ActiveRegionWalker<M,T> walker, T sum, final int minStart, final String currentContig ) {
|
||||
if( walker.activeRegionOutStream != null ) {
|
||||
writeActiveRegionsToStream(walker);
|
||||
return sum;
|
||||
} else {
|
||||
return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Write out each active region to the walker activeRegionOutStream
|
||||
*
|
||||
* @param walker
|
||||
*/
|
||||
private void writeActiveRegionsToStream( final ActiveRegionWalker<M,T> walker ) {
|
||||
// Just want to output the active regions to a file, not actually process them
|
||||
for( final ActiveRegion activeRegion : workQueue ) {
|
||||
if( activeRegion.isActive ) {
|
||||
walker.activeRegionOutStream.println( activeRegion.getLocation() );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private T callWalkerMapOnActiveRegions( final ActiveRegionWalker<M,T> walker, T sum, final int minStart, final String currentContig ) {
|
||||
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
|
||||
// TODO can implement parallel traversal here
|
||||
while( workQueue.peek() != null ) {
|
||||
final ActiveRegion activeRegion = workQueue.remove();
|
||||
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, (ActiveRegionWalker<M,T>) walker );
|
||||
final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
|
||||
if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) {
|
||||
final ActiveRegion activeRegion = workQueue.remove();
|
||||
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
|
|
@ -193,6 +232,12 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
return walker.reduce( x, sum );
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// engine interaction code
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Gets the best view of loci for this walker given the available data.
|
||||
* @param walker walker to interrogate.
|
||||
|
|
@ -211,48 +256,11 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
|
||||
}
|
||||
|
||||
// band-pass filter the list of isActive probabilities and turn into active regions
|
||||
private ArrayList<ActiveRegion> integrateActiveList( final ArrayList<Double> activeList, final GenomeLoc firstIsActiveStart, final int activeRegionExtension, final boolean presetRegions ) {
|
||||
|
||||
final double ACTIVE_PROB_THRESHOLD = 0.2; // BUGBUG: needs to be set-able by the walker author
|
||||
final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
|
||||
if( activeList.size() == 0 ) {
|
||||
return returnList;
|
||||
} else if( activeList.size() == 1 ) {
|
||||
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart(), firstIsActiveStart.getStart()),
|
||||
activeList.get(0) > ACTIVE_PROB_THRESHOLD, engine.getGenomeLocParser(), activeRegionExtension ) );
|
||||
return returnList;
|
||||
} else {
|
||||
final Double[] activeProbArray = activeList.toArray(new Double[activeList.size()]);
|
||||
final double[] filteredProbArray = new double[activeProbArray.length];
|
||||
final int FILTER_SIZE = ( presetRegions ? 0 : 50 ); // BUGBUG: needs to be set-able by the walker author
|
||||
final int MAX_ACTIVE_REGION = ( presetRegions ? 16001 : 425 ); // BUGBUG: needs to be set-able by the walker author
|
||||
for( int iii = 0; iii < activeProbArray.length; iii++ ) {
|
||||
double maxVal = 0;
|
||||
for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(activeList.size(), iii+FILTER_SIZE+1); jjj++ ) {
|
||||
if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; }
|
||||
}
|
||||
filteredProbArray[iii] = maxVal;
|
||||
}
|
||||
|
||||
boolean curStatus = filteredProbArray[0] > ACTIVE_PROB_THRESHOLD;
|
||||
int curStart = 0;
|
||||
for(int iii = 1; iii < filteredProbArray.length; iii++ ) {
|
||||
final boolean thisStatus = filteredProbArray[iii] > ACTIVE_PROB_THRESHOLD;
|
||||
if( curStatus != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) {
|
||||
returnList.add( new ActiveRegion(
|
||||
engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (iii-1)),
|
||||
curStatus, engine.getGenomeLocParser(), activeRegionExtension ) );
|
||||
curStatus = thisStatus;
|
||||
curStart = iii;
|
||||
}
|
||||
}
|
||||
if( curStart != filteredProbArray.length-1 ) {
|
||||
returnList.add( new ActiveRegion(
|
||||
engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (filteredProbArray.length-1)),
|
||||
curStatus, engine.getGenomeLocParser(), activeRegionExtension ) );
|
||||
}
|
||||
return returnList;
|
||||
}
|
||||
/**
|
||||
* Special function called in LinearMicroScheduler to empty out the work queue.
|
||||
* Ugly for now but will be cleaned up when we push this functionality more into the engine
|
||||
*/
|
||||
public T endTraversal( final Walker<M,T> walker, T sum) {
|
||||
return processActiveRegions((ActiveRegionWalker<M,T>)walker, sum, Integer.MAX_VALUE, null);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -45,6 +45,10 @@ public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<Map
|
|||
|
||||
public GenomeLocSortedSet presetActiveRegions = null;
|
||||
|
||||
/**
 * Tells whether this walker has a preset collection of active regions.
 *
 * @return true if presetActiveRegions is non-null, false otherwise
 */
public boolean hasPresetActiveRegions() {
|
||||
return presetActiveRegions != null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
if( activeRegionBindings == null ) { return; }
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
|
|||
if (!vc.isBiallelic()) {
|
||||
// for non-biallelic case, do test with most common alt allele.
|
||||
// Get the corresponding indices in GL vectors to retrieve the GLs of AA, AB and BB.
|
||||
int[] idxVector = vc.getGLIndecesOfAllele(vc.getAltAlleleWithHighestAlleleCount());
|
||||
int[] idxVector = vc.getGLIndecesOfAlternateAllele(vc.getAltAlleleWithHighestAlleleCount());
|
||||
idxAA = idxVector[0];
|
||||
idxAB = idxVector[1];
|
||||
idxBB = idxVector[2];
|
||||
|
|
|
|||
|
|
@ -31,8 +31,10 @@ public class LowMQ extends InfoFieldAnnotation {
|
|||
double total = 0;
|
||||
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() )
|
||||
{
|
||||
ReadBackedPileup pileup = sample.getValue().getBasePileup();
|
||||
for (PileupElement p : pileup )
|
||||
if ( !sample.getValue().hasBasePileup() )
|
||||
continue;
|
||||
|
||||
for ( PileupElement p : sample.getValue().getBasePileup() )
|
||||
{
|
||||
if ( p.getMappingQual() == 0 ) { mq0 += 1; }
|
||||
if ( p.getMappingQual() <= 10 ) { mq10 += 1; }
|
||||
|
|
|
|||
|
|
@ -240,7 +240,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
for ( VCFHeaderLine line : VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(expression.binding.getName())) ) {
|
||||
if ( line instanceof VCFInfoHeaderLine ) {
|
||||
VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line;
|
||||
if ( infoline.getName().equals(expression.fieldName) ) {
|
||||
if ( infoline.getID().equals(expression.fieldName) ) {
|
||||
targetHeaderLine = infoline;
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,124 @@
|
|||
/*
|
||||
* Copyright (c) 2011 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Gatherer;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatumOptimized;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* User: carneiro
|
||||
* Date: 3/29/11
|
||||
*/
|
||||
|
||||
|
||||
public class BQSRGatherer extends Gatherer {
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private static final String EOF_MARKER = "EOF";
|
||||
|
||||
private HashMap<String, RecalDatumOptimized> dataMap = new HashMap<String, RecalDatumOptimized>();
|
||||
|
||||
|
||||
private void addCSVData (String line) {
|
||||
String[] covariates = line.split(",");
|
||||
String key = "";
|
||||
RecalDatumOptimized values;
|
||||
|
||||
for (int i = 0; i < covariates.length-3; i++)
|
||||
key += covariates[i] + ",";
|
||||
|
||||
if (covariates.length < 3)
|
||||
throw new ReviewedStingException("Line only has 1 covariate : " + line);
|
||||
|
||||
values = new RecalDatumOptimized(Long.parseLong(covariates[covariates.length - 3]), Long.parseLong(covariates[covariates.length - 2]));
|
||||
|
||||
RecalDatumOptimized currentValues = dataMap.get(key);
|
||||
if (currentValues == null)
|
||||
dataMap.put(key, values);
|
||||
else
|
||||
currentValues.increment(values);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void gather(List<File> inputs, File output) {
|
||||
PrintStream o;
|
||||
try {
|
||||
o = new PrintStream(output);
|
||||
} catch ( FileNotFoundException e) {
|
||||
throw new UserException("File to be output by CountCovariates Gather function was not found");
|
||||
}
|
||||
|
||||
boolean sawEOF = false;
|
||||
boolean printedHeader = false;
|
||||
|
||||
// Read input files
|
||||
for ( File RECAL_FILE : inputs) {
|
||||
try {
|
||||
for ( String line : new XReadLines(RECAL_FILE) ) {
|
||||
if ( EOF_MARKER.equals(line) ) {
|
||||
sawEOF = true; // sanity check
|
||||
break;
|
||||
}
|
||||
|
||||
else if(line.startsWith("#")) {
|
||||
if (!printedHeader)
|
||||
o.println(line);
|
||||
}
|
||||
|
||||
else // Found a line of data
|
||||
addCSVData(line); // Parse the line and add the data to the HashMap
|
||||
}
|
||||
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
|
||||
}
|
||||
|
||||
if ( !sawEOF ) {
|
||||
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted!";
|
||||
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
|
||||
}
|
||||
printedHeader = true;
|
||||
}
|
||||
|
||||
// Write output file from dataMap
|
||||
for(Map.Entry<String, RecalDatumOptimized> entry : dataMap.entrySet())
|
||||
o.println(entry.getKey() + entry.getValue().outputToCSV());
|
||||
o.println("EOF");
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,284 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.utils.BitSetUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.BitSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* This class provides all the functionality for the BitSet representation of the keys to the hash table of BQSR
|
||||
*
|
||||
* It also handles the event type "covariate" which is not exactly a covariate, but is added as a key to the hashmap. The Key Manager will
|
||||
* add the event type as a bitset to the end of the covariate bitset key. This way, it won't get in the way of masking the information
|
||||
* out of the key for the actual covariates, and having the covariates handle it. The key manager handles the event type.
|
||||
*
|
||||
* The keys represented by this key manager will always have the same order:
|
||||
*
|
||||
* RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariate1, OptionalCovariateID, EventType
|
||||
* RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariate2, OptionalCovariateID, EventType
|
||||
* ...
|
||||
* RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariateN, OptionalCovariateID, EventType
|
||||
*
|
||||
*
|
||||
* Note that Optional Covariates are optional, and the Key Manager should operate without them if necessary.
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 3/6/12
|
||||
*/
|
||||
public class BQSRKeyManager {
|
||||
private List<RequiredCovariateInfo> requiredCovariates;
|
||||
private List<OptionalCovariateInfo> optionalCovariates;
|
||||
|
||||
private int nRequiredBits; // Number of bits used to represent the required covariates
|
||||
private int nOptionalBits; // Number of bits used to represent the standard covaraites
|
||||
private int nOptionalIDBits; // Number of bits used to represent the optional covariates IDs
|
||||
private int totalNumberOfBits; // Sum of all of the above plus the event bits
|
||||
|
||||
private BitSet optionalCovariateMask; // Standard mask for optional covariates bitset
|
||||
private BitSet optionalCovariateIDMask; // Standard mask for optional covariates order bitset
|
||||
|
||||
/**
|
||||
* Initializes the KeyManager with the total number of covariates to use
|
||||
*
|
||||
* @param requiredCovariates the ordered list of required covariates
|
||||
* @param optionalCovariates the ordered list of optional covariates
|
||||
*/
|
||||
public BQSRKeyManager(List<Covariate> requiredCovariates, List<Covariate> optionalCovariates) {
|
||||
this.requiredCovariates = new ArrayList<RequiredCovariateInfo>(requiredCovariates.size()); // initialize the required covariates list
|
||||
this.optionalCovariates = new ArrayList<OptionalCovariateInfo>(optionalCovariates.size()); // initialize the optional covariates list (size may be 0, it's okay)
|
||||
|
||||
nRequiredBits = 0;
|
||||
for (Covariate required : requiredCovariates) { // create a list of required covariates with the extra information for key management
|
||||
int nBits = required.numberOfBits(); // number of bits used by this covariate
|
||||
BitSet mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate
|
||||
this.requiredCovariates.add(new RequiredCovariateInfo(nRequiredBits, nBits, mask, required)); // Create an object for this required covariate
|
||||
nRequiredBits += nBits;
|
||||
}
|
||||
|
||||
short i = 0;
|
||||
nOptionalBits = 0;
|
||||
for (Covariate optional : optionalCovariates) {
|
||||
int nBits = optional.numberOfBits(); // number of bits used by this covariate
|
||||
nOptionalBits = Math.max(nOptionalBits, nBits); // optional covariates are represented by the number of bits needed by biggest covariate
|
||||
BitSet optionalID = BitSetUtils.bitSetFrom(i); // calculate the optional covariate ID for this covariate
|
||||
this.optionalCovariates.add(new OptionalCovariateInfo(optionalID, optional)); // optional covariates have standardized mask and number of bits, so no need to store in the RequiredCovariateInfo object
|
||||
i++;
|
||||
}
|
||||
|
||||
nOptionalIDBits = BitSetUtils.numberOfBitsToRepresent(optionalCovariates.size()); // number of bits used to represent the covariate ID
|
||||
optionalCovariateMask = genericMask(nRequiredBits, nOptionalBits); // the generic mask to extract optional covariate bits from the combined bitset
|
||||
optionalCovariateIDMask = genericMask(nRequiredBits + nOptionalBits, nOptionalIDBits); // the generic mask to extract optional covariate ID bits from the combined bitset
|
||||
totalNumberOfBits = nRequiredBits + nOptionalBits + nOptionalIDBits + bitsInEventType(); // total number of bits used in the final key
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates one key per optional covariate.
|
||||
*
|
||||
* Keys include all required covariates, the standard covariate and the event type.
|
||||
*
|
||||
* Example allKeys:
|
||||
* RG, QUAL, CYCLE, CONTEXT
|
||||
*
|
||||
* List of BitSets returned by this example (given eventType):
|
||||
* RG, QUAL, CYCLE, EVENT
|
||||
* RG, QUAL, CONTEXT, EVENT
|
||||
*
|
||||
* Note: If there are no optional covariates, only one bitset key will be returned with all the required covariates and the event type
|
||||
*
|
||||
* @param allKeys The keys in bitset representation for each covariate
|
||||
* @param eventType The type of event described by this keyset (e.g. mismatches, insertions, deletions)
|
||||
* @return one key in bitset representation per covariate
|
||||
*/
|
||||
public List<BitSet> bitSetsFromAllKeys(BitSet[] allKeys, EventType eventType) {
|
||||
List<BitSet> allBitSets = new LinkedList<BitSet>(); // Generate one key per optional covariate
|
||||
|
||||
BitSet eventBitSet = BitSetUtils.bitSetFrom(eventType.index); // create a bitset with the event type
|
||||
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // Location in the bit set to add the event type bits
|
||||
|
||||
int covariateIndex = 0;
|
||||
BitSet requiredKey = new BitSet(nRequiredBits); // This will be a bitset holding all the required keys, to replicate later on
|
||||
for (RequiredCovariateInfo infoRequired : requiredCovariates)
|
||||
addBitSetToKeyAtLocation(requiredKey, allKeys[covariateIndex++], infoRequired.bitsBefore); // Add all the required covariates to the key set
|
||||
|
||||
for (OptionalCovariateInfo infoOptional : optionalCovariates) {
|
||||
BitSet covariateKey = allKeys[covariateIndex++]; // get the bitset from all keys
|
||||
if (covariateKey == null)
|
||||
continue; // do not add nulls to the final set of keys.
|
||||
|
||||
BitSet optionalKey = new BitSet(totalNumberOfBits); // create a new key for this optional covariate
|
||||
optionalKey.or(requiredKey); // import all the required covariates
|
||||
addBitSetToKeyAtLocation(optionalKey, covariateKey, nRequiredBits); // add the optional covariate right after the required covariates
|
||||
addBitSetToKeyAtLocation(optionalKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite
|
||||
addBitSetToKeyAtLocation(optionalKey, eventBitSet, eventTypeBitIndex); // Add the event type
|
||||
allBitSets.add(optionalKey); // add this key to the list of keys
|
||||
}
|
||||
|
||||
if (optionalCovariates.size() == 0) { // special case when we have no optional covariates, add the event type to the required key (our only key)
|
||||
addBitSetToKeyAtLocation(requiredKey, eventBitSet, eventTypeBitIndex); // Add the event type
|
||||
allBitSets.add(requiredKey); // add this key to the list of keys
|
||||
}
|
||||
|
||||
return allBitSets;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates one bitset key for the covariates represented in Object[] key
|
||||
*
|
||||
* The covariates will have the actual objects produced by the covariates (probably read from the recalibration data file)
|
||||
* and will contain all required covariates and one (or none) optional covariates. Therefore, the product is one bitset key, not many.
|
||||
*
|
||||
* Example key:
|
||||
* RG, QUAL, CYCLE, CYCLE_ID, EventType
|
||||
*
|
||||
* @param key list of objects produced by the required covariates followed by one or zero optional covariates.
|
||||
* @return a bitset key representing these objects. Bitset encryption is done using the covariate's interface.
|
||||
*/
|
||||
public BitSet bitSetFromKey(Object[] key) {
|
||||
BitSet bitSetKey = new BitSet(totalNumberOfBits);
|
||||
|
||||
int requiredCovariate = 0;
|
||||
for (RequiredCovariateInfo infoRequired : requiredCovariates) {
|
||||
BitSet covariateBitSet = infoRequired.covariate.bitSetFromKey(key[requiredCovariate++]); // create a bitset from the object key provided using the required covariate's interface
|
||||
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, infoRequired.bitsBefore); // add it to the bitset key
|
||||
}
|
||||
|
||||
if (optionalCovariates.size() > 0) {
|
||||
int optionalCovariate = requiredCovariates.size(); // the optional covariate index in the key array
|
||||
int covariateIDIndex = optionalCovariate + 1; // the optional covariate ID index is right after the optional covariate's
|
||||
int covariateID = (Short) key[covariateIDIndex]; // get the optional covariate id
|
||||
OptionalCovariateInfo infoOptional = optionalCovariates.get(covariateID); // so we can get the optional covariate information
|
||||
|
||||
BitSet covariateBitSet = infoOptional.covariate.bitSetFromKey(key[optionalCovariate]); // convert the optional covariate key into a bitset using the covariate's interface
|
||||
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, nRequiredBits); // add the optional covariate right after the required covariates
|
||||
addBitSetToKeyAtLocation(bitSetKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite
|
||||
}
|
||||
|
||||
int eventIndex = key.length - 1; // the event type is always the last key
|
||||
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // location in the bit set to add the event type bits
|
||||
BitSet eventBitSet = bitSetFromEvent((EventType) key[eventIndex]); // get the bit set representation of the event type
|
||||
addBitSetToKeyAtLocation(bitSetKey, eventBitSet, eventTypeBitIndex); // add the event type
|
||||
|
||||
return bitSetKey;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Generates a key set of objects from a combined bitset key.
|
||||
*
|
||||
* Masks out each covariate independently and decodes their values (Object) into a keyset
|
||||
*
|
||||
* @param key the bitset representation of the keys
|
||||
* @return an object array with the values for each key
|
||||
*/
|
||||
public List<Object> keySetFrom(BitSet key) {
|
||||
List<Object> objectKeys = new ArrayList<Object>();
|
||||
for (RequiredCovariateInfo info : requiredCovariates) {
|
||||
BitSet covariateBitSet = extractBitSetFromKey(key, info.mask, info.bitsBefore); // get the covariate's bitset
|
||||
objectKeys.add(info.covariate.keyFromBitSet(covariateBitSet)); // convert the bitset to object using covariate's interface
|
||||
}
|
||||
|
||||
if (optionalCovariates.size() > 0) {
|
||||
BitSet covBitSet = extractBitSetFromKey(key, optionalCovariateMask, nRequiredBits); // mask out the covariate bit set
|
||||
BitSet idbs = extractBitSetFromKey(key, optionalCovariateIDMask, nRequiredBits + nOptionalBits);// mask out the covariate order (to identify which covariate this is)
|
||||
short id = BitSetUtils.shortFrom(idbs); // covert the id bitset into a short
|
||||
Covariate covariate = optionalCovariates.get(id).covariate; // get the corresponding optional covariate object
|
||||
objectKeys.add(covariate.keyFromBitSet(covBitSet)); // add the optional covariate to the key set
|
||||
objectKeys.add(id); // add the covariate id
|
||||
}
|
||||
objectKeys.add(eventFromBitSet(key)); // add the event type object to the key set
|
||||
|
||||
return objectKeys;
|
||||
}
|
||||
|
||||
/**
|
||||
* Translates a masked bitset into a bitset starting at 0
|
||||
*
|
||||
* @param key the masked out bitset
|
||||
* @param n the number of bits to chop
|
||||
* @return a translated bitset starting at 0 for the covariate machinery to decode
|
||||
*/
|
||||
private BitSet chopNBitsFrom(BitSet key, int n) {
|
||||
BitSet choppedKey = new BitSet();
|
||||
for (int i = key.nextSetBit(0); i >= 0; i = key.nextSetBit(i + 1))
|
||||
choppedKey.set(i - n); // Set every bit translocated to the beginning of the BitSet
|
||||
return choppedKey;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a mask for the requested covariate to extract the relevant bitset from a combined bitset key
|
||||
*
|
||||
* @param leadingBits the index of the covariate in the ordered covariate list
|
||||
* @param nBits the number of bits needed by the Covariate to represent its values in BitSet form
|
||||
* @return the bitset relevant to the covariate
|
||||
*/
|
||||
|
||||
private BitSet genericMask(int leadingBits, int nBits) {
|
||||
BitSet mask = new BitSet(leadingBits + nBits);
|
||||
mask.set(leadingBits, leadingBits + nBits);
|
||||
return mask;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decodes the event type (enum) from the full bitset key
|
||||
*
|
||||
* @param fullKey the full key of all covariates + event type
|
||||
* @return the decoded event type.
|
||||
*/
|
||||
private EventType eventFromBitSet(BitSet fullKey) {
|
||||
BitSet eventKey = new BitSet();
|
||||
int firstBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits;
|
||||
for (int i = fullKey.nextSetBit(firstBitIndex); i >= 0; i = fullKey.nextSetBit(i + 1))
|
||||
eventKey.set(i - firstBitIndex);
|
||||
return EventType.eventFrom(BitSetUtils.shortFrom(eventKey));
|
||||
}
|
||||
|
||||
private BitSet bitSetFromEvent(EventType eventType) {
|
||||
return BitSetUtils.bitSetFrom(eventType.index);
|
||||
}
|
||||
|
||||
private int bitsInEventType() {
|
||||
return BitSetUtils.numberOfBitsToRepresent(EventType.values().length);
|
||||
}
|
||||
|
||||
private void addBitSetToKeyAtLocation(BitSet key, BitSet bitSet, int location) {
|
||||
for (int j = bitSet.nextSetBit(0); j >= 0; j = bitSet.nextSetBit(j + 1))
|
||||
key.set(j + location); // translate the bits set in the key to their corresponding position in the full key
|
||||
}
|
||||
|
||||
private BitSet extractBitSetFromKey (BitSet key, BitSet mask, int leadingBits) {
|
||||
BitSet bitSet = (BitSet) key.clone();
|
||||
bitSet.and(mask);
|
||||
return chopNBitsFrom(bitSet, leadingBits);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Aggregate information for each Covariate
|
||||
*/
|
||||
class RequiredCovariateInfo {
|
||||
public int bitsBefore; // number of bits before this covariate in the combined bitset key
|
||||
public int nBits; // number of bits used by this covariate (cached access to covariate.nBits())
|
||||
public BitSet mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits )
|
||||
public Covariate covariate; // this allows reverse lookup of the Covariates in order
|
||||
|
||||
RequiredCovariateInfo(int bitsBefore, int nBits, BitSet mask, Covariate covariate) {
|
||||
this.bitsBefore = bitsBefore;
|
||||
this.nBits = nBits;
|
||||
this.mask = mask;
|
||||
this.covariate = covariate;
|
||||
}
|
||||
}
|
||||
|
||||
class OptionalCovariateInfo {
|
||||
public BitSet covariateID; // cache the covariate ID
|
||||
public Covariate covariate;
|
||||
|
||||
OptionalCovariateInfo(BitSet covariateID, Covariate covariate) {
|
||||
this.covariateID = covariateID;
|
||||
this.covariate = covariate;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -26,7 +26,9 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.BitSetUtils;
|
||||
import org.broadinstitute.sting.utils.clipping.ClippingRepresentation;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
|
|
@ -43,7 +45,12 @@ public class ContextCovariate implements StandardCovariate {
|
|||
|
||||
private int mismatchesContextSize;
|
||||
private int insertionsContextSize;
|
||||
private int deletionsContextSize;
|
||||
private int deletionsContextSize;
|
||||
|
||||
private final BitSet NO_CONTEXT_BITSET = BitSetUtils.bitSetFrom(-1L);
|
||||
// protected final String NO_CONTEXT_VALUE = "N"; // protected so we can UNIT TEST it
|
||||
|
||||
private byte LOW_QUAL_TAIL;
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
@Override
|
||||
|
|
@ -52,18 +59,22 @@ public class ContextCovariate implements StandardCovariate {
|
|||
insertionsContextSize = RAC.INSERTIONS_CONTEXT_SIZE;
|
||||
deletionsContextSize = RAC.DELETIONS_CONTEXT_SIZE;
|
||||
|
||||
LOW_QUAL_TAIL = RAC.LOW_QUAL_TAIL;
|
||||
|
||||
if (mismatchesContextSize <= 0 || insertionsContextSize <= 0 || deletionsContextSize <= 0)
|
||||
throw new UserException(String.format("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Insertions: %d Deletions:%d", mismatchesContextSize, insertionsContextSize, deletionsContextSize));
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public CovariateValues getValues(final GATKSAMRecord read) {
|
||||
public CovariateValues getValues(GATKSAMRecord read) {
|
||||
int l = read.getReadLength();
|
||||
BitSet[] mismatches = new BitSet[l];
|
||||
BitSet[] insertions = new BitSet[l];
|
||||
BitSet[] deletions = new BitSet[l];
|
||||
BitSet[] deletions = new BitSet[l];
|
||||
|
||||
read = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); // Write N's over the low quality tail of the reads to avoid adding them into the context
|
||||
|
||||
final boolean negativeStrand = read.getReadNegativeStrandFlag();
|
||||
byte[] bases = read.getReadBases();
|
||||
if (negativeStrand)
|
||||
|
|
@ -72,7 +83,7 @@ public class ContextCovariate implements StandardCovariate {
|
|||
for (int i = 0; i < read.getReadLength(); i++) {
|
||||
mismatches[i] = contextWith(bases, i, mismatchesContextSize);
|
||||
insertions[i] = contextWith(bases, i, insertionsContextSize);
|
||||
deletions[i] = contextWith(bases, i, deletionsContextSize);
|
||||
deletions[i] = contextWith(bases, i, deletionsContextSize);
|
||||
}
|
||||
|
||||
if (negativeStrand) {
|
||||
|
|
@ -89,24 +100,41 @@ public class ContextCovariate implements StandardCovariate {
|
|||
return str;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String keyFromBitSet(BitSet key) {
|
||||
if (key == null) // this can only happen in test routines because we do not propagate null keys to the csv file
|
||||
return null;
|
||||
|
||||
return BitSetUtils.dnaFrom(key);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BitSet bitSetFromKey(Object key) {
|
||||
return BitSetUtils.bitSetFrom((String) key);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int numberOfBits() {
|
||||
return Long.bitCount(-1L);
|
||||
}
|
||||
|
||||
/**
|
||||
* calculates the context of a base independent of the covariate mode
|
||||
* calculates the context of a base independent of the covariate mode (mismatch, insertion or deletion)
|
||||
*
|
||||
* @param bases the bases in the read to build the context from
|
||||
* @param offset the position in the read to calculate the context for
|
||||
* @param contextSize context size to use building the context
|
||||
* @return
|
||||
* @param bases the bases in the read to build the context from
|
||||
* @param offset the position in the read to calculate the context for
|
||||
* @param contextSize context size to use building the context
|
||||
* @return the bitSet representing the Context
|
||||
*/
|
||||
private BitSet contextWith(byte [] bases, int offset, int contextSize) {
|
||||
if (offset < contextSize)
|
||||
return null;
|
||||
|
||||
String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset));
|
||||
if (context.contains("N"))
|
||||
return null;
|
||||
|
||||
return MathUtils.bitSetFrom(context);
|
||||
}
|
||||
private BitSet contextWith(byte[] bases, int offset, int contextSize) {
|
||||
BitSet result = null;
|
||||
if (offset >= contextSize) {
|
||||
String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset));
|
||||
if (!context.contains("N"))
|
||||
result = BitSetUtils.bitSetFrom(context);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reverses the given array in place.
|
||||
|
|
|
|||
|
|
@ -2,6 +2,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
|
|||
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.BitSet;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
|
|
@ -53,7 +55,40 @@ public interface Covariate {
|
|||
*/
|
||||
public CovariateValues getValues(GATKSAMRecord read);
|
||||
|
||||
public Object getValue(String str); // Used to get the covariate's value from input csv file during on-the-fly recalibration
|
||||
/**
|
||||
* Used to get the covariate's value from input csv file during on-the-fly recalibration
|
||||
*
|
||||
* @param str the key in string type (read from the csv)
|
||||
* @return the key in its correct type.
|
||||
*/
|
||||
public Object getValue(String str);
|
||||
|
||||
/**
|
||||
* Converts the bitset representation of the key (used internally for table indexing) to String format for file output.
|
||||
*
|
||||
* @param key the bitset representation of the key
|
||||
* @return a string representation of the key
|
||||
*/
|
||||
public String keyFromBitSet(BitSet key);
|
||||
|
||||
/**
|
||||
* Converts a key into a bitset
|
||||
*
|
||||
* Only necessary for on-the-fly recalibration when you have the object, but need to store it in memory in bitset format. For counting covariates
|
||||
* the getValues method already returns all values in BitSet format.
|
||||
*
|
||||
* @param key the object corresponding to the covariate
|
||||
* @return a bitset representation of the object
|
||||
*/
|
||||
public BitSet bitSetFromKey(Object key);
|
||||
|
||||
/**
|
||||
* Each covariate should determine how many bits are necessary to encode its data
|
||||
*
|
||||
* @return The number of bits used to represent the values of this covariate.
|
||||
*/
|
||||
public int numberOfBits();
|
||||
|
||||
}
|
||||
|
||||
/**
 * Marker interface: extends Covariate without declaring any additional members.
 */
interface RequiredCovariate extends Covariate {}
|
||||
|
|
|
|||
|
|
@ -1,88 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
/**
|
||||
* The object temporarily held by a read that describes all of it's covariates.
|
||||
*
|
||||
* In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 2/8/12
|
||||
*/
|
||||
public class CovariateKeySet {
|
||||
private Object[][] mismatchesKeySet;
|
||||
private Object[][] insertionsKeySet;
|
||||
private Object[][] deletionsKeySet;
|
||||
|
||||
private int nextCovariateIndex;
|
||||
|
||||
private static String mismatchesCovariateName = "M";
|
||||
private static String insertionsCovariateName = "I";
|
||||
private static String deletionsCovariateName = "D";
|
||||
|
||||
public CovariateKeySet(int readLength, int numberOfCovariates) {
|
||||
numberOfCovariates++; // +1 because we are adding the mismatch covariate (to comply with the molten table format)
|
||||
this.mismatchesKeySet = new Object[readLength][numberOfCovariates];
|
||||
this.insertionsKeySet = new Object[readLength][numberOfCovariates];
|
||||
this.deletionsKeySet = new Object[readLength][numberOfCovariates];
|
||||
initializeCovariateKeySet(this.mismatchesKeySet, mismatchesCovariateName);
|
||||
initializeCovariateKeySet(this.insertionsKeySet, insertionsCovariateName);
|
||||
initializeCovariateKeySet(this.deletionsKeySet, deletionsCovariateName);
|
||||
this.nextCovariateIndex = 0;
|
||||
}
|
||||
|
||||
public void addCovariate(CovariateValues covariate) {
|
||||
transposeCovariateValues(mismatchesKeySet, covariate.getMismatches());
|
||||
transposeCovariateValues(insertionsKeySet, covariate.getInsertions());
|
||||
transposeCovariateValues(deletionsKeySet, covariate.getDeletions());
|
||||
nextCovariateIndex++;
|
||||
}
|
||||
|
||||
public static RecalDataManager.BaseRecalibrationType getErrorModelFromString(final String modelString) {
|
||||
if (modelString.equals(mismatchesCovariateName))
|
||||
return RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION;
|
||||
else if (modelString.equals(insertionsCovariateName))
|
||||
return RecalDataManager.BaseRecalibrationType.BASE_INSERTION;
|
||||
else if (modelString.equals(deletionsCovariateName))
|
||||
return RecalDataManager.BaseRecalibrationType.BASE_DELETION;
|
||||
throw new ReviewedStingException("Unrecognized Base Recalibration model string: " + modelString);
|
||||
}
|
||||
|
||||
public Object[] getKeySet(final int readPosition, final RecalDataManager.BaseRecalibrationType errorModel) {
|
||||
switch (errorModel) {
|
||||
case BASE_SUBSTITUTION:
|
||||
return getMismatchesKeySet(readPosition);
|
||||
case BASE_INSERTION:
|
||||
return getInsertionsKeySet(readPosition);
|
||||
case BASE_DELETION:
|
||||
return getDeletionsKeySet(readPosition);
|
||||
default:
|
||||
throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel );
|
||||
}
|
||||
}
|
||||
|
||||
public Object[] getMismatchesKeySet(int readPosition) {
|
||||
return mismatchesKeySet[readPosition];
|
||||
}
|
||||
|
||||
public Object[] getInsertionsKeySet(int readPosition) {
|
||||
return insertionsKeySet[readPosition];
|
||||
}
|
||||
|
||||
public Object[] getDeletionsKeySet(int readPosition) {
|
||||
return deletionsKeySet[readPosition];
|
||||
}
|
||||
|
||||
private void transposeCovariateValues (Object [][] keySet, Object [] covariateValues) {
|
||||
for (int i=0; i<covariateValues.length; i++)
|
||||
keySet[i][nextCovariateIndex] = covariateValues[i];
|
||||
}
|
||||
|
||||
private void initializeCovariateKeySet (Object[][] keySet, String covariateName) {
|
||||
int readLength = keySet.length;
|
||||
int lastCovariateIndex = keySet[0].length - 1;
|
||||
for (int i = 0; i < readLength; i++)
|
||||
keySet[i][lastCovariateIndex] = covariateName;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,5 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import java.util.BitSet;
|
||||
|
||||
/**
|
||||
* An object to hold the different covariate values for all bases in the read.
|
||||
*
|
||||
|
|
@ -12,25 +14,25 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
|
|||
* @since 2/8/12
|
||||
*/
|
||||
public class CovariateValues {
|
||||
private Object[] mismatches;
|
||||
private Object[] insertions;
|
||||
private Object[] deletions;
|
||||
private BitSet[] mismatches;
|
||||
private BitSet[] insertions;
|
||||
private BitSet[] deletions;
|
||||
|
||||
public CovariateValues(Object[] mismatch, Object[] insertion, Object[] deletion) {
|
||||
public CovariateValues(BitSet[] mismatch, BitSet[] insertion, BitSet[] deletion) {
|
||||
this.mismatches = mismatch;
|
||||
this.insertions = insertion;
|
||||
this.deletions = deletion;
|
||||
}
|
||||
|
||||
public Object[] getMismatches() {
|
||||
public BitSet[] getMismatches() {
|
||||
return mismatches;
|
||||
}
|
||||
|
||||
public Object[] getInsertions() {
|
||||
public BitSet[] getInsertions() {
|
||||
return insertions;
|
||||
}
|
||||
|
||||
public Object[] getDeletions() {
|
||||
public BitSet[] getDeletions() {
|
||||
return deletions;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,10 +1,12 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.BitSetUtils;
|
||||
import org.broadinstitute.sting.utils.NGSPlatform;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.util.EnumSet;
|
||||
|
||||
/*
|
||||
|
|
@ -59,48 +61,25 @@ public class CycleCovariate implements StandardCovariate {
|
|||
// Used to pick out the covariate's value from attributes of the read
|
||||
@Override
|
||||
public CovariateValues getValues(final GATKSAMRecord read) {
|
||||
Integer [] cycles = new Integer[read.getReadLength()];
|
||||
BitSet[] cycles = new BitSet[read.getReadLength()];
|
||||
final NGSPlatform ngsPlatform = read.getNGSPlatform();
|
||||
|
||||
// Discrete cycle platforms
|
||||
if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) {
|
||||
final int init;
|
||||
final int increment;
|
||||
final short init;
|
||||
final short increment;
|
||||
if (!read.getReadNegativeStrandFlag()) {
|
||||
// Differentiate between first and second of pair.
|
||||
// The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
|
||||
// to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair.
|
||||
// Therefore the cycle covariate must differentiate between first and second of pair reads.
|
||||
// This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because
|
||||
// the current sequential model would consider the effects independently instead of jointly.
|
||||
if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
|
||||
//second of pair, positive strand
|
||||
init = -1;
|
||||
increment = -1;
|
||||
}
|
||||
else {
|
||||
//first of pair, positive strand
|
||||
init = 1;
|
||||
increment = 1;
|
||||
}
|
||||
|
||||
init = 1;
|
||||
increment = 1;
|
||||
}
|
||||
else {
|
||||
if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
|
||||
//second of pair, negative strand
|
||||
init = -read.getReadLength();
|
||||
increment = 1;
|
||||
}
|
||||
else {
|
||||
//first of pair, negative strand
|
||||
init = read.getReadLength();
|
||||
increment = -1;
|
||||
}
|
||||
init = (short) read.getReadLength();
|
||||
increment = -1;
|
||||
}
|
||||
|
||||
int cycle = init;
|
||||
short cycle = init;
|
||||
for (int i = 0; i < read.getReadLength(); i++) {
|
||||
cycles[i] = cycle;
|
||||
cycles[i] = BitSetUtils.bitSetFrom(cycle);
|
||||
cycle += increment;
|
||||
}
|
||||
}
|
||||
|
|
@ -119,7 +98,7 @@ public class CycleCovariate implements StandardCovariate {
|
|||
// the current sequential model would consider the effects independently instead of jointly.
|
||||
final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag();
|
||||
|
||||
int cycle = multiplyByNegative1 ? -1 : 1;
|
||||
short cycle = multiplyByNegative1 ? (short) -1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms.
|
||||
|
||||
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
|
||||
// For example, AAAAAAA was probably read in two flow cycles but here we count it as one
|
||||
|
|
@ -127,19 +106,19 @@ public class CycleCovariate implements StandardCovariate {
|
|||
int iii = 0;
|
||||
while (iii < readLength) {
|
||||
while (iii < readLength && bases[iii] == (byte) 'T') {
|
||||
cycles[iii] = cycle;
|
||||
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||
iii++;
|
||||
}
|
||||
while (iii < readLength && bases[iii] == (byte) 'A') {
|
||||
cycles[iii] = cycle;
|
||||
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||
iii++;
|
||||
}
|
||||
while (iii < readLength && bases[iii] == (byte) 'C') {
|
||||
cycles[iii] = cycle;
|
||||
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||
iii++;
|
||||
}
|
||||
while (iii < readLength && bases[iii] == (byte) 'G') {
|
||||
cycles[iii] = cycle;
|
||||
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||
iii++;
|
||||
}
|
||||
if (iii < readLength) {
|
||||
|
|
@ -149,7 +128,7 @@ public class CycleCovariate implements StandardCovariate {
|
|||
cycle++;
|
||||
}
|
||||
if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) {
|
||||
cycles[iii] = cycle;
|
||||
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||
iii++;
|
||||
}
|
||||
|
||||
|
|
@ -159,19 +138,19 @@ public class CycleCovariate implements StandardCovariate {
|
|||
int iii = readLength - 1;
|
||||
while (iii >= 0) {
|
||||
while (iii >= 0 && bases[iii] == (byte) 'T') {
|
||||
cycles[iii] = cycle;
|
||||
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||
iii--;
|
||||
}
|
||||
while (iii >= 0 && bases[iii] == (byte) 'A') {
|
||||
cycles[iii] = cycle;
|
||||
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||
iii--;
|
||||
}
|
||||
while (iii >= 0 && bases[iii] == (byte) 'C') {
|
||||
cycles[iii] = cycle;
|
||||
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||
iii--;
|
||||
}
|
||||
while (iii >= 0 && bases[iii] == (byte) 'G') {
|
||||
cycles[iii] = cycle;
|
||||
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||
iii--;
|
||||
}
|
||||
if (iii >= 0) {
|
||||
|
|
@ -181,7 +160,7 @@ public class CycleCovariate implements StandardCovariate {
|
|||
cycle++;
|
||||
}
|
||||
if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) {
|
||||
cycles[iii] = cycle;
|
||||
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||
iii--;
|
||||
}
|
||||
}
|
||||
|
|
@ -192,13 +171,28 @@ public class CycleCovariate implements StandardCovariate {
|
|||
else {
|
||||
throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid");
|
||||
}
|
||||
|
||||
|
||||
return new CovariateValues(cycles, cycles, cycles);
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file during on-the-fly recalibration
|
||||
@Override
|
||||
public final Object getValue(final String str) {
|
||||
return Integer.parseInt(str);
|
||||
return Short.parseShort(str);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String keyFromBitSet(BitSet key) {
|
||||
return String.format("%d", BitSetUtils.shortFrom(key));
|
||||
}
|
||||
|
||||
@Override
|
||||
public BitSet bitSetFromKey(Object key) {
|
||||
return BitSetUtils.bitSetFrom((Short) key);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int numberOfBits() {
|
||||
return BitSetUtils.numberOfBitsToRepresent(2 * Short.MAX_VALUE); // positive and negative
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,43 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
public enum EventType {
|
||||
BASE_SUBSTITUTION(0, "M"),
|
||||
BASE_INSERTION(1, "I"),
|
||||
BASE_DELETION(2, "D");
|
||||
|
||||
public int index;
|
||||
public String representation;
|
||||
|
||||
private EventType(int index, String representation) {
|
||||
this.index = index;
|
||||
this.representation = representation;
|
||||
}
|
||||
|
||||
public static EventType eventFrom(int index) {
|
||||
switch (index) {
|
||||
case 0:
|
||||
return BASE_SUBSTITUTION;
|
||||
case 1:
|
||||
return BASE_INSERTION;
|
||||
case 2:
|
||||
return BASE_DELETION;
|
||||
default:
|
||||
throw new ReviewedStingException(String.format("Event %d does not exist.", index));
|
||||
}
|
||||
}
|
||||
|
||||
public static EventType eventFrom(String event) {
|
||||
for (EventType eventType : EventType.values())
|
||||
if (eventType.representation.equals(event))
|
||||
return eventType;
|
||||
|
||||
throw new ReviewedStingException(String.format("Event %s does not exist.", event));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return representation;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,7 +1,11 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.utils.BitSetUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.BitSet;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
|
|
@ -46,18 +50,18 @@ public class QualityScoreCovariate implements RequiredCovariate {
|
|||
public CovariateValues getValues(final GATKSAMRecord read) {
|
||||
int readLength = read.getReadLength();
|
||||
|
||||
Integer [] mismatches = new Integer[readLength];
|
||||
Integer [] insertions = new Integer[readLength];
|
||||
Integer [] deletions = new Integer[readLength];
|
||||
BitSet[] mismatches = new BitSet[readLength];
|
||||
BitSet[] insertions = new BitSet[readLength];
|
||||
BitSet[] deletions = new BitSet[readLength];
|
||||
|
||||
byte [] baseQualities = read.getBaseQualities();
|
||||
byte [] baseInsertionQualities = read.getBaseInsertionQualities();
|
||||
byte [] baseDeletionQualities = read.getBaseDeletionQualities();
|
||||
byte[] baseQualities = read.getBaseQualities();
|
||||
byte[] baseInsertionQualities = read.getBaseInsertionQualities();
|
||||
byte[] baseDeletionQualities = read.getBaseDeletionQualities();
|
||||
|
||||
for (int i=0; i<baseQualities.length; i++) {
|
||||
mismatches[i] = (int) baseQualities[i];
|
||||
insertions[i] = (int) baseInsertionQualities[i];
|
||||
deletions[i] = (int) baseDeletionQualities[i];
|
||||
for (int i = 0; i < baseQualities.length; i++) {
|
||||
mismatches[i] = BitSetUtils.bitSetFrom(baseQualities[i]);
|
||||
insertions[i] = BitSetUtils.bitSetFrom(baseInsertionQualities[i]);
|
||||
deletions[i] = BitSetUtils.bitSetFrom(baseDeletionQualities[i]);
|
||||
}
|
||||
|
||||
return new CovariateValues(mismatches, insertions, deletions);
|
||||
|
|
@ -66,6 +70,21 @@ public class QualityScoreCovariate implements RequiredCovariate {
|
|||
// Used to get the covariate's value from input csv file during on-the-fly recalibration
|
||||
@Override
|
||||
public final Object getValue(final String str) {
|
||||
return Integer.parseInt(str);
|
||||
return Byte.parseByte(str);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String keyFromBitSet(BitSet key) {
|
||||
return String.format("%d", BitSetUtils.longFrom(key));
|
||||
}
|
||||
|
||||
@Override
|
||||
public BitSet bitSetFromKey(Object key) {
|
||||
return BitSetUtils.bitSetFrom((Byte) key);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int numberOfBits() {
|
||||
return BitSetUtils.numberOfBitsToRepresent(QualityUtils.MAX_QUAL_SCORE);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,65 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.BitSet;
|
||||
|
||||
/**
|
||||
* The object temporarily held by a read that describes all of it's covariates.
|
||||
*
|
||||
* In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 2/8/12
|
||||
*/
|
||||
public class ReadCovariates {
|
||||
private BitSet[][] mismatchesKeySet;
|
||||
private BitSet[][] insertionsKeySet;
|
||||
private BitSet[][] deletionsKeySet;
|
||||
|
||||
private int nextCovariateIndex;
|
||||
|
||||
public ReadCovariates(int readLength, int numberOfCovariates) {
|
||||
this.mismatchesKeySet = new BitSet[readLength][numberOfCovariates];
|
||||
this.insertionsKeySet = new BitSet[readLength][numberOfCovariates];
|
||||
this.deletionsKeySet = new BitSet[readLength][numberOfCovariates];
|
||||
this.nextCovariateIndex = 0;
|
||||
}
|
||||
|
||||
public void addCovariate(CovariateValues covariate) {
|
||||
transposeCovariateValues(mismatchesKeySet, covariate.getMismatches());
|
||||
transposeCovariateValues(insertionsKeySet, covariate.getInsertions());
|
||||
transposeCovariateValues(deletionsKeySet, covariate.getDeletions());
|
||||
nextCovariateIndex++;
|
||||
}
|
||||
|
||||
public BitSet[] getKeySet(final int readPosition, final EventType errorModel) {
|
||||
switch (errorModel) {
|
||||
case BASE_SUBSTITUTION:
|
||||
return getMismatchesKeySet(readPosition);
|
||||
case BASE_INSERTION:
|
||||
return getInsertionsKeySet(readPosition);
|
||||
case BASE_DELETION:
|
||||
return getDeletionsKeySet(readPosition);
|
||||
default:
|
||||
throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel);
|
||||
}
|
||||
}
|
||||
|
||||
public BitSet[] getMismatchesKeySet(int readPosition) {
|
||||
return mismatchesKeySet[readPosition];
|
||||
}
|
||||
|
||||
public BitSet[] getInsertionsKeySet(int readPosition) {
|
||||
return insertionsKeySet[readPosition];
|
||||
}
|
||||
|
||||
public BitSet[] getDeletionsKeySet(int readPosition) {
|
||||
return deletionsKeySet[readPosition];
|
||||
}
|
||||
|
||||
private void transposeCovariateValues(BitSet[][] keySet, BitSet[] covariateValues) {
|
||||
for (int i = 0; i < covariateValues.length; i++)
|
||||
keySet[i][nextCovariateIndex] = covariateValues[i];
|
||||
}
|
||||
}
|
||||
|
|
@ -1,8 +1,10 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.utils.BitSetUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.BitSet;
|
||||
import java.util.HashMap;
|
||||
|
||||
/*
|
||||
|
|
@ -39,7 +41,7 @@ import java.util.HashMap;
|
|||
*/
|
||||
|
||||
public class ReadGroupCovariate implements RequiredCovariate {
|
||||
|
||||
|
||||
private final HashMap<String, Short> readGroupLookupTable = new HashMap<String, Short>();
|
||||
private final HashMap<Short, String> readGroupReverseLookupTable = new HashMap<Short, String>();
|
||||
private short nextId = 0;
|
||||
|
|
@ -53,17 +55,9 @@ public class ReadGroupCovariate implements RequiredCovariate {
|
|||
public CovariateValues getValues(final GATKSAMRecord read) {
|
||||
final int l = read.getReadLength();
|
||||
final String readGroupId = read.getReadGroup().getReadGroupId();
|
||||
short shortId;
|
||||
if (readGroupLookupTable.containsKey(readGroupId))
|
||||
shortId = readGroupLookupTable.get(readGroupId);
|
||||
else {
|
||||
shortId = nextId;
|
||||
readGroupLookupTable.put(readGroupId, nextId);
|
||||
readGroupReverseLookupTable.put(nextId, readGroupId);
|
||||
nextId++;
|
||||
}
|
||||
Short [] readGroups = new Short[l];
|
||||
Arrays.fill(readGroups, shortId);
|
||||
BitSet rg = bitSetForReadGroup(readGroupId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset
|
||||
BitSet[] readGroups = new BitSet[l];
|
||||
Arrays.fill(readGroups, rg);
|
||||
return new CovariateValues(readGroups, readGroups, readGroups);
|
||||
}
|
||||
|
||||
|
|
@ -72,10 +66,38 @@ public class ReadGroupCovariate implements RequiredCovariate {
|
|||
public final Object getValue(final String str) {
|
||||
return str;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String keyFromBitSet(BitSet key) {
|
||||
return decodeReadGroup((short) BitSetUtils.longFrom(key));
|
||||
}
|
||||
|
||||
@Override
|
||||
public BitSet bitSetFromKey(Object key) {
|
||||
return bitSetForReadGroup((String) key);
|
||||
}
|
||||
|
||||
public final String decodeReadGroup(final short id) {
|
||||
return readGroupReverseLookupTable.get(id);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int numberOfBits() {
|
||||
return BitSetUtils.numberOfBitsToRepresent(Short.MAX_VALUE);
|
||||
}
|
||||
|
||||
private BitSet bitSetForReadGroup(String readGroupId) {
|
||||
short shortId;
|
||||
if (readGroupLookupTable.containsKey(readGroupId))
|
||||
shortId = readGroupLookupTable.get(readGroupId);
|
||||
else {
|
||||
shortId = nextId;
|
||||
readGroupLookupTable.put(readGroupId, nextId);
|
||||
readGroupReverseLookupTable.put(nextId, readGroupId);
|
||||
nextId++;
|
||||
}
|
||||
return BitSetUtils.bitSetFrom(shortId);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -53,24 +53,18 @@ import java.util.Map;
|
|||
*/
|
||||
|
||||
public class RecalDataManager {
|
||||
public final NestedHashMap nestedHashMap; // The full dataset
|
||||
private final HashMap<BaseRecalibrationType, NestedHashMap> dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
private final HashMap<BaseRecalibrationType, NestedHashMap> dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
|
||||
private final HashMap<BaseRecalibrationType, ArrayList<NestedHashMap>> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
public final NestedHashMap nestedHashMap; // The full dataset
|
||||
private final HashMap<EventType, NestedHashMap> dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
private final HashMap<EventType, NestedHashMap> dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
|
||||
private final HashMap<EventType, ArrayList<NestedHashMap>> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
|
||||
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
|
||||
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
|
||||
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
|
||||
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
|
||||
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
|
||||
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
|
||||
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
|
||||
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
|
||||
private static boolean warnUserNullPlatform = false;
|
||||
|
||||
private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\
|
||||
|
||||
public enum BaseRecalibrationType {
|
||||
BASE_SUBSTITUTION,
|
||||
BASE_INSERTION,
|
||||
BASE_DELETION
|
||||
}
|
||||
private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\
|
||||
|
||||
public enum SOLID_RECAL_MODE {
|
||||
/**
|
||||
|
|
@ -116,10 +110,10 @@ public class RecalDataManager {
|
|||
public RecalDataManager(final boolean createCollapsedTables, final int numCovariates) {
|
||||
if (createCollapsedTables) { // Initialize all the collapsed tables, only used by on-the-fly recalibration
|
||||
nestedHashMap = null;
|
||||
dataCollapsedReadGroup = new HashMap<BaseRecalibrationType, NestedHashMap>();
|
||||
dataCollapsedQualityScore = new HashMap<BaseRecalibrationType, NestedHashMap>();
|
||||
dataCollapsedByCovariate = new HashMap<BaseRecalibrationType, ArrayList<NestedHashMap>>();
|
||||
for ( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) {
|
||||
dataCollapsedReadGroup = new HashMap<EventType, NestedHashMap>();
|
||||
dataCollapsedQualityScore = new HashMap<EventType, NestedHashMap>();
|
||||
dataCollapsedByCovariate = new HashMap<EventType, ArrayList<NestedHashMap>>();
|
||||
for (final EventType errorModel : EventType.values()) {
|
||||
dataCollapsedReadGroup.put(errorModel, new NestedHashMap());
|
||||
dataCollapsedQualityScore.put(errorModel, new NestedHashMap());
|
||||
dataCollapsedByCovariate.put(errorModel, new ArrayList<NestedHashMap>());
|
||||
|
|
@ -136,100 +130,10 @@ public class RecalDataManager {
|
|||
}
|
||||
}
|
||||
|
||||
public static CovariateKeySet getAllCovariateValuesFor(GATKSAMRecord read) {
|
||||
return (CovariateKeySet) read.getTemporaryAttribute(COVARS_ATTRIBUTE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the given mapping to all of the collapsed hash tables
|
||||
*
|
||||
* @param key The list of comparables that is the key for this mapping
|
||||
* @param fullDatum The RecalDatum which is the data for this mapping
|
||||
* @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table
|
||||
*/
|
||||
public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN, final BaseRecalibrationType errorModel ) {
|
||||
|
||||
// The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
|
||||
//data.put(key, thisDatum); // add the mapping to the main table
|
||||
|
||||
final int qualityScore = Integer.parseInt(key[1].toString());
|
||||
final Object[] readGroupCollapsedKey = new Object[1];
|
||||
final Object[] qualityScoreCollapsedKey = new Object[2];
|
||||
final Object[] covariateCollapsedKey = new Object[3];
|
||||
RecalDatum collapsedDatum;
|
||||
|
||||
// Create dataCollapsedReadGroup, the table where everything except read group has been collapsed
|
||||
if (qualityScore >= PRESERVE_QSCORES_LESS_THAN) {
|
||||
readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group
|
||||
collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(errorModel).get(readGroupCollapsedKey);
|
||||
if (collapsedDatum == null) {
|
||||
dataCollapsedReadGroup.get(errorModel).put(new RecalDatum(fullDatum), readGroupCollapsedKey);
|
||||
}
|
||||
else {
|
||||
collapsedDatum.combine(fullDatum); // using combine instead of increment in order to calculate overall aggregateQReported
|
||||
}
|
||||
}
|
||||
|
||||
// Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
|
||||
qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ...
|
||||
qualityScoreCollapsedKey[1] = key[1]; // and quality score
|
||||
collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(errorModel).get(qualityScoreCollapsedKey);
|
||||
if (collapsedDatum == null) {
|
||||
dataCollapsedQualityScore.get(errorModel).put(new RecalDatum(fullDatum), qualityScoreCollapsedKey);
|
||||
}
|
||||
else {
|
||||
collapsedDatum.increment(fullDatum);
|
||||
}
|
||||
|
||||
// Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
for (int iii = 0; iii < dataCollapsedByCovariate.get(errorModel).size(); iii++) {
|
||||
covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ...
|
||||
covariateCollapsedKey[1] = key[1]; // and quality score ...
|
||||
final Object theCovariateElement = key[iii + 2]; // and the given covariate
|
||||
if (theCovariateElement != null) {
|
||||
covariateCollapsedKey[2] = theCovariateElement;
|
||||
collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(errorModel).get(iii).get(covariateCollapsedKey);
|
||||
if (collapsedDatum == null) {
|
||||
dataCollapsedByCovariate.get(errorModel).get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey);
|
||||
}
|
||||
else {
|
||||
collapsedDatum.increment(fullDatum);
|
||||
}
|
||||
}
|
||||
}
|
||||
public static ReadCovariates covariateKeySetFrom(GATKSAMRecord read) {
|
||||
return (ReadCovariates) read.getTemporaryAttribute(COVARS_ATTRIBUTE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score
|
||||
* that will be used in the sequential calculation in TableRecalibrationWalker
|
||||
*
|
||||
* @param smoothing The smoothing parameter that goes into empirical quality score calculation
|
||||
* @param maxQual At which value to cap the quality scores
|
||||
*/
|
||||
public final void generateEmpiricalQualities(final int smoothing, final int maxQual) {
|
||||
|
||||
for( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) {
|
||||
recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.get(errorModel).data, smoothing, maxQual);
|
||||
recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.get(errorModel).data, smoothing, maxQual);
|
||||
for (NestedHashMap map : dataCollapsedByCovariate.get(errorModel)) {
|
||||
recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual);
|
||||
checkForSingletons(map.data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void recursivelyGenerateEmpiricalQualities(final Map data, final int smoothing, final int maxQual) {
|
||||
|
||||
for (Object comp : data.keySet()) {
|
||||
final Object val = data.get(comp);
|
||||
if (val instanceof RecalDatum) { // We are at the end of the nested hash maps
|
||||
((RecalDatum) val).calcCombinedEmpiricalQuality(smoothing, maxQual);
|
||||
}
|
||||
else { // Another layer in the nested hash map
|
||||
recursivelyGenerateEmpiricalQualities((Map) val, smoothing, maxQual);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void checkForSingletons(final Map data) {
|
||||
// todo -- this looks like it's better just as a data.valueSet() call?
|
||||
|
|
@ -253,7 +157,7 @@ public class RecalDataManager {
|
|||
* @param covariate Which covariate indexes the desired collapsed HashMap
|
||||
* @return The desired collapsed HashMap
|
||||
*/
|
||||
public final NestedHashMap getCollapsedTable(final int covariate, final BaseRecalibrationType errorModel) {
|
||||
public final NestedHashMap getCollapsedTable(final int covariate, final EventType errorModel) {
|
||||
if (covariate == 0) {
|
||||
return dataCollapsedReadGroup.get(errorModel); // Table where everything except read group has been collapsed
|
||||
}
|
||||
|
|
@ -551,6 +455,7 @@ public class RecalDataManager {
|
|||
/**
|
||||
* Given the base and the color calculate the next base in the sequence
|
||||
*
|
||||
* @param read the read
|
||||
* @param prevBase The base
|
||||
* @param color The color
|
||||
* @return The next base in the sequence
|
||||
|
|
@ -615,22 +520,23 @@ public class RecalDataManager {
|
|||
* Computes all requested covariates for every offset in the given read
|
||||
* by calling covariate.getValues(..).
|
||||
*
|
||||
* It populates an array of covariate values where result[i][j] is the covariate
|
||||
* value for the ith position in the read and the jth covariate in
|
||||
* requestedCovariates list.
|
||||
*
|
||||
* @param read The read for which to compute covariate values.
|
||||
* @param requestedCovariates The list of requested covariates.
|
||||
* @return An array of covariate values where result[i][j] is the covariate
|
||||
* value for the ith position in the read and the jth covariate in
|
||||
* requestedCovariates list.
|
||||
*/
|
||||
public static void computeCovariates(final GATKSAMRecord read, final List<Covariate> requestedCovariates) {
|
||||
final int numRequestedCovariates = requestedCovariates.size();
|
||||
final int readLength = read.getReadLength();
|
||||
final CovariateKeySet covariateKeySet = new CovariateKeySet(readLength, numRequestedCovariates);
|
||||
final ReadCovariates readCovariates = new ReadCovariates(readLength, numRequestedCovariates);
|
||||
|
||||
// Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
|
||||
for (Covariate covariate : requestedCovariates)
|
||||
covariateKeySet.addCovariate(covariate.getValues(read));
|
||||
readCovariates.addCovariate(covariate.getValues(read));
|
||||
|
||||
read.setTemporaryAttribute(COVARS_ATTRIBUTE, covariateKeySet);
|
||||
read.setTemporaryAttribute(COVARS_ATTRIBUTE, readCovariates);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -94,7 +94,7 @@ public class RecalDatumOptimized {
|
|||
public final double empiricalQualDouble(final int smoothing, final double maxQual) {
|
||||
final double doubleMismatches = (double) (numMismatches + smoothing);
|
||||
final double doubleObservations = (double) (numObservations + smoothing);
|
||||
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
|
||||
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
|
||||
return Math.min(empiricalQual, maxQual);
|
||||
}
|
||||
|
||||
|
|
@ -106,9 +106,10 @@ public class RecalDatumOptimized {
|
|||
|
||||
public final byte empiricalQualByte() {
|
||||
return empiricalQualByte(0); // 'default' behavior is to use smoothing value of zero
|
||||
}
|
||||
}
|
||||
|
||||
public final String outputToCSV() {
|
||||
@Override
|
||||
public final String toString() {
|
||||
return String.format("%d,%d,%d", numObservations, numMismatches, (int) empiricalQualByte());
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -27,10 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
|
|||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.CountCovariatesGatherer;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
|
|
@ -60,7 +58,7 @@ public class RecalibrationArgumentCollection {
|
|||
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
||||
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||
*/
|
||||
@Gather(CountCovariatesGatherer.class)
|
||||
@Gather(BQSRGatherer.class)
|
||||
@Output
|
||||
protected PrintStream RECAL_FILE;
|
||||
|
||||
|
|
@ -92,16 +90,6 @@ public class RecalibrationArgumentCollection {
|
|||
@Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
|
||||
protected boolean RUN_WITHOUT_DBSNP = false;
|
||||
|
||||
/////////////////////////////
|
||||
// protected Member Variables
|
||||
/////////////////////////////
|
||||
protected final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap used to create collapsed data hashmaps (delta delta tables)
|
||||
protected final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>();// A list to hold the covariate objects that were requested
|
||||
|
||||
protected final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped.
|
||||
protected final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed.
|
||||
|
||||
|
||||
/**
|
||||
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
|
||||
* reads which have had the reference inserted because of color space inconsistencies.
|
||||
|
|
@ -153,6 +141,10 @@ public class RecalibrationArgumentCollection {
|
|||
@Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
|
||||
public byte DELETIONS_DEFAULT_QUALITY = 45;
|
||||
|
||||
@Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false)
|
||||
public byte LOW_QUAL_TAIL = 2;
|
||||
|
||||
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
|
|
@ -244,7 +244,8 @@ public class DiffEngine {
|
|||
table.set(diff.getPath(), "NumberOfOccurrences", diff.getCount());
|
||||
table.set(diff.getPath(), "ExampleDifference", diff.valueDiffString());
|
||||
}
|
||||
table.write(params.out);
|
||||
GATKReport output = new GATKReport(table);
|
||||
output.print(params.out);
|
||||
}
|
||||
|
||||
protected static int longestCommonPostfix(String[] diffPath1, String[] diffPath2) {
|
||||
|
|
|
|||
|
|
@ -68,8 +68,8 @@ public class VCFDiffableReader implements DiffableReader {
|
|||
VCFHeader header = (VCFHeader)vcfCodec.readHeader(lineReader);
|
||||
for ( VCFHeaderLine headerLine : header.getMetaData() ) {
|
||||
String key = headerLine.getKey();
|
||||
if ( headerLine instanceof VCFNamedHeaderLine )
|
||||
key += "_" + ((VCFNamedHeaderLine) headerLine).getName();
|
||||
if ( headerLine instanceof VCFIDHeaderLine)
|
||||
key += "_" + ((VCFIDHeaderLine) headerLine).getID();
|
||||
if ( root.hasElement(key) )
|
||||
logger.warn("Skipping duplicate header line: file=" + file + " line=" + headerLine.toString());
|
||||
else
|
||||
|
|
|
|||
|
|
@ -64,7 +64,6 @@ public enum DiploidGenotype {
|
|||
return r != base2;
|
||||
else
|
||||
return base2 == r;
|
||||
//return MathUtils.countOccurrences(r, this.toString()) == 1;
|
||||
}
|
||||
|
||||
public boolean isHom() {
|
||||
|
|
|
|||
|
|
@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
|
@ -61,7 +60,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
}
|
||||
|
||||
//linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
|
||||
linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result, false);
|
||||
linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result);
|
||||
|
||||
return alleles;
|
||||
}
|
||||
|
|
@ -85,21 +84,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
for ( int i = 0; i < numOriginalAltAlleles; i++ )
|
||||
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
|
||||
|
||||
// make sure that we've cached enough data
|
||||
if ( numOriginalAltAlleles > UnifiedGenotyperEngine.PLIndexToAlleleIndex.length - 1 )
|
||||
UnifiedGenotyperEngine.calculatePLcache(numOriginalAltAlleles);
|
||||
|
||||
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
|
||||
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
|
||||
for ( final double[] likelihoods : GLs ) {
|
||||
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
|
||||
if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) {
|
||||
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numOriginalAltAlleles][PLindexOfBestGL];
|
||||
if ( alleles[0] != 0 )
|
||||
likelihoodSums[alleles[0]-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
||||
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL);
|
||||
if ( alleles.alleleIndex1 != 0 )
|
||||
likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
||||
// don't double-count it
|
||||
if ( alleles[1] != 0 && alleles[1] != alleles[0] )
|
||||
likelihoodSums[alleles[1]-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
||||
if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 )
|
||||
likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -189,24 +184,21 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
// the column of the matrix
|
||||
final double[] log10Likelihoods;
|
||||
|
||||
// mapping of column index for those columns upon which this one depends to the index into the PLs which is used as the transition to this column;
|
||||
// for example, in the biallelic case, the transition from k=0 to k=1 would be AB while the transition to k=2 would be BB.
|
||||
final HashMap<ExactACcounts, Integer> ACsetIndexToPLIndex = new HashMap<ExactACcounts, Integer>();
|
||||
|
||||
// to minimize memory consumption, we know we can delete any sets in this list because no further sets will depend on them
|
||||
final ArrayList<ExactACcounts> dependentACsetsToDelete = new ArrayList<ExactACcounts>();
|
||||
|
||||
int sum = -1;
|
||||
|
||||
public ExactACset(final int size, final ExactACcounts ACcounts) {
|
||||
this.ACcounts = ACcounts;
|
||||
log10Likelihoods = new double[size];
|
||||
Arrays.fill(log10Likelihoods, Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
// sum of all the non-reference alleles
|
||||
public int getACsum() {
|
||||
int sum = 0;
|
||||
for ( int count : ACcounts.getCounts() )
|
||||
sum += count;
|
||||
if ( sum == -1 ) {
|
||||
sum = 0;
|
||||
for ( int count : ACcounts.getCounts() )
|
||||
sum += count;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
|
@ -215,15 +207,21 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
}
|
||||
}
|
||||
|
||||
// TODO -- remove me
|
||||
public static void linearExactMultiAllelic(final GenotypesContext GLs,
|
||||
final int numAlternateAlleles,
|
||||
final double[][] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result,
|
||||
final boolean preserveData) {
|
||||
final boolean foo) {
|
||||
linearExactMultiAllelic(GLs, numAlternateAlleles, log10AlleleFrequencyPriors, result);
|
||||
}
|
||||
|
||||
// make sure the PL cache has been initialized
|
||||
if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null )
|
||||
UnifiedGenotyperEngine.calculatePLcache(5);
|
||||
|
||||
|
||||
public static void linearExactMultiAllelic(final GenotypesContext GLs,
|
||||
final int numAlternateAlleles,
|
||||
final double[][] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
||||
final int numSamples = genotypeLikelihoods.size()-1;
|
||||
|
|
@ -241,21 +239,20 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
ACqueue.add(zeroSet);
|
||||
indexesToACset.put(zeroSet.ACcounts, zeroSet);
|
||||
|
||||
// optimization: create the temporary storage for computing L(j,k) just once
|
||||
final int maxPossibleDependencies = numAlternateAlleles + (numAlternateAlleles * (numAlternateAlleles + 1) / 2) + 1;
|
||||
final double[][] tempLog10ConformationLikelihoods = new double[numSamples+1][maxPossibleDependencies];
|
||||
for ( int i = 0; i < maxPossibleDependencies; i++ )
|
||||
tempLog10ConformationLikelihoods[0][i] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
// keep processing while we have AC conformations that need to be calculated
|
||||
double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
while ( !ACqueue.isEmpty() ) {
|
||||
// compute log10Likelihoods
|
||||
final ExactACset set = ACqueue.remove();
|
||||
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result, tempLog10ConformationLikelihoods);
|
||||
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result);
|
||||
|
||||
// adjust max likelihood seen if needed
|
||||
maxLog10L = Math.max(maxLog10L, log10LofKs);
|
||||
|
||||
// clean up memory
|
||||
indexesToACset.remove(set.ACcounts);
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** removing used set=%s%n", set.ACcounts);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -273,27 +270,16 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
final ArrayList<double[]> genotypeLikelihoods,
|
||||
final double maxLog10L,
|
||||
final int numChr,
|
||||
final boolean preserveData,
|
||||
final LinkedList<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||
final double[][] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result,
|
||||
final double[][] tempLog10ConformationLikelihoods) {
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
|
||||
|
||||
// compute the log10Likelihoods
|
||||
computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, result, tempLog10ConformationLikelihoods);
|
||||
|
||||
// clean up memory
|
||||
if ( !preserveData ) {
|
||||
for ( ExactACcounts index : set.dependentACsetsToDelete ) {
|
||||
indexesToACset.remove(index);
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts);
|
||||
}
|
||||
}
|
||||
computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, result);
|
||||
|
||||
final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
|
||||
|
||||
|
|
@ -301,11 +287,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
|
||||
|
||||
// no reason to keep this data around because nothing depends on it
|
||||
if ( !preserveData )
|
||||
indexesToACset.remove(set.ACcounts);
|
||||
|
||||
return log10LofK;
|
||||
}
|
||||
|
||||
|
|
@ -316,15 +297,13 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
final int numAltAlleles = set.ACcounts.getCounts().length;
|
||||
|
||||
// genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods.
|
||||
// so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD.
|
||||
|
||||
// add conformations for the k+1 case
|
||||
int PLindex = 0;
|
||||
for ( int allele = 0; allele < numAltAlleles; allele++ ) {
|
||||
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
||||
ACcountsClone[allele]++;
|
||||
updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset);
|
||||
// to get to this conformation, a sample would need to be AB (remember that ref=0)
|
||||
final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1);
|
||||
updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||
}
|
||||
|
||||
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
|
||||
|
|
@ -338,71 +317,51 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
ACcountsClone[allele_i]++;
|
||||
ACcountsClone[allele_j]++;
|
||||
|
||||
// to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index)
|
||||
final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1);
|
||||
if ( allele_i == allele_j )
|
||||
sameAlleles.add(new DependentSet(ACcountsClone, ++PLindex));
|
||||
sameAlleles.add(new DependentSet(ACcountsClone, PLindex));
|
||||
else
|
||||
differentAlleles.add(new DependentSet(ACcountsClone, ++PLindex));
|
||||
differentAlleles.add(new DependentSet(ACcountsClone, PLindex));
|
||||
}
|
||||
}
|
||||
|
||||
// IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
|
||||
for ( DependentSet dependent : differentAlleles )
|
||||
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset);
|
||||
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||
for ( DependentSet dependent : sameAlleles )
|
||||
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset);
|
||||
}
|
||||
|
||||
// determine which is the last dependent set in the queue (not necessarily the last one added above) so we can know when it is safe to clean up this column
|
||||
if ( !preserveData ) {
|
||||
final ExactACset lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue);
|
||||
if ( lastSet != null )
|
||||
lastSet.dependentACsetsToDelete.add(set.ACcounts);
|
||||
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||
}
|
||||
|
||||
return log10LofK;
|
||||
}
|
||||
|
||||
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
|
||||
// also adds it as a dependency to the given callingSetIndex.
|
||||
// returns the ExactACset if that set was not already in the queue and null otherwise.
|
||||
private static void updateACset(final int[] ACcounts,
|
||||
// also pushes its value to the given callingSetIndex.
|
||||
private static void updateACset(final int[] newSetCounts,
|
||||
final int numChr,
|
||||
final ExactACset callingSet,
|
||||
final ExactACset dependentSet,
|
||||
final int PLsetIndex,
|
||||
final Queue<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
|
||||
final ExactACcounts index = new ExactACcounts(ACcounts);
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||
final ArrayList<double[]> genotypeLikelihoods) {
|
||||
final ExactACcounts index = new ExactACcounts(newSetCounts);
|
||||
if ( !indexesToACset.containsKey(index) ) {
|
||||
ExactACset set = new ExactACset(numChr/2 +1, index);
|
||||
indexesToACset.put(index, set);
|
||||
ACqueue.add(set);
|
||||
}
|
||||
|
||||
// add the given dependency to the set
|
||||
// push data from the dependency to the new set
|
||||
//if ( DEBUG )
|
||||
// System.out.println(" *** adding dependency from " + index + " to " + callingSet.ACcounts);
|
||||
final ExactACset set = indexesToACset.get(index);
|
||||
set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex);
|
||||
}
|
||||
|
||||
private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final LinkedList<ExactACset> ACqueue) {
|
||||
Iterator<ExactACset> reverseIterator = ACqueue.descendingIterator();
|
||||
while ( reverseIterator.hasNext() ) {
|
||||
final ExactACset queued = reverseIterator.next();
|
||||
if ( queued.ACsetIndexToPLIndex.containsKey(callingSetIndex) )
|
||||
return queued;
|
||||
}
|
||||
|
||||
// shouldn't get here
|
||||
throw new ReviewedStingException("Error: no sets in the queue currently hold " + callingSetIndex + " as a dependent!");
|
||||
// System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts);
|
||||
pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods);
|
||||
}
|
||||
|
||||
private static void computeLofK(final ExactACset set,
|
||||
final ArrayList<double[]> genotypeLikelihoods,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||
final double[][] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result,
|
||||
final double[][] tempLog10ConformationLikelihoods) {
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
set.log10Likelihoods[0] = 0.0; // the zero case
|
||||
final int totalK = set.getACsum();
|
||||
|
|
@ -414,42 +373,18 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
}
|
||||
// k > 0 for at least one k
|
||||
else {
|
||||
// deal with the non-AA possible conformations
|
||||
int conformationIndex = 1;
|
||||
for ( Map.Entry<ExactACcounts, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() ) {
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey());
|
||||
|
||||
ExactACset dependent = indexesToACset.get(mapping.getKey());
|
||||
|
||||
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
||||
|
||||
if ( totalK <= 2*j ) { // skip impossible conformations
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
tempLog10ConformationLikelihoods[j][conformationIndex] =
|
||||
determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + dependent.log10Likelihoods[j-1] + gl[mapping.getValue()];
|
||||
} else {
|
||||
tempLog10ConformationLikelihoods[j][conformationIndex] = Double.NEGATIVE_INFINITY;
|
||||
}
|
||||
}
|
||||
|
||||
conformationIndex++;
|
||||
}
|
||||
|
||||
// finally, deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
|
||||
final int numPaths = set.ACsetIndexToPLIndex.size() + 1;
|
||||
// the non-AA possible conformations were dealt with by pushes from dependent sets;
|
||||
// now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
|
||||
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
||||
|
||||
if ( totalK < 2*j-1 ) {
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
tempLog10ConformationLikelihoods[j][0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
|
||||
} else {
|
||||
tempLog10ConformationLikelihoods[j][0] = Double.NEGATIVE_INFINITY;
|
||||
final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
|
||||
set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue);
|
||||
}
|
||||
|
||||
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||
final double log10Max = MathUtils.approximateLog10SumLog10(tempLog10ConformationLikelihoods[j], numPaths);
|
||||
set.log10Likelihoods[j] = log10Max - logDenominator;
|
||||
set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -478,6 +413,23 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
}
|
||||
}
|
||||
|
||||
private static void pushData(final ExactACset targetSet,
|
||||
final ExactACset dependentSet,
|
||||
final int PLsetIndex,
|
||||
final ArrayList<double[]> genotypeLikelihoods) {
|
||||
final int totalK = targetSet.getACsum();
|
||||
|
||||
for ( int j = 1; j < targetSet.log10Likelihoods.length; j++ ) {
|
||||
|
||||
if ( totalK <= 2*j ) { // skip impossible conformations
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
final double conformationValue =
|
||||
determineCoefficient(PLsetIndex, j, targetSet.ACcounts.getCounts(), totalK) + dependentSet.log10Likelihoods[j-1] + gl[PLsetIndex];
|
||||
targetSet.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetSet.log10Likelihoods[j], conformationValue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
|
||||
|
||||
// the closed form representation generalized for multiple alleles is as follows:
|
||||
|
|
@ -488,25 +440,26 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
// BC: 2 * k_b * k_c
|
||||
// CC: k_c * (k_c - 1)
|
||||
|
||||
final int numAltAlleles = ACcounts.length;
|
||||
// find the 2 alleles that are represented by this PL index
|
||||
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
|
||||
|
||||
// *** note that throughout this method we subtract one from the alleleIndex because ACcounts ***
|
||||
// *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. ***
|
||||
|
||||
// the AX het case
|
||||
if ( PLindex <= numAltAlleles )
|
||||
return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK];
|
||||
if ( alleles.alleleIndex1 == 0 )
|
||||
return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK];
|
||||
|
||||
// find the 2 alternate alleles that are represented by this PL index
|
||||
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numAltAlleles][PLindex];
|
||||
|
||||
final int k_i = ACcounts[alleles[0]-1]; // subtract one because ACcounts doesn't consider the reference allele
|
||||
final int k_i = ACcounts[alleles.alleleIndex1-1];
|
||||
|
||||
// the hom var case (e.g. BB, CC, DD)
|
||||
final double coeff;
|
||||
if ( alleles[0] == alleles[1] ) {
|
||||
if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) {
|
||||
coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
|
||||
}
|
||||
// the het non-ref case (e.g. BC, BD, CD)
|
||||
else {
|
||||
final int k_j = ACcounts[alleles[1]-1];
|
||||
final int k_j = ACcounts[alleles.alleleIndex2-1];
|
||||
coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -240,6 +240,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
refAllele = Allele.create(refBases, true);
|
||||
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
|
||||
}
|
||||
else continue; // don't go on with this allele if refBases are non-standard
|
||||
} else {
|
||||
// insertion case
|
||||
if (Allele.acceptableAlleleBases(s)) {
|
||||
|
|
@ -247,6 +248,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
altAllele = Allele.create(s, false);
|
||||
stop = loc.getStart();
|
||||
}
|
||||
else continue; // go on to next allele if consensus insertion has any non-standard base.
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -30,7 +30,10 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
|
|
@ -53,10 +56,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
||||
super(UAC, logger);
|
||||
useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
|
||||
|
||||
// make sure the PL cache has been initialized with enough alleles
|
||||
if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null || UnifiedGenotyperEngine.PLIndexToAlleleIndex.length < 4 ) // +1 for 0 alt alleles
|
||||
UnifiedGenotyperEngine.calculatePLcache(3);
|
||||
}
|
||||
|
||||
public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
|
||||
|
|
@ -133,6 +132,16 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
}
|
||||
builder.alleles(alleles);
|
||||
|
||||
// create the PL ordering to use based on the allele ordering.
|
||||
final int[] PLordering = new int[numLikelihoods];
|
||||
for ( int i = 0; i <= numAltAlleles; i++ ) {
|
||||
for ( int j = i; j <= numAltAlleles; j++ ) {
|
||||
// As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j.
|
||||
// In other words, for biallelic sites the ordering is: AA,AB,BB; for triallelic sites the ordering is: AA,AB,BB,AC,BC,CC, etc."
|
||||
PLordering[(j * (j+1) / 2) + i] = DiploidGenotype.createDiploidGenotype(alleleOrdering[i], alleleOrdering[j]).ordinal();
|
||||
}
|
||||
}
|
||||
|
||||
// create the genotypes; no-call everyone for now
|
||||
final GenotypesContext genotypes = GenotypesContext.create();
|
||||
final List<Allele> noCall = new ArrayList<Allele>();
|
||||
|
|
@ -142,12 +151,8 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
final double[] allLikelihoods = sampleData.GL.getLikelihoods();
|
||||
final double[] myLikelihoods = new double[numLikelihoods];
|
||||
|
||||
int myLikelihoodsIndex = 0;
|
||||
for ( int i = 0; i <= numAltAlleles; i++ ) {
|
||||
for ( int j = i; j <= numAltAlleles; j++ ) {
|
||||
myLikelihoods[myLikelihoodsIndex++] = allLikelihoods[DiploidGenotype.createDiploidGenotype(alleleOrdering[i], alleleOrdering[j]).ordinal()];
|
||||
}
|
||||
}
|
||||
for ( int i = 0; i < numLikelihoods; i++ )
|
||||
myLikelihoods[i] = allLikelihoods[PLordering[i]];
|
||||
|
||||
// normalize in log space so that max element is zero.
|
||||
final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true));
|
||||
|
|
@ -174,12 +179,12 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
final double[] likelihoods = sampleData.GL.getLikelihoods();
|
||||
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
|
||||
if ( PLindexOfBestGL != PLindexOfRef ) {
|
||||
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[3][PLindexOfBestGL];
|
||||
if ( alleles[0] != baseIndexOfRef )
|
||||
likelihoodSums[alleles[0]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
|
||||
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePairUsingDeprecatedOrdering(PLindexOfBestGL);
|
||||
if ( alleles.alleleIndex1 != baseIndexOfRef )
|
||||
likelihoodSums[alleles.alleleIndex1] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
|
||||
// don't double-count it
|
||||
if ( alleles[1] != baseIndexOfRef && alleles[1] != alleles[0] )
|
||||
likelihoodSums[alleles[1]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
|
||||
if ( alleles.alleleIndex2 != baseIndexOfRef && alleles.alleleIndex2 != alleles.alleleIndex1 )
|
||||
likelihoodSums[alleles.alleleIndex2] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -203,7 +208,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
|
||||
public class BAQedPileupElement extends PileupElement {
|
||||
public BAQedPileupElement( final PileupElement PE ) {
|
||||
super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletion(), PE.isBeforeInsertion(), PE.isNextToSoftClip());
|
||||
super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletion(), PE.isAfterDeletion(), PE.isBeforeInsertion(), PE.isAfterInsertion(), PE.isNextToSoftClip());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -104,10 +104,6 @@ public class UnifiedGenotyperEngine {
|
|||
private final GenomeLocParser genomeLocParser;
|
||||
private final boolean BAQEnabledOnCMDLine;
|
||||
|
||||
// a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles
|
||||
// the representation is int[number of alternate alleles][PL index][pair of allele indexes (where reference = 0)]
|
||||
protected static int[][][] PLIndexToAlleleIndex;
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -140,27 +136,6 @@ public class UnifiedGenotyperEngine {
|
|||
genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL);
|
||||
|
||||
filter.add(LOW_QUAL_FILTER_NAME);
|
||||
calculatePLcache(UAC.MAX_ALTERNATE_ALLELES);
|
||||
}
|
||||
|
||||
protected static void calculatePLcache(int maxAltAlleles) {
|
||||
PLIndexToAlleleIndex = new int[maxAltAlleles+1][][];
|
||||
PLIndexToAlleleIndex[0] = new int[][]{ new int[]{0, 0} };
|
||||
int numLikelihoods = 1;
|
||||
|
||||
// for each count of alternate alleles
|
||||
for ( int altAlleles = 1; altAlleles <= maxAltAlleles; altAlleles++ ) {
|
||||
numLikelihoods += altAlleles + 1;
|
||||
PLIndexToAlleleIndex[altAlleles] = new int[numLikelihoods][];
|
||||
int PLindex = 0;
|
||||
|
||||
// for all possible combinations of the 2 alt alleles
|
||||
for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) {
|
||||
for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) {
|
||||
PLIndexToAlleleIndex[altAlleles][PLindex++] = new int[]{ allele1, allele2 };
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -794,21 +769,17 @@ public class UnifiedGenotyperEngine {
|
|||
if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) {
|
||||
likelihoodIndexesToUse = new ArrayList<Integer>(30);
|
||||
|
||||
// make sure that we've cached enough data
|
||||
if ( numOriginalAltAlleles > PLIndexToAlleleIndex.length - 1 )
|
||||
calculatePLcache(numOriginalAltAlleles);
|
||||
final int[][] PLcache = PLIndexToAlleleIndex[numOriginalAltAlleles];
|
||||
|
||||
final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles];
|
||||
for ( int i = 0; i < numOriginalAltAlleles; i++ ) {
|
||||
if ( allelesToUse.contains(vc.getAlternateAllele(i)) )
|
||||
altAlleleIndexToUse[i] = true;
|
||||
}
|
||||
|
||||
for ( int PLindex = 0; PLindex < PLcache.length; PLindex++ ) {
|
||||
final int[] alleles = PLcache[PLindex];
|
||||
final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(numOriginalAltAlleles);
|
||||
for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) {
|
||||
final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
|
||||
// consider this entry only if both of the alleles are good
|
||||
if ( (alleles[0] == 0 || altAlleleIndexToUse[alleles[0] - 1]) && (alleles[1] == 0 || altAlleleIndexToUse[alleles[1] - 1]) )
|
||||
if ( (alleles.alleleIndex1 == 0 || altAlleleIndexToUse[alleles.alleleIndex1 - 1]) && (alleles.alleleIndex2 == 0 || altAlleleIndexToUse[alleles.alleleIndex2 - 1]) )
|
||||
likelihoodIndexesToUse.add(PLindex);
|
||||
}
|
||||
}
|
||||
|
|
@ -861,11 +832,11 @@ public class UnifiedGenotyperEngine {
|
|||
protected static Genotype assignGenotype(final Genotype originalGT, final double[] newLikelihoods, final List<Allele> allelesToUse, final int numNewAltAlleles, final Map<String, Object> attrs) {
|
||||
// find the genotype with maximum likelihoods
|
||||
int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);
|
||||
int[] alleles = PLIndexToAlleleIndex[numNewAltAlleles][PLindex];
|
||||
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
|
||||
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||
myAlleles.add(allelesToUse.get(alleles[0]));
|
||||
myAlleles.add(allelesToUse.get(alleles[1]));
|
||||
myAlleles.add(allelesToUse.get(alleles.alleleIndex1));
|
||||
myAlleles.add(allelesToUse.get(alleles.alleleIndex2));
|
||||
|
||||
final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods);
|
||||
return new Genotype(originalGT.getSampleName(), myAlleles, qual, null, attrs, false);
|
||||
|
|
|
|||
|
|
@ -243,6 +243,19 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
|
||||
if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.BY_TYPE) {
|
||||
Map<VariantContext.Type, List<VariantContext>> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs);
|
||||
|
||||
// TODO -- clean this up in a refactoring
|
||||
// merge NO_VARIATION into another type of variant (based on the ordering in VariantContext.Type)
|
||||
if ( VCsByType.containsKey(VariantContext.Type.NO_VARIATION) && VCsByType.size() > 1 ) {
|
||||
final List<VariantContext> refs = VCsByType.remove(VariantContext.Type.NO_VARIATION);
|
||||
for ( VariantContext.Type type : VariantContext.Type.values() ) {
|
||||
if ( VCsByType.containsKey(type) ) {
|
||||
VCsByType.get(type).addAll(refs);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// iterate over the types so that it's deterministic
|
||||
for (VariantContext.Type type : VariantContext.Type.values()) {
|
||||
if (VCsByType.containsKey(type))
|
||||
|
|
|
|||
|
|
@ -216,12 +216,12 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
|||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variants.getName())));
|
||||
//hInfo.add(new VCFHeaderLine("source", "VariantsToVCF"));
|
||||
//hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
|
||||
//hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getID()));
|
||||
|
||||
allowedGenotypeFormatStrings.add(VCFConstants.GENOTYPE_KEY);
|
||||
for ( VCFHeaderLine field : hInfo ) {
|
||||
if ( field instanceof VCFFormatHeaderLine) {
|
||||
allowedGenotypeFormatStrings.add(((VCFFormatHeaderLine)field).getName());
|
||||
allowedGenotypeFormatStrings.add(((VCFFormatHeaderLine)field).getID());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,284 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.ObjectOutputStream;
|
||||
import java.util.BitSet;
|
||||
|
||||
/**
|
||||
* Utilities for bitset conversion
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 3/5/12
|
||||
*/
|
||||
public class BitSetUtils {
|
||||
|
||||
static final private int MAX_DNA_CONTEXT = 31; // the maximum context size (number of bases) permitted in the "long bitset" implementation of the DNA <=> BitSet conversion.
|
||||
static final private byte NBITS_LONG_REPRESENTATION = 64; // the number of bits used in the long version to represent the bit set (necessary for the two's complement representation of negative numbers)
|
||||
static final private byte NBITS_SHORT_REPRESENTATION = 16; // the number of bits used in the short version to represent the bit set (necessary for the two's complement representation of negative numbers)
|
||||
static final long[] combinationsPerLength = new long[MAX_DNA_CONTEXT + 1]; // keeps the memoized table with the number of combinations for each given DNA context length
|
||||
|
||||
/**
|
||||
* Creates an long out of a bitset
|
||||
*
|
||||
* @param bitSet the bitset
|
||||
* @return a long from the bitset representation
|
||||
*/
|
||||
public static long longFrom(final BitSet bitSet) {
|
||||
return longFrom(bitSet, NBITS_LONG_REPRESENTATION);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a short integer from a bitset
|
||||
*
|
||||
* @param bitSet the bitset
|
||||
* @return a short from the bitset representation
|
||||
*/
|
||||
public static short shortFrom(final BitSet bitSet) {
|
||||
return (short) longFrom(bitSet, NBITS_SHORT_REPRESENTATION);
|
||||
}
|
||||
|
||||
/**
|
||||
* Cretes an integer with any number of bits (up to 64 -- long precision) from a bitset
|
||||
*
|
||||
* @param bitSet the bitset
|
||||
* @param nBits the number of bits to be used for this representation
|
||||
* @return an integer with nBits from the bitset representation
|
||||
*/
|
||||
public static long longFrom(final BitSet bitSet, final int nBits) {
|
||||
long number = 0;
|
||||
for (int bitIndex = bitSet.nextSetBit(0); bitIndex >= 0 && bitIndex <= nBits; bitIndex = bitSet.nextSetBit(bitIndex + 1))
|
||||
number |= 1L << bitIndex;
|
||||
|
||||
return number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a BitSet representation of a given long
|
||||
*
|
||||
* @param number the number to turn into a bitset
|
||||
* @return a bitset representation of the long
|
||||
*/
|
||||
public static BitSet bitSetFrom(long number) {
|
||||
return bitSetFrom(number, NBITS_LONG_REPRESENTATION);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a BitSet representation of a given short
|
||||
*
|
||||
* @param number the number to turn into a bitset
|
||||
* @return a bitset representation of the short
|
||||
*/
|
||||
public static BitSet bitSetFrom(short number) {
|
||||
return bitSetFrom(number, NBITS_SHORT_REPRESENTATION);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a BitSet representation of an arbitrary integer (number of bits capped at 64 -- long precision)
|
||||
*
|
||||
* @param number the number to turn into a bitset
|
||||
* @param nBits the number of bits to use as precision for this conversion
|
||||
* @return a bitset representation of the integer
|
||||
*/
|
||||
public static BitSet bitSetFrom(long number, int nBits) {
|
||||
BitSet bitSet = new BitSet();
|
||||
boolean isNegative = number < 0;
|
||||
int bitIndex = 0;
|
||||
while (number != 0) {
|
||||
if (number % 2 != 0)
|
||||
bitSet.set(bitIndex);
|
||||
bitIndex++;
|
||||
number /= 2;
|
||||
}
|
||||
if (isNegative) {
|
||||
boolean foundFirstSetBit = false;
|
||||
for (int i = bitSet.nextSetBit(0); i < nBits && i >= 0; i++) {
|
||||
boolean bit = bitSet.get(i);
|
||||
if (!foundFirstSetBit && bit)
|
||||
foundFirstSetBit = true; // maintain all bits until the first 1 is found (inclusive)
|
||||
else if (foundFirstSetBit)
|
||||
bitSet.flip(i); // flip every other bit up to NBITS_REPRESENTATION
|
||||
}
|
||||
}
|
||||
return bitSet;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a BitSet into the dna string representation.
|
||||
*
|
||||
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
|
||||
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
|
||||
* a bitSetFrom(BigNumber) method.
|
||||
*
|
||||
* We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the
|
||||
* base_10 representation of the sequence. This is important for us to know how to bring the number
|
||||
* to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented
|
||||
* as 0's and leading 0's are omitted).
|
||||
*
|
||||
* quasi-canonical because A is represented by a 0, therefore,
|
||||
* instead of : 0, 1, 2, 3, 10, 11, 12, ...
|
||||
* we have : 0, 1, 2, 3, 00, 01, 02, ...
|
||||
*
|
||||
* but we can correctly decode it because we know the final length.
|
||||
*
|
||||
* @param bitSet the bitset representation of the dna sequence
|
||||
* @return the dna sequence represented by the bitset
|
||||
*/
|
||||
public static String dnaFrom(final BitSet bitSet) {
|
||||
long number = longFrom(bitSet); // the base_10 representation of the bit set
|
||||
if (number < 0)
|
||||
throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?");
|
||||
|
||||
int length = contextLengthFor(number); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls)
|
||||
number -= combinationsFor(length - 1); // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation
|
||||
|
||||
String dna = "";
|
||||
while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical)
|
||||
byte base = (byte) (number % 4);
|
||||
switch (base) {
|
||||
case 0:
|
||||
dna = "A" + dna;
|
||||
break;
|
||||
case 1:
|
||||
dna = "C" + dna;
|
||||
break;
|
||||
case 2:
|
||||
dna = "G" + dna;
|
||||
break;
|
||||
case 3:
|
||||
dna = "T" + dna;
|
||||
break;
|
||||
}
|
||||
number /= 4;
|
||||
}
|
||||
for (int j = dna.length(); j < length; j++)
|
||||
dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above)
|
||||
|
||||
return dna;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a BitSet representation of a given dna string.
|
||||
*
|
||||
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
|
||||
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
|
||||
* a bitSetFrom(BigNumber) method.
|
||||
*
|
||||
* The bit representation of a dna string is the simple:
|
||||
* 0 A 4 AA 8 CA
|
||||
* 1 C 5 AC ...
|
||||
* 2 G 6 AG 1343 TTGGT
|
||||
* 3 T 7 AT 1364 TTTTT
|
||||
*
|
||||
* To convert from dna to number, we convert the dna string to base10 and add all combinations that
|
||||
* preceded the string (with smaller lengths).
|
||||
*
|
||||
* @param dna the dna sequence
|
||||
* @return the bitset representing the dna sequence
|
||||
*/
|
||||
public static BitSet bitSetFrom(String dna) {
|
||||
if (dna.length() > MAX_DNA_CONTEXT)
|
||||
throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length()));
|
||||
|
||||
long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
|
||||
long preContext = combinationsFor(dna.length() - 1); // the sum of all combinations that preceded the length of the dna string
|
||||
for (int i = 0; i < dna.length(); i++) {
|
||||
baseTen *= 4;
|
||||
switch (dna.charAt(i)) {
|
||||
case 'A':
|
||||
baseTen += 0;
|
||||
break;
|
||||
case 'C':
|
||||
baseTen += 1;
|
||||
break;
|
||||
case 'G':
|
||||
baseTen += 2;
|
||||
break;
|
||||
case 'T':
|
||||
baseTen += 3;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return bitSetFrom(baseTen + preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the number of bits necessary to represent a given number of elements
|
||||
*
|
||||
* @param numberOfElements the number of elements to represent (must be positive)
|
||||
* @return the number of bits necessary to represent this many elements
|
||||
*/
|
||||
public static int numberOfBitsToRepresent(long numberOfElements) {
|
||||
if (numberOfElements < 0)
|
||||
throw new ReviewedStingException("Number of elements must be positive: " + numberOfElements);
|
||||
|
||||
if (numberOfElements == 1L)
|
||||
return 1; // special case
|
||||
|
||||
int n = 0;
|
||||
numberOfElements--;
|
||||
while (numberOfElements > 0) {
|
||||
numberOfElements = numberOfElements >> 1;
|
||||
n++;
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the length of the DNA context for a given base 10 number
|
||||
*
|
||||
* It is important to know the length given the base 10 number to calculate the number of combinations
|
||||
* and to disambiguate the "quasi-canonical" state.
|
||||
*
|
||||
* This method also calculates the number of combinations as a by-product, but since it memoizes the
|
||||
* results, a subsequent call to combinationsFor(length) is O(1).
|
||||
*
|
||||
* @param number the base 10 representation of the bitset
|
||||
* @return the length of the DNA context represented by this number
|
||||
*/
|
||||
private static int contextLengthFor(long number) {
|
||||
int length = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet.
|
||||
long combinations = combinationsFor(length); // the next context (we advance it so we know which one was preceding it).
|
||||
while (combinations <= number) { // find the length of the dna string (length)
|
||||
length++;
|
||||
combinations = combinationsFor(length); // calculate the next context
|
||||
}
|
||||
return length;
|
||||
}
|
||||
|
||||
/**
|
||||
* The sum of all combinations of a context of a given length from length = 0 to length.
|
||||
*
|
||||
* Memoized implementation of sum(4^i) , where i=[0,length]
|
||||
*
|
||||
* @param length the length of the DNA context
|
||||
* @return the sum of all combinations leading up to this context length.
|
||||
*/
|
||||
private static long combinationsFor(int length) {
|
||||
if (length > MAX_DNA_CONTEXT)
|
||||
throw new ReviewedStingException(String.format("Context cannot be longer than %d bases but requested %d.", MAX_DNA_CONTEXT, length));
|
||||
|
||||
// only calculate the number of combinations if the table hasn't already cached the value
|
||||
if (length > 0 && combinationsPerLength[length] == 0) {
|
||||
long combinations = 0L;
|
||||
for (int i = 1; i <= length; i++)
|
||||
combinations += (1L << 2 * i); // add all combinations with 4^i ( 4^i is the same as 2^(2*i) )
|
||||
combinationsPerLength[length] = combinations;
|
||||
}
|
||||
return combinationsPerLength[length];
|
||||
}
|
||||
|
||||
|
||||
public static byte[] sizeOf(Object obj) throws java.io.IOException
|
||||
{
|
||||
ByteArrayOutputStream byteObject = new ByteArrayOutputStream();
|
||||
ObjectOutputStream objectOutputStream = new ObjectOutputStream(byteObject);
|
||||
objectOutputStream.writeObject(obj);
|
||||
objectOutputStream.flush();
|
||||
objectOutputStream.close();
|
||||
byteObject.close();
|
||||
|
||||
return byteObject.toByteArray();
|
||||
}
|
||||
}
|
||||
|
|
@ -24,6 +24,7 @@
|
|||
|
||||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
|
|
@ -32,16 +33,14 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
public class Haplotype {
|
||||
protected final byte[] bases;
|
||||
protected final double[] quals;
|
||||
private GenomeLoc genomeLocation = null;
|
||||
private boolean isReference = false;
|
||||
private HashMap<String, double[]> readLikelihoodsPerSample = null;
|
||||
private boolean isRef = false;
|
||||
|
||||
/**
|
||||
* Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual
|
||||
|
|
@ -69,16 +68,35 @@ public class Haplotype {
|
|||
this.genomeLocation = loc;
|
||||
}
|
||||
|
||||
/**
 * Creates a haplotype through the (bases, loc) constructor and records whether it is the reference.
 *
 * @param bases the bases of the haplotype
 * @param loc   the genomic location, forwarded to the (bases, loc) constructor
 * @param isRef value stored in the isReference flag
 */
public Haplotype(byte[] bases, GenomeLoc loc, boolean isRef) {
|
||||
this(bases, loc);
|
||||
this.isReference = isRef;
|
||||
}
|
||||
|
||||
/**
 * Two haplotypes are equal when they carry exactly the same bases; no other field is compared.
 *
 * NOTE(review): no matching hashCode() is visible in this excerpt -- confirm one exists before
 * using Haplotype as a hash key.
 *
 * @param h the object to compare against
 * @return true if h is a Haplotype whose bases array has the same contents as this one's
 */
@Override
|
||||
public boolean equals( Object h ) {
|
||||
return h instanceof Haplotype && Arrays.equals(bases, ((Haplotype) h).bases);
|
||||
}
|
||||
|
||||
/**
 * Stores the read likelihoods for a sample, replacing any array previously stored for it.
 * The per-sample map is created lazily on the first call.
 *
 * @param sample          the sample name used as the map key
 * @param readLikelihoods the likelihood array to associate with the sample (stored by reference, not copied)
 */
public void addReadLikelihoods( final String sample, final double[] readLikelihoods ) {
|
||||
if( readLikelihoodsPerSample == null ) {
|
||||
readLikelihoodsPerSample = new HashMap<String, double[]>();
|
||||
}
|
||||
readLikelihoodsPerSample.put(sample, readLikelihoods);
|
||||
}
|
||||
|
||||
/**
 * Looks up the read likelihoods previously stored for a sample.
 *
 * NOTE(review): the map is only created by addReadLikelihoods, so calling this before any
 * likelihoods were added dereferences null; a sample that was never added yields null from the
 * map lookup, which conflicts with the declared postcondition -- confirm callers always add first.
 *
 * @param sample the sample name
 * @return the likelihood array stored for the sample
 */
@Ensures({"result != null"})
|
||||
public double[] getReadLikelihoods( final String sample ) {
|
||||
return readLikelihoodsPerSample.get(sample);
|
||||
}
|
||||
|
||||
/**
 * Returns the key set of the per-sample likelihood map (a live view of the map, not a copy).
 *
 * NOTE(review): dereferences the map without a null check; the map is only created by
 * addReadLikelihoods -- confirm callers add likelihoods first.
 *
 * @return the names of all samples that have read likelihoods stored
 */
public Set<String> getSampleKeySet() {
|
||||
return readLikelihoodsPerSample.keySet();
|
||||
}
|
||||
|
||||
/**
 * @return the current value of the isRef flag (set through setIsReference)
 */
public boolean isReference() {
|
||||
return isRef;
|
||||
}
|
||||
|
||||
/**
 * Sets the flag reported by isReference().
 *
 * @param isRef the new value of the flag
 */
public void setIsReference( boolean isRef ) {
|
||||
this.isRef = isRef;
|
||||
}
|
||||
|
||||
public double getQualitySum() {
|
||||
double s = 0;
|
||||
for (int k=0; k < bases.length; k++) {
|
||||
|
|
@ -87,6 +105,7 @@ public class Haplotype {
|
|||
return s;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
String returnString = "";
|
||||
for(int iii = 0; iii < bases.length; iii++) {
|
||||
|
|
@ -110,10 +129,6 @@ public class Haplotype {
|
|||
return genomeLocation.getStop();
|
||||
}
|
||||
|
||||
/**
 * @return the current value of the isReference flag
 */
public boolean isReference() {
|
||||
return isReference;
|
||||
}
|
||||
|
||||
@Requires({"refInsertLocation >= 0", "hapStartInRefCoords >= 0"})
|
||||
public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation, final int hapStartInRefCoords, final Cigar haplotypeCigar ) {
|
||||
|
||||
|
|
@ -208,13 +223,14 @@ public class Haplotype {
|
|||
String haplotypeString = new String(basesBeforeVariant) + new String(alleleBases) + new String(basesAfterVariant);
|
||||
haplotypeString = haplotypeString.substring(0,haplotypeSize);
|
||||
|
||||
haplotypeMap.put(a,new Haplotype(haplotypeString.getBytes(), locus, a.isReference()));
|
||||
haplotypeMap.put(a,new Haplotype(haplotypeString.getBytes(), locus));
|
||||
|
||||
}
|
||||
|
||||
return haplotypeMap;
|
||||
}
|
||||
|
||||
// BUGBUG: copied from ReadClipper and slightly modified since we don't have the data in a GATKSAMRecord
|
||||
private static Integer getHaplotypeCoordinateForReferenceCoordinate( final int haplotypeStart, final Cigar haplotypeCigar, final int refCoord ) {
|
||||
int readBases = 0;
|
||||
int refBases = 0;
|
||||
|
|
|
|||
|
|
@ -29,7 +29,6 @@ import com.google.java.contract.Ensures;
|
|||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
|
|
@ -1527,124 +1526,4 @@ public class MathUtils {
|
|||
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an integer out of a bitset
|
||||
*
|
||||
* @param bitSet the bitset
|
||||
* @return an integer with the bitset representation
|
||||
*/
|
||||
public static long intFrom(final BitSet bitSet) {
|
||||
long number = 0;
|
||||
for (int bitIndex = bitSet.nextSetBit(0); bitIndex >= 0; bitIndex = bitSet.nextSetBit(bitIndex+1))
|
||||
number |= 1L << bitIndex;
|
||||
|
||||
return number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a BitSet representation of a given integer
|
||||
*
|
||||
* @param number the number to turn into a bitset
|
||||
* @return a bitset representation of the integer
|
||||
*/
|
||||
public static BitSet bitSetFrom(long number) {
|
||||
BitSet bitSet = new BitSet();
|
||||
int bitIndex = 0;
|
||||
while (number > 0) {
|
||||
if (number%2 > 0)
|
||||
bitSet.set(bitIndex);
|
||||
bitIndex++;
|
||||
number /= 2;
|
||||
}
|
||||
return bitSet;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a BitSet into the dna string representation.
|
||||
*
|
||||
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
|
||||
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
|
||||
* a bitSetFrom(BigNumber) method.
|
||||
*
|
||||
* We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the
|
||||
* base_10 representation of the sequence. This is important for us to know how to bring the number
|
||||
* to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented
|
||||
* as 0's and leading 0's are omitted).
|
||||
*
|
||||
* quasi-canonical because A is represented by a 0, therefore,
|
||||
* instead of : 0, 1, 2, 3, 10, 11, 12, ...
|
||||
* we have : 0, 1, 2, 3, 00, 01, 02, ...
|
||||
*
|
||||
* but we can correctly decode it because we know the final length.
|
||||
*
|
||||
* @param bitSet the bitset representation of the dna sequence
|
||||
* @return the dna sequence represented by the bitset
|
||||
*/
|
||||
public static String dnaFrom(final BitSet bitSet) {
|
||||
long number = intFrom(bitSet); // the base_10 representation of the bit set
|
||||
long preContext = 0; // the number of combinations skipped to get to the quasi-canonical representation (we keep it to subtract later)
|
||||
long nextContext = 4; // the next context (we advance it so we know which one was preceding it).
|
||||
int i = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet.
|
||||
while (nextContext <= number) { // find the length of the dna string (i)
|
||||
preContext = nextContext; // keep track of the number of combinations in the preceding context
|
||||
nextContext += Math.pow(4, ++i);// calculate the next context
|
||||
}
|
||||
number -= preContext; // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation
|
||||
|
||||
String dna = "";
|
||||
while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical)
|
||||
byte base = (byte) (number % 4);
|
||||
switch (base) {
|
||||
case 0 : dna = "A" + dna; break;
|
||||
case 1 : dna = "C" + dna; break;
|
||||
case 2 : dna = "G" + dna; break;
|
||||
case 3 : dna = "T" + dna; break;
|
||||
}
|
||||
number /= 4;
|
||||
}
|
||||
for (int j = dna.length(); j < i; j++)
|
||||
dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above)
|
||||
|
||||
return dna;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a BitSet representation of a given dna string.
|
||||
*
|
||||
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
|
||||
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
|
||||
* a bitSetFrom(BigNumber) method.
|
||||
*
|
||||
* The bit representation of a dna string is the simple:
|
||||
* 0 A 4 AA 8 CA
|
||||
* 1 C 5 AC ...
|
||||
* 2 G 6 AG 1343 TTGGT
|
||||
* 3 T 7 AT 1364 TTTTT
|
||||
*
|
||||
* To convert from dna to number, we convert the dna string to base10 and add all combinations that
|
||||
* preceded the string (with smaller lengths).
|
||||
*
|
||||
* @param dna the dna sequence
|
||||
* @return the bitset representing the dna sequence
|
||||
*/
|
||||
public static BitSet bitSetFrom(String dna) {
|
||||
if (dna.length() > 31)
|
||||
throw new ReviewedStingException(String.format("DNA Length cannot be bigger than 31. dna: %s (%d)", dna, dna.length()));
|
||||
|
||||
long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
|
||||
long preContext = 0; // the sum of all combinations that preceded the length of the dna string
|
||||
for (int i=0; i<dna.length(); i++) {
|
||||
baseTen *= 4;
|
||||
switch(dna.charAt(i)) {
|
||||
case 'A': baseTen += 0; break;
|
||||
case 'C': baseTen += 1; break;
|
||||
case 'G': baseTen += 2; break;
|
||||
case 'T': baseTen += 3; break;
|
||||
}
|
||||
if (i>0)
|
||||
preContext += Math.pow(4, i); // each length will have 4^i combinations (e.g 1 = 4, 2 = 16, 3 = 64, ...)
|
||||
}
|
||||
|
||||
return bitSetFrom(baseTen+preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -22,6 +22,16 @@ public class QualityUtils {
|
|||
for (int i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw(i);
|
||||
}
|
||||
|
||||
// Lookup table with one entry per possible quality value (0-255); entry i holds
// qualToErrorProbLog10Raw(i) and is filled once when the class is initialized.
private static double qualToErrorProbLog10Cache[] = new double[256];
|
||||
static {
|
||||
for (int i = 0; i < 256; i++) qualToErrorProbLog10Cache[i] = qualToErrorProbLog10Raw(i);
|
||||
}
|
||||
|
||||
// Lookup table with one entry per possible quality value (0-255); entry i holds
// qualToProbLog10Raw(i) and is filled once when the class is initialized.
private static double qualToProbLog10Cache[] = new double[256];
|
||||
static {
|
||||
for (int i = 0; i < 256; i++) qualToProbLog10Cache[i] = qualToProbLog10Raw(i);
|
||||
}
|
||||
|
||||
/**
|
||||
* Private constructor. No instantiating this class!
|
||||
*/
|
||||
|
|
@ -31,7 +41,7 @@ public class QualityUtils {
|
|||
* Convert a quality score to a probability. This is the Phred-style
|
||||
* conversion, *not* the Illumina-style conversion (though asymptotically, they're the same).
|
||||
*
|
||||
* @param qual a quality score (0-40)
|
||||
* @param qual a quality score (0-255)
|
||||
* @return a probability (0.0-1.0)
|
||||
*/
|
||||
static public double qualToProb(byte qual) {
|
||||
|
|
@ -42,6 +52,14 @@ public class QualityUtils {
|
|||
return 1.0 - Math.pow(10.0, qual/(-10.0));
|
||||
}
|
||||
|
||||
/**
 * Uncached computation behind qualToProbLog10: log10 of one minus the value returned by
 * qualToErrorProbRaw for the quality.
 *
 * @param qual a quality score
 * @return log10(1 - qualToErrorProbRaw(qual))
 */
static private double qualToProbLog10Raw(int qual) {
|
||||
return Math.log10(1.0 - qualToErrorProbRaw(qual));
|
||||
}
|
||||
|
||||
/**
 * Cached lookup of qualToProbLog10Raw for a quality score.
 *
 * The byte is interpreted as an unsigned value (0-255) before indexing the cache.
 *
 * @param qual a quality score, read as unsigned
 * @return the entry of qualToProbLog10Cache for the quality
 */
static public double qualToProbLog10(byte qual) {
|
||||
return qualToProbLog10Cache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc.
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a quality score to a probability of error. This is the Phred-style
|
||||
* conversion, *not* the Illumina-style conversion (though asymptotically, they're the same).
|
||||
|
|
@ -57,14 +75,14 @@ public class QualityUtils {
|
|||
return qualToErrorProbCache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc.
|
||||
}
|
||||
|
||||
static public double[] qualArrayToLog10ErrorProb(byte[] quals) {
|
||||
double[] returnArray = new double[quals.length];
|
||||
for( int iii = 0; iii < quals.length; iii++ ) {
|
||||
returnArray[iii] = ((double) quals[iii])/-10.0;
|
||||
}
|
||||
return returnArray;
|
||||
static private double qualToErrorProbLog10Raw(int qual) {
|
||||
return ((double) qual)/-10.0;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Cached lookup of qualToErrorProbLog10Raw for a quality score.
 *
 * The byte is interpreted as an unsigned value (0-255) before indexing the cache.
 *
 * @param qual a quality score, read as unsigned
 * @return the entry of qualToErrorProbLog10Cache for the quality
 */
static public double qualToErrorProbLog10(byte qual) {
|
||||
return qualToErrorProbLog10Cache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc.
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a probability to a quality score. Note, this is capped at Q40.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -25,9 +25,14 @@
|
|||
|
||||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMProgramRecord;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
|
||||
|
||||
import java.net.InetAddress;
|
||||
import java.util.*;
|
||||
|
|
@ -668,4 +673,34 @@ public class Utils {
|
|||
array[i] = value;
|
||||
}
|
||||
|
||||
public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, boolean preSorted, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) {
|
||||
final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME);
|
||||
|
||||
SAMFileHeader header = toolkit.getSAMFileHeader();
|
||||
List<SAMProgramRecord> oldRecords = header.getProgramRecords();
|
||||
List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size()+1);
|
||||
for ( SAMProgramRecord record : oldRecords )
|
||||
if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) || KEEP_ALL_PG_RECORDS )
|
||||
newRecords.add(record);
|
||||
|
||||
newRecords.add(programRecord);
|
||||
header.setProgramRecords(newRecords);
|
||||
|
||||
writer.writeHeader(header);
|
||||
writer.setPresorted(preSorted);
|
||||
}
|
||||
|
||||
public static SAMProgramRecord createProgramRecord(GenomeAnalysisEngine toolkit, Object walker, String PROGRAM_RECORD_NAME) {
|
||||
final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME);
|
||||
final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText");
|
||||
try {
|
||||
final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version");
|
||||
programRecord.setProgramVersion(version);
|
||||
} catch (MissingResourceException e) {
|
||||
// couldn't care less if the resource is missing...
|
||||
}
|
||||
programRecord.setCommandLine(toolkit.createApproximateCommandLineArgumentString(toolkit, walker));
|
||||
return programRecord;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -34,6 +34,11 @@ public class ActiveRegion implements HasGenomeLocation {
|
|||
fullExtentReferenceLoc = extendedLoc;
|
||||
}
|
||||
|
||||
/**
 * @return the text "ActiveRegion " followed by the string form of activeRegionLoc
 */
@Override
|
||||
public String toString() {
|
||||
return "ActiveRegion " + activeRegionLoc.toString();
|
||||
}
|
||||
|
||||
// add each read to the bin and extend the reference genome activeRegionLoc if needed
|
||||
public void add( final GATKSAMRecord read ) {
|
||||
fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) );
|
||||
|
|
@ -78,4 +83,13 @@ public class ActiveRegion implements HasGenomeLocation {
|
|||
/** Removes every read from this region's reads collection. */
public void clearReads() { reads.clear(); }
|
||||
/** Removes the given read from this region's reads collection. */
public void remove( final GATKSAMRecord read ) { reads.remove( read ); }
|
||||
/** Removes all of the given reads from this region's reads collection. */
public void removeAll( final ArrayList<GATKSAMRecord> readsToRemove ) { reads.removeAll( readsToRemove ); }
|
||||
|
||||
public boolean equalExceptReads(final ActiveRegion other) {
|
||||
if ( ! activeRegionLoc.equals(other.activeRegionLoc)) return false;
|
||||
if ( isActive != other.isActive ) return false;
|
||||
if ( genomeLocParser != other.genomeLocParser ) return false;
|
||||
if ( extension != other.extension ) return false;
|
||||
if ( ! extendedLoc.equals(other.extendedLoc) ) return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,148 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.activeregion;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Class holding information about per-base activity scores for the
|
||||
* active region traversal
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since Date created
|
||||
*/
|
||||
public class ActivityProfile {
|
||||
final GenomeLocParser parser;
|
||||
final boolean presetRegions;
|
||||
GenomeLoc regionStartLoc = null;
|
||||
final List<Double> isActiveList;
|
||||
|
||||
private GenomeLoc lastLoc = null;
|
||||
|
||||
// todo -- add upfront the start and stop of the intervals
|
||||
// todo -- check that no regions are unexpectedly missing
|
||||
// todo -- add unit tests
|
||||
// TODO -- own preset regions
|
||||
public ActivityProfile(final GenomeLocParser parser, final boolean presetRegions) {
|
||||
this(parser, presetRegions, new ArrayList<Double>(), null);
|
||||
}
|
||||
|
||||
protected ActivityProfile(final GenomeLocParser parser, final boolean presetRegions, final List<Double> isActiveList, final GenomeLoc regionStartLoc) {
|
||||
this.parser = parser;
|
||||
this.presetRegions = presetRegions;
|
||||
this.isActiveList = isActiveList;
|
||||
this.regionStartLoc = regionStartLoc;
|
||||
}
|
||||
|
||||
public void add(final GenomeLoc loc, final double score) {
|
||||
if ( loc.size() != 1 )
|
||||
throw new ReviewedStingException("Bad add call to ActivityProfile: loc " + loc + " size != 1" );
|
||||
if ( lastLoc != null && loc.getStart() != lastLoc.getStop() + 1 )
|
||||
throw new ReviewedStingException("Bad add call to ActivityProfile: lastLoc added " + lastLoc + " and next is " + loc);
|
||||
isActiveList.add(score);
|
||||
if( regionStartLoc == null ) {
|
||||
regionStartLoc = loc;
|
||||
}
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return isActiveList.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Band pass this ActivityProfile, producing a new profile that's band pass filtered
|
||||
* @return a new ActivityProfile that's the band-pass filtered version of this profile
|
||||
*/
|
||||
public ActivityProfile bandPassFilter() {
|
||||
final Double[] activeProbArray = isActiveList.toArray(new Double[isActiveList.size()]);
|
||||
final Double[] filteredProbArray = new Double[activeProbArray.length];
|
||||
final int FILTER_SIZE = ( presetRegions ? 0 : 50 ); // TODO: needs to be set-able by the walker author
|
||||
for( int iii = 0; iii < activeProbArray.length; iii++ ) {
|
||||
double maxVal = 0;
|
||||
for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(isActiveList.size(), iii+FILTER_SIZE+1); jjj++ ) {
|
||||
if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; }
|
||||
}
|
||||
filteredProbArray[iii] = maxVal;
|
||||
}
|
||||
|
||||
return new ActivityProfile(parser, presetRegions, Arrays.asList(filteredProbArray), regionStartLoc);
|
||||
}
|
||||
|
||||
/**
|
||||
* Partition this profile into active regions
|
||||
* @param activeRegionExtension
|
||||
* @return
|
||||
*/
|
||||
public List<ActiveRegion> createActiveRegions( final int activeRegionExtension ) {
|
||||
final int MAX_ACTIVE_REGION = ( presetRegions ? 16001 : 425 ); // TODO: needs to be set-able by the walker author
|
||||
final double ACTIVE_PROB_THRESHOLD = 0.2; // TODO: needs to be set-able by the walker author
|
||||
|
||||
if( isActiveList.size() == 0 ) {
|
||||
// no elements in the active list, just return an empty one
|
||||
return Collections.emptyList();
|
||||
} else if( isActiveList.size() == 1 ) {
|
||||
// there's a single element, it's either active or inactive
|
||||
boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD;
|
||||
final ActiveRegion region = createActiveRegion(isActive, 0, 0, activeRegionExtension );
|
||||
return Collections.singletonList(region);
|
||||
} else {
|
||||
// there are 2+ elements, divide these up into regions
|
||||
final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
|
||||
boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD;
|
||||
int curStart = 0;
|
||||
for(int iii = 1; iii < isActiveList.size(); iii++ ) {
|
||||
final boolean thisStatus = isActiveList.get(iii) > ACTIVE_PROB_THRESHOLD;
|
||||
if( isActive != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) {
|
||||
returnList.add( createActiveRegion(isActive, curStart, iii-1, activeRegionExtension) );
|
||||
isActive = thisStatus;
|
||||
curStart = iii;
|
||||
}
|
||||
}
|
||||
returnList.add( createActiveRegion(isActive, curStart, isActiveList.size()-1, activeRegionExtension) ); // close out the current active region
|
||||
|
||||
return returnList;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper routine to create an active region based on our current start and end offsets
|
||||
* @param isActive should the region be active?
|
||||
* @param curStart offset (0-based) from the start of this region
|
||||
* @param curEnd offset (0-based) from the start of this region
|
||||
* @param activeRegionExtension
|
||||
* @return a fully initialized ActiveRegion with the above properties
|
||||
*/
|
||||
private final ActiveRegion createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension) {
|
||||
final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd);
|
||||
return new ActiveRegion( loc, isActive, parser, activeRegionExtension );
|
||||
}
|
||||
}
|
||||
|
|
@ -4,7 +4,7 @@ import com.google.java.contract.Requires;
|
|||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager;
|
||||
import org.broadinstitute.sting.gatk.walkers.bqsr.EventType;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
|
@ -320,8 +320,8 @@ public class ClippingOp {
|
|||
byte[] newBaseDeletionQuals = new byte[newLength];
|
||||
System.arraycopy(read.getBaseInsertionQualities(), copyStart, newBaseInsertionQuals, 0, newLength);
|
||||
System.arraycopy(read.getBaseDeletionQualities(), copyStart, newBaseDeletionQuals, 0, newLength);
|
||||
hardClippedRead.setBaseQualities(newBaseInsertionQuals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION);
|
||||
hardClippedRead.setBaseQualities(newBaseDeletionQuals, RecalDataManager.BaseRecalibrationType.BASE_DELETION);
|
||||
hardClippedRead.setBaseQualities(newBaseInsertionQuals, EventType.BASE_INSERTION);
|
||||
hardClippedRead.setBaseQualities(newBaseDeletionQuals, EventType.BASE_DELETION);
|
||||
}
|
||||
|
||||
return hardClippedRead;
|
||||
|
|
|
|||
|
|
@ -231,15 +231,16 @@ public class ReadClipper {
|
|||
|
||||
|
||||
/**
|
||||
* Hard clips any contiguous tail (left, right or both) with base quality lower than lowQual.
|
||||
* Clips any contiguous tail (left, right or both) with base quality lower than lowQual using the desired algorithm.
|
||||
*
|
||||
* This function will look for low quality tails and hard clip them away. A low quality tail
|
||||
* ends when a base has base quality greater than lowQual.
|
||||
*
|
||||
* @param algorithm the algorithm to use (HardClip, SoftClip, Write N's,...)
|
||||
* @param lowQual every base quality lower than or equal to this in the tail of the read will be hard clipped
|
||||
* @return a new read without low quality tails
|
||||
*/
|
||||
private GATKSAMRecord hardClipLowQualEnds(byte lowQual) {
|
||||
private GATKSAMRecord clipLowQualEnds(ClippingRepresentation algorithm, byte lowQual) {
|
||||
if (read.isEmpty())
|
||||
return read;
|
||||
|
||||
|
|
@ -254,7 +255,6 @@ public class ReadClipper {
|
|||
// if the entire read should be clipped, then return an empty read.
|
||||
if (leftClipIndex > rightClipIndex)
|
||||
return GATKSAMRecord.emptyRead(read);
|
||||
// return (new GATKSAMRecord(read.getHeader()));
|
||||
|
||||
if (rightClipIndex < read.getReadLength() - 1) {
|
||||
this.addOp(new ClippingOp(rightClipIndex + 1, read.getReadLength() - 1));
|
||||
|
|
@ -262,11 +262,18 @@ public class ReadClipper {
|
|||
if (leftClipIndex > 0 ) {
|
||||
this.addOp(new ClippingOp(0, leftClipIndex - 1));
|
||||
}
|
||||
return this.clipRead(ClippingRepresentation.HARDCLIP_BASES);
|
||||
return this.clipRead(algorithm);
|
||||
}
|
||||
|
||||
private GATKSAMRecord hardClipLowQualEnds(byte lowQual) {
|
||||
return this.clipLowQualEnds(ClippingRepresentation.HARDCLIP_BASES, lowQual);
|
||||
}
|
||||
public static GATKSAMRecord hardClipLowQualEnds(GATKSAMRecord read, byte lowQual) {
|
||||
return (new ReadClipper(read)).hardClipLowQualEnds(lowQual);
|
||||
}
|
||||
public static GATKSAMRecord clipLowQualEnds(GATKSAMRecord read, byte lowQual, ClippingRepresentation algorithm) {
|
||||
return (new ReadClipper(read)).clipLowQualEnds(algorithm, lowQual);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -154,18 +154,24 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
|||
throw new UserException.MalformedVCFHeader("The FORMAT field was provided but there is no genotype/sample data");
|
||||
|
||||
} else {
|
||||
if ( str.startsWith("##INFO=") ) {
|
||||
VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version);
|
||||
if ( str.startsWith(VCFConstants.INFO_HEADER_START) ) {
|
||||
final VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version);
|
||||
metaData.add(info);
|
||||
infoFields.put(info.getName(), info.getType());
|
||||
} else if ( str.startsWith("##FILTER=") ) {
|
||||
VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9),version);
|
||||
infoFields.put(info.getID(), info.getType());
|
||||
} else if ( str.startsWith(VCFConstants.FILTER_HEADER_START) ) {
|
||||
final VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9), version);
|
||||
metaData.add(filter);
|
||||
filterFields.add(filter.getName());
|
||||
} else if ( str.startsWith("##FORMAT=") ) {
|
||||
VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9),version);
|
||||
filterFields.add(filter.getID());
|
||||
} else if ( str.startsWith(VCFConstants.FORMAT_HEADER_START) ) {
|
||||
final VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9), version);
|
||||
metaData.add(format);
|
||||
formatFields.put(format.getName(), format.getType());
|
||||
formatFields.put(format.getID(), format.getType());
|
||||
} else if ( str.startsWith(VCFConstants.CONTIG_HEADER_START) ) {
|
||||
final VCFSimpleHeaderLine contig = new VCFSimpleHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), null);
|
||||
metaData.add(contig);
|
||||
} else if ( str.startsWith(VCFConstants.ALT_HEADER_START) ) {
|
||||
final VCFSimpleHeaderLine alt = new VCFSimpleHeaderLine(str.substring(6), version, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description"));
|
||||
metaData.add(alt);
|
||||
} else {
|
||||
int equals = str.indexOf("=");
|
||||
if ( equals != -1 )
|
||||
|
|
|
|||
|
|
@ -1,28 +0,0 @@
|
|||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
/**
 * A class representing a key=value entry for ALT fields in the VCF header
 *
 * Both constructors only forward to VCFSimpleHeaderLine with the ALT header line type.
 *
 * @author ebanks
 */
|
||||
public class VCFAltHeaderLine extends VCFSimpleHeaderLine {
|
||||
|
||||
/**
 * create a VCF ALT header line
 *
 * @param name         the name for this header line
 * @param description  the description for this header line
 */
|
||||
public VCFAltHeaderLine(String name, String description) {
|
||||
super(name, description, SupportedHeaderLineType.ALT);
|
||||
}
|
||||
|
||||
/**
 * create a VCF ALT header line from its text
 *
 * @param line      the header line
 * @param version   the vcf header version
 */
|
||||
protected VCFAltHeaderLine(String line, VCFHeaderVersion version) {
|
||||
super(line, version, SupportedHeaderLineType.ALT);
|
||||
}
|
||||
}
|
||||
|
|
@ -34,7 +34,7 @@ import java.util.Map;
|
|||
/**
|
||||
* a base class for compound header lines, which include info lines and format lines (so far)
|
||||
*/
|
||||
public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine {
|
||||
public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine {
|
||||
public enum SupportedHeaderLineType {
|
||||
INFO(true), FORMAT(false);
|
||||
|
||||
|
|
@ -52,7 +52,7 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
|
|||
private VCFHeaderLineType type;
|
||||
|
||||
// access methods
|
||||
public String getName() { return name; }
|
||||
public String getID() { return name; }
|
||||
public String getDescription() { return description; }
|
||||
public VCFHeaderLineType getType() { return type; }
|
||||
public VCFHeaderLineCount getCountType() { return countType; }
|
||||
|
|
|
|||
|
|
@ -80,6 +80,13 @@ public final class VCFConstants {
|
|||
public static final String PHASED_SWITCH_PROB_v3 = "\\";
|
||||
public static final String PHASING_TOKENS = "/|\\";
|
||||
|
||||
// header lines
|
||||
public static final String FILTER_HEADER_START = "##FILTER";
|
||||
public static final String FORMAT_HEADER_START = "##FORMAT";
|
||||
public static final String INFO_HEADER_START = "##INFO";
|
||||
public static final String ALT_HEADER_START = "##ALT";
|
||||
public static final String CONTIG_HEADER_START = "##contig";
|
||||
|
||||
// old indel alleles
|
||||
public static final char DELETION_ALLELE_v3 = 'D';
|
||||
public static final char INSERTION_ALLELE_v3 = 'I';
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* @author ebanks
|
||||
* A class representing a key=value entry for FILTER fields in the VCF header
|
||||
|
|
@ -13,7 +15,7 @@ public class VCFFilterHeaderLine extends VCFSimpleHeaderLine {
|
|||
* @param description the description for this header line
|
||||
*/
|
||||
public VCFFilterHeaderLine(String name, String description) {
|
||||
super(name, description, SupportedHeaderLineType.FILTER);
|
||||
super("FILTER", name, description);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -23,6 +25,6 @@ public class VCFFilterHeaderLine extends VCFSimpleHeaderLine {
|
|||
* @param version the vcf header version
|
||||
*/
|
||||
protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) {
|
||||
super(line, version, SupportedHeaderLineType.FILTER);
|
||||
super(line, version, "FILTER", Arrays.asList("ID", "Description"));
|
||||
}
|
||||
}
|
||||
|
|
@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils.codecs.vcf;
|
|||
|
||||
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -126,11 +125,11 @@ public class VCFHeader {
|
|||
for ( VCFHeaderLine line : mMetaData ) {
|
||||
if ( line instanceof VCFInfoHeaderLine ) {
|
||||
VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line;
|
||||
mInfoMetaData.put(infoLine.getName(), infoLine);
|
||||
mInfoMetaData.put(infoLine.getID(), infoLine);
|
||||
}
|
||||
else if ( line instanceof VCFFormatHeaderLine ) {
|
||||
VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
|
||||
mFormatMetaData.put(formatLine.getName(), formatLine);
|
||||
mFormatMetaData.put(formatLine.getID(), formatLine);
|
||||
}
|
||||
else {
|
||||
mOtherMetaData.put(line.getKey(), line);
|
||||
|
|
|
|||
|
|
@ -73,10 +73,14 @@ class VCF4Parser implements VCFLineParser {
|
|||
|
||||
// validate the tags against the expected list
|
||||
index = 0;
|
||||
if (ret.size() > expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + ret.size() + " in string " + expectedTagOrder.size());
|
||||
for (String str : ret.keySet()) {
|
||||
if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine);
|
||||
index++;
|
||||
if ( expectedTagOrder != null ) {
|
||||
if ( ret.size() > expectedTagOrder.size() )
|
||||
throw new IllegalArgumentException("Unexpected tag count " + ret.size() + " in string " + expectedTagOrder.size());
|
||||
for ( String str : ret.keySet() ) {
|
||||
if ( !expectedTagOrder.get(index).equals(str) )
|
||||
throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine);
|
||||
index++;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
/** an interface for named header lines **/
|
||||
public interface VCFNamedHeaderLine {
|
||||
String getName();
|
||||
/** an interface for ID-based header lines **/
|
||||
public interface VCFIDHeaderLine {
|
||||
String getID();
|
||||
}
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
|
|
@ -9,34 +9,35 @@ import java.util.Map;
|
|||
* @author ebanks
|
||||
* A class representing a key=value entry for simple VCF header types
|
||||
*/
|
||||
public abstract class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine {
|
||||
|
||||
public enum SupportedHeaderLineType {
|
||||
FILTER, ALT;
|
||||
}
|
||||
public class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine {
|
||||
|
||||
private String name;
|
||||
private String description;
|
||||
|
||||
// our type of line, i.e. filter, alt, etc
|
||||
private final SupportedHeaderLineType lineType;
|
||||
|
||||
private Map<String, String> genericFields = new LinkedHashMap<String, String>();
|
||||
|
||||
/**
|
||||
* create a simple VCF header line
|
||||
*
|
||||
* @param name the name for this header line
|
||||
* @param description the description for this header line
|
||||
* @param lineType the header line type
|
||||
* @param key the key for this header line
|
||||
* @param name the name for this header line
|
||||
* @param genericFields other fields for this header line
|
||||
*/
|
||||
public VCFSimpleHeaderLine(String name, String description, SupportedHeaderLineType lineType) {
|
||||
super(lineType.toString(), "");
|
||||
this.lineType = lineType;
|
||||
this.name = name;
|
||||
this.description = description;
|
||||
public VCFSimpleHeaderLine(String key, String name, Map<String, String> genericFields) {
|
||||
super(key, "");
|
||||
initialize(name, genericFields);
|
||||
}
|
||||
|
||||
if ( name == null || description == null )
|
||||
throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s desc=%s", super.getKey(), name, description ));
|
||||
/**
|
||||
* create a simple VCF header line whose only extra field is a description
|
||||
*
|
||||
* @param key the key for this header line
|
||||
* @param name the name for this header line
|
||||
* @param description description for this header line
|
||||
*/
|
||||
public VCFSimpleHeaderLine(String key, String name, String description) {
|
||||
super(key, "");
|
||||
Map<String, String> map = new LinkedHashMap<String, String>(1);
|
||||
map.put("Description", description);
|
||||
initialize(name, map);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -44,38 +45,50 @@ public abstract class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFNa
|
|||
*
|
||||
* @param line the header line
|
||||
* @param version the vcf header version
|
||||
* @param lineType the header line type
|
||||
* @param key the key for this header line
|
||||
* @param expectedTagOrdering the tag ordering expected for this header line
|
||||
*/
|
||||
protected VCFSimpleHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) {
|
||||
super(lineType.toString(), "");
|
||||
this.lineType = lineType;
|
||||
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description"));
|
||||
protected VCFSimpleHeaderLine(String line, VCFHeaderVersion version, String key, List<String> expectedTagOrdering) {
|
||||
super(key, "");
|
||||
Map<String, String> mapping = VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrdering);
|
||||
name = mapping.get("ID");
|
||||
description = mapping.get("Description");
|
||||
if ( description == null && ALLOW_UNBOUND_DESCRIPTIONS ) // handle the case where there's no description provided
|
||||
description = UNBOUND_DESCRIPTION;
|
||||
initialize(name, mapping);
|
||||
}
|
||||
|
||||
/**
* Shared initialization used by the constructors: validates the arguments and stores them on this line.
*
* @param name          the ID of this header line; must not be null
* @param genericFields the remaining key/value fields of this header line; must not be null or empty.
*                      The entries are copied (putAll) into this line's own map rather than the map being kept by reference.
* @throws IllegalArgumentException if name is null, or genericFields is null or empty
*/
protected void initialize(String name, Map<String, String> genericFields) {
|
||||
if ( name == null || genericFields == null || genericFields.isEmpty() )
|
||||
throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s", super.getKey(), name));
|
||||
|
||||
this.name = name;
|
||||
this.genericFields.putAll(genericFields);
|
||||
}
|
||||
|
||||
protected String toStringEncoding() {
|
||||
Map<String,Object> map = new LinkedHashMap<String,Object>();
|
||||
Map<String, Object> map = new LinkedHashMap<String, Object>();
|
||||
map.put("ID", name);
|
||||
map.put("Description", description);
|
||||
return lineType.toString() + "=" + VCFHeaderLine.toStringEncoding(map);
|
||||
map.putAll(genericFields);
|
||||
return getKey() + "=" + VCFHeaderLine.toStringEncoding(map);
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if ( !(o instanceof VCFSimpleHeaderLine) )
|
||||
return false;
|
||||
VCFSimpleHeaderLine other = (VCFSimpleHeaderLine)o;
|
||||
return name.equals(other.name) &&
|
||||
description.equals(other.description);
|
||||
if ( !name.equals(other.name) || genericFields.size() != other.genericFields.size() )
|
||||
return false;
|
||||
for ( Map.Entry<String, String> entry : genericFields.entrySet() ) {
|
||||
if ( !entry.getValue().equals(other.genericFields.get(entry.getKey())) )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
public String getID() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return description;
|
||||
public Map<String, String> getGenericFields() {
|
||||
return genericFields;
|
||||
}
|
||||
}
|
||||
|
|
@ -155,10 +155,10 @@ public class VCFUtils {
|
|||
for ( VCFHeader source : headers ) {
|
||||
//System.out.printf("Merging in header %s%n", source);
|
||||
for ( VCFHeaderLine line : source.getMetaData()) {
|
||||
String key = line.getKey();
|
||||
|
||||
if ( line instanceof VCFNamedHeaderLine)
|
||||
key = key + "" + ((VCFNamedHeaderLine) line).getName();
|
||||
String key = line.getKey();
|
||||
if ( line instanceof VCFIDHeaderLine )
|
||||
key = key + "-" + ((VCFIDHeaderLine)line).getID();
|
||||
|
||||
if ( map.containsKey(key) ) {
|
||||
VCFHeaderLine other = map.get(key);
|
||||
|
|
@ -166,8 +166,8 @@ public class VCFUtils {
|
|||
continue;
|
||||
else if ( ! line.getClass().equals(other.getClass()) )
|
||||
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||
else if ( line instanceof VCFFilterHeaderLine) {
|
||||
String lineName = ((VCFFilterHeaderLine) line).getName(); String otherName = ((VCFFilterHeaderLine) other).getName();
|
||||
else if ( line instanceof VCFFilterHeaderLine ) {
|
||||
String lineName = ((VCFFilterHeaderLine) line).getID(); String otherName = ((VCFFilterHeaderLine) other).getID();
|
||||
if ( ! lineName.equals(otherName) )
|
||||
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||
} else if ( line instanceof VCFCompoundHeaderLine ) {
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import net.sf.samtools.Cigar;
|
|||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager;
|
||||
import org.broadinstitute.sting.gatk.walkers.bqsr.EventType;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
|
|
@ -203,8 +203,8 @@ public class FragmentUtils {
|
|||
insertionQuals[iii] = secondReadInsertionQuals[iii-firstReadStop];
|
||||
deletionQuals[iii] = secondReadDeletionQuals[iii-firstReadStop];
|
||||
}
|
||||
returnRead.setBaseQualities( insertionQuals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION );
|
||||
returnRead.setBaseQualities( deletionQuals, RecalDataManager.BaseRecalibrationType.BASE_DELETION );
|
||||
returnRead.setBaseQualities( insertionQuals, EventType.BASE_INSERTION );
|
||||
returnRead.setBaseQualities( deletionQuals, EventType.BASE_DELETION );
|
||||
}
|
||||
|
||||
final ArrayList<GATKSAMRecord> returnList = new ArrayList<GATKSAMRecord>();
|
||||
|
|
|
|||
|
|
@ -177,7 +177,7 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
|
|||
for (int i = 0; i < reads.size(); i++) {
|
||||
GATKSAMRecord read = reads.get(i);
|
||||
int offset = offsets.get(i);
|
||||
pileup.add(createNewPileupElement(read, offset, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important
|
||||
pileup.add(createNewPileupElement(read, offset, false, false, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important
|
||||
}
|
||||
|
||||
return pileup;
|
||||
|
|
@ -196,7 +196,7 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
|
|||
|
||||
UnifiedPileupElementTracker<PE> pileup = new UnifiedPileupElementTracker<PE>();
|
||||
for (GATKSAMRecord read : reads) {
|
||||
pileup.add(createNewPileupElement(read, offset, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important
|
||||
pileup.add(createNewPileupElement(read, offset, false, false, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important
|
||||
}
|
||||
|
||||
return pileup;
|
||||
|
|
@ -204,8 +204,8 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
|
|||
|
||||
protected abstract AbstractReadBackedPileup<RBP, PE> createNewPileup(GenomeLoc loc, PileupElementTracker<PE> pileupElementTracker);
|
||||
|
||||
protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip);
|
||||
protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip, String nextEventBases, int nextEventLength );
|
||||
protected abstract PE createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip);
|
||||
protected abstract PE createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip, final String nextEventBases, final int nextEventLength );
|
||||
|
||||
// --------------------------------------------------------
|
||||
//
|
||||
|
|
|
|||
|
|
@ -48,7 +48,7 @@ public class ExtendedEventPileupElement extends PileupElement {
|
|||
|
||||
|
||||
public ExtendedEventPileupElement(GATKSAMRecord read, int offset, int eventLength, String eventBases, Type type) {
|
||||
super(read, offset, type == Type.DELETION, false, false, false,null,-1); // extended events are slated for removal
|
||||
super(read, offset, type == Type.DELETION, false, false, false, false, false, null, -1); // extended events are slated for removal
|
||||
this.read = read;
|
||||
this.offset = offset;
|
||||
this.eventLength = eventLength;
|
||||
|
|
|
|||
|
|
@ -21,15 +21,17 @@ public class PileupElement implements Comparable<PileupElement> {
|
|||
public static final byte T_FOLLOWED_BY_INSERTION_BASE = (byte) 89;
|
||||
public static final byte G_FOLLOWED_BY_INSERTION_BASE = (byte) 90;
|
||||
|
||||
protected final GATKSAMRecord read;
|
||||
protected final int offset;
|
||||
protected final boolean isDeletion;
|
||||
protected final boolean isBeforeDeletion;
|
||||
protected final boolean isBeforeInsertion;
|
||||
protected final boolean isNextToSoftClip;
|
||||
protected final int eventLength;
|
||||
protected final String eventBases; // if it is a deletion, we do not have information about the actual deleted bases
|
||||
// in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases
|
||||
protected final GATKSAMRecord read; // the read this base belongs to
|
||||
protected final int offset; // the offset in the bases array for this base
|
||||
protected final boolean isDeletion; // is this base a deletion
|
||||
protected final boolean isBeforeDeletion; // is the base to the right of this base a deletion
|
||||
protected final boolean isAfterDeletion; // is the base to the left of this base a deletion
|
||||
protected final boolean isBeforeInsertion; // is the base to the right of this base an insertion
|
||||
protected final boolean isAfterInsertion; // is the base to the left of this base an insertion
|
||||
protected final boolean isNextToSoftClip; // is this base either before or after a soft clipped base
|
||||
protected final int eventLength; // what is the length of the event (insertion or deletion) *after* this base
|
||||
protected final String eventBases; // if it is a deletion, we do not have information about the actual deleted bases in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases
|
||||
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -39,7 +41,9 @@ public class PileupElement implements Comparable<PileupElement> {
|
|||
* @param offset the position in the read for this base. All deletions must be left aligned! (-1 is only allowed for reads starting with insertions)
|
||||
* @param isDeletion whether or not this base is a deletion
|
||||
* @param isBeforeDeletion whether or not this base is before a deletion
|
||||
* @param isAfterDeletion whether or not this base is after a deletion
|
||||
* @param isBeforeInsertion whether or not this base is before an insertion
|
||||
* @param isAfterInsertion whether or not this base is after an insertion
|
||||
* @param isNextToSoftClip whether or not this base is next to a soft clipped base
|
||||
* @param nextEventBases bases in event in case element comes before insertion or deletion
|
||||
* @param nextEventLength length of next event in case it's insertion or deletion
|
||||
|
|
@ -48,8 +52,7 @@ public class PileupElement implements Comparable<PileupElement> {
|
|||
"read != null",
|
||||
"offset >= -1",
|
||||
"offset <= read.getReadLength()"})
|
||||
public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isBeforeInsertion, final boolean isNextToSoftClip,
|
||||
final String nextEventBases, final int nextEventLength) {
|
||||
public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip, final String nextEventBases, final int nextEventLength) {
|
||||
if (offset < 0 && isDeletion)
|
||||
throw new ReviewedStingException("Pileup Element cannot create a deletion with a negative offset");
|
||||
|
||||
|
|
@ -57,20 +60,22 @@ public class PileupElement implements Comparable<PileupElement> {
|
|||
this.offset = offset;
|
||||
this.isDeletion = isDeletion;
|
||||
this.isBeforeDeletion = isBeforeDeletion;
|
||||
this.isAfterDeletion = isAfterDeletion;
|
||||
this.isBeforeInsertion = isBeforeInsertion;
|
||||
this.isAfterInsertion = isAfterInsertion;
|
||||
this.isNextToSoftClip = isNextToSoftClip;
|
||||
if (isBeforeInsertion)
|
||||
eventBases = nextEventBases;
|
||||
else
|
||||
eventBases = null; // ignore argument in any other case
|
||||
eventBases = null; // ignore argument in any other case
|
||||
if (isBeforeDeletion || isBeforeInsertion)
|
||||
eventLength = nextEventLength;
|
||||
else
|
||||
eventLength = -1;
|
||||
}
|
||||
|
||||
public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isBeforeInsertion, final boolean isNextToSoftClip) {
|
||||
this(read,offset, isDeletion, isBeforeDeletion, isBeforeInsertion, isNextToSoftClip, null, -1);
|
||||
public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip) {
|
||||
this(read,offset, isDeletion, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, -1);
|
||||
}
|
||||
public boolean isDeletion() {
|
||||
return isDeletion;
|
||||
|
|
@ -80,10 +85,18 @@ public class PileupElement implements Comparable<PileupElement> {
|
|||
return isBeforeDeletion;
|
||||
}
|
||||
|
||||
/**
* @return the isAfterDeletion flag given at construction, i.e. whether the base to the left of this base is a deletion
*/
public boolean isAfterDeletion() {
|
||||
return isAfterDeletion;
|
||||
}
|
||||
|
||||
/**
* @return the isBeforeInsertion flag given at construction, i.e. whether the base to the right of this base is an insertion
*/
public boolean isBeforeInsertion() {
|
||||
return isBeforeInsertion;
|
||||
}
|
||||
|
||||
/**
* @return the isAfterInsertion flag given at construction, i.e. whether the base to the left of this base is an insertion
*/
public boolean isAfterInsertion() {
|
||||
return isAfterInsertion;
|
||||
}
|
||||
|
||||
/**
* @return the isNextToSoftClip flag given at construction, i.e. whether this base is either before or after a soft clipped base
*/
public boolean isNextToSoftClip() {
|
||||
return isNextToSoftClip;
|
||||
}
|
||||
|
|
@ -123,14 +136,14 @@ public class PileupElement implements Comparable<PileupElement> {
|
|||
}
|
||||
|
||||
/**
|
||||
* Returns length of the event (number of inserted or deleted bases
|
||||
* @return length of the event (number of inserted or deleted bases)
|
||||
*/
|
||||
public int getEventLength() {
|
||||
return eventLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read.
|
||||
* @return actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read.
|
||||
*/
|
||||
public String getEventBases() {
|
||||
return eventBases;
|
||||
|
|
@ -185,13 +198,9 @@ public class PileupElement implements Comparable<PileupElement> {
|
|||
//
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
// public boolean isReducedRead() {
|
||||
// return read.isReducedRead();
|
||||
// }
|
||||
|
||||
/**
|
||||
* Returns the number of elements in the pileup element.
|
||||
* <p/>
|
||||
*
|
||||
* Unless this is a reduced read, the number of elements in a pileup element is one. In the event of
|
||||
* this being a reduced read and a deletion, we return the average number of elements between the left
|
||||
* and right elements to the deletion. We assume the deletion to be left aligned.
|
||||
|
|
|
|||
|
|
@ -96,12 +96,11 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup<
|
|||
}
|
||||
|
||||
@Override
|
||||
protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip) {
|
||||
protected ExtendedEventPileupElement createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip) {
|
||||
throw new UnsupportedOperationException("Not enough information provided to create a new pileup element");
|
||||
}
|
||||
@Override
|
||||
protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion,
|
||||
boolean isNextToSoftClip,String nextEventBases, int nextEventLength) {
|
||||
protected ExtendedEventPileupElement createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip, final String nextEventBases, final int nextEventLength ) {
|
||||
throw new UnsupportedOperationException("Not enough information provided to create a new pileup element");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -56,6 +56,9 @@ public class ReadBackedPileupImpl extends AbstractReadBackedPileup<ReadBackedPil
|
|||
*
|
||||
* @param loc
|
||||
* @param pileup
|
||||
* @param size
|
||||
* @param nDeletions
|
||||
* @param nMQ0Reads
|
||||
*/
|
||||
public ReadBackedPileupImpl(GenomeLoc loc, List<PileupElement> pileup, int size, int nDeletions, int nMQ0Reads) {
|
||||
super(loc, pileup, size, nDeletions, nMQ0Reads);
|
||||
|
|
@ -71,13 +74,14 @@ public class ReadBackedPileupImpl extends AbstractReadBackedPileup<ReadBackedPil
|
|||
}
|
||||
|
||||
@Override
|
||||
protected PileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion,
|
||||
boolean isNextToSoftClip) {
|
||||
return new PileupElement(read, offset, isDeletion, isBeforeDeletion, isBeforeInsertion, isNextToSoftClip, null,0);
|
||||
protected PileupElement createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip) {
|
||||
return new PileupElement(read, offset, isDeletion, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, 0);
|
||||
}
|
||||
|
||||
protected PileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion,
|
||||
boolean isNextToSoftClip,String nextEventBases, final int nextEventLength) {
|
||||
return new PileupElement(read, offset, isDeletion, isBeforeDeletion, isBeforeInsertion, isNextToSoftClip, nextEventBases,nextEventLength);
|
||||
@Override
|
||||
protected PileupElement createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip, final String nextEventBases, final int nextEventLength ) {
|
||||
return new PileupElement(read, offset, isDeletion, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, nextEventBases, nextEventLength);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,10 +26,11 @@
|
|||
package org.broadinstitute.sting.utils.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.bqsr.*;
|
||||
import org.broadinstitute.sting.utils.BitSetUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
|
@ -37,244 +38,334 @@ import org.broadinstitute.sting.utils.text.XReadLines;
|
|||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.BitSet;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Utility methods to facilitate on-the-fly base quality score recalibration.
|
||||
*
|
||||
*
|
||||
* User: rpoplin
|
||||
* Date: 2/4/12
|
||||
*/
|
||||
|
||||
public class BaseRecalibration {
|
||||
|
||||
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // List of covariates to be used in this calculation
|
||||
public static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
public static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
public static final String EOF_MARKER = "EOF";
|
||||
private static final int MAX_QUALITY_SCORE = 65; //BUGBUG: what value to use here?
|
||||
private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(...) for all sets of covariate values.
|
||||
private ArrayList<HashMap<BitSet, RecalDatum>> collapsedHashes = new ArrayList<HashMap<BitSet, RecalDatum>> (); // All the collapsed data tables
|
||||
|
||||
public BaseRecalibration( final File RECAL_FILE ) {
|
||||
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // List of all covariates to be used in this calculation
|
||||
private final ArrayList<Covariate> requiredCovariates = new ArrayList<Covariate>(); // List of required covariates to be used in this calculation
|
||||
private final ArrayList<Covariate> optionalCovariates = new ArrayList<Covariate>(); // List of optional covariates to be used in this calculation
|
||||
|
||||
public static final Pattern REQUIRED_COVARIATE_PATTERN = Pattern.compile("^# Required Covariates.*");
|
||||
public static final Pattern OPTIONAL_COVARIATE_PATTERN = Pattern.compile("^# Optional Covariates.*");
|
||||
public static final String EOF_MARKER = "EOF";
|
||||
|
||||
private static final byte SMOOTHING_CONSTANT = 1;
|
||||
|
||||
ArrayList<BQSRKeyManager> keyManagers = new ArrayList<BQSRKeyManager>();
|
||||
|
||||
public BaseRecalibration(final File RECAL_FILE) {
|
||||
// Get a list of all available covariates
|
||||
final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
||||
RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // todo -- initialize with the parameters from the csv file!
|
||||
|
||||
int lineNumber = 0;
|
||||
boolean foundAllCovariates = false;
|
||||
|
||||
boolean foundRequiredCovariates = false;
|
||||
boolean foundOptionalCovariates = false;
|
||||
boolean initializedKeyManagers = false;
|
||||
|
||||
// Read in the data from the csv file and populate the data map and covariates list
|
||||
boolean sawEOF = false;
|
||||
try {
|
||||
for ( String line : new XReadLines(RECAL_FILE) ) {
|
||||
for (String line : new XReadLines(RECAL_FILE)) {
|
||||
lineNumber++;
|
||||
if ( EOF_MARKER.equals(line) ) {
|
||||
sawEOF = true;
|
||||
} else if( COMMENT_PATTERN.matcher(line).matches() ) {
|
||||
; // Skip over the comment lines, (which start with '#')
|
||||
}
|
||||
// Read in the covariates that were used from the input file
|
||||
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
|
||||
if( foundAllCovariates ) {
|
||||
throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
|
||||
} else { // Found the covariate list in input file, loop through all of them and instantiate them
|
||||
String[] vals = line.split(",");
|
||||
for( int iii = 0; iii < vals.length - 4; iii++ ) { // There are n-4 covariates. The last four items are ErrorModel, nObservations, nMismatch, and Qempirical
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : classes ) {
|
||||
if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
|
||||
foundClass = true;
|
||||
try {
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
|
||||
sawEOF = EOF_MARKER.equals(line);
|
||||
if (sawEOF)
|
||||
break;
|
||||
|
||||
boolean requiredCovariatesLine = REQUIRED_COVARIATE_PATTERN.matcher(line).matches();
|
||||
boolean optionalCovariatesLine = OPTIONAL_COVARIATE_PATTERN.matcher(line).matches();
|
||||
|
||||
if (requiredCovariatesLine && foundRequiredCovariates)
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Duplicate required covariates line");
|
||||
|
||||
if (optionalCovariatesLine && foundOptionalCovariates)
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Duplicate optional covariates line");
|
||||
|
||||
if (optionalCovariatesLine && !foundRequiredCovariates)
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Optional covariates reported before Required covariates");
|
||||
|
||||
if (requiredCovariatesLine || optionalCovariatesLine) {
|
||||
String [] covariateNames = line.split(": ")[1].split(","); // take the second half of the string (past the ":") and split it by "," to get the list of required covariates
|
||||
|
||||
List<Covariate> covariateList = requiredCovariatesLine ? requiredCovariates : optionalCovariates; // set the appropriate covariate list to update
|
||||
|
||||
for (String covariateName : covariateNames) {
|
||||
boolean foundClass = false;
|
||||
for (Class<?> covClass : classes) {
|
||||
if ((covariateName + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) {
|
||||
foundClass = true;
|
||||
try {
|
||||
Covariate covariate = (Covariate) covClass.newInstance();
|
||||
covariate.initialize(RAC);
|
||||
requestedCovariates.add(covariate);
|
||||
covariateList.add(covariate);
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
}
|
||||
|
||||
if( !foundClass ) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
|
||||
}
|
||||
}
|
||||
if (!foundClass)
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (covariateName + "Covariate") + ") isn't a valid covariate option.");
|
||||
}
|
||||
foundRequiredCovariates = foundRequiredCovariates || requiredCovariatesLine;
|
||||
foundOptionalCovariates = foundOptionalCovariates || optionalCovariatesLine;
|
||||
}
|
||||
|
||||
} else { // Found a line of data
|
||||
if( !foundAllCovariates ) {
|
||||
foundAllCovariates = true;
|
||||
else if (!line.startsWith("#")) { // if this is not a comment line that we don't care about, it is DATA!
|
||||
if (!foundRequiredCovariates || !foundOptionalCovariates) // At this point all the covariates should have been found and initialized
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE);
|
||||
|
||||
// At this point all the covariates should have been found and initialized
|
||||
if( requestedCovariates.size() < 2 ) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE );
|
||||
if (!initializedKeyManagers) {
|
||||
ArrayList<Covariate> emptyList = new ArrayList<Covariate>(0);
|
||||
ArrayList<Covariate> requiredCovariatesUpToThis = new ArrayList<Covariate>(); // Initialize one key manager for each table of required covariate
|
||||
for (Covariate covariate : requiredCovariates) { // Every required covariate table includes all preceding required covariates (e.g. RG ; RG,Q )
|
||||
requiredCovariatesUpToThis.add(covariate);
|
||||
keyManagers.add(new BQSRKeyManager(requiredCovariatesUpToThis, emptyList));
|
||||
}
|
||||
|
||||
final boolean createCollapsedTables = true;
|
||||
|
||||
// Initialize any covariate member variables using the shared argument collection
|
||||
RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
cov.initialize( RAC );
|
||||
}
|
||||
// Initialize the data hashMaps
|
||||
dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() );
|
||||
|
||||
keyManagers.add(new BQSRKeyManager(requiredCovariates, optionalCovariates)); // One master key manager for the collapsed tables
|
||||
|
||||
initializedKeyManagers = true;
|
||||
}
|
||||
addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap
|
||||
addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap
|
||||
}
|
||||
}
|
||||
|
||||
} catch ( FileNotFoundException e ) {
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
|
||||
} catch ( NumberFormatException e ) {
|
||||
} catch (NumberFormatException e) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
|
||||
}
|
||||
|
||||
if ( !sawEOF ) {
|
||||
if (!sawEOF) {
|
||||
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool.";
|
||||
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
|
||||
}
|
||||
|
||||
if( dataManager == null ) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?");
|
||||
}
|
||||
|
||||
dataManager.generateEmpiricalQualities( 1, MAX_QUALITY_SCORE );
|
||||
generateEmpiricalQualities(SMOOTHING_CONSTANT);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
|
||||
*
|
||||
* @param file The CSV file we read the line from (for exception throwing purposes)
|
||||
* @param line A line of CSV data read from the recalibration table data file
|
||||
*/
|
||||
private void addCSVData(final File file, final String line) {
|
||||
final String[] vals = line.split(",");
|
||||
boolean hasOptionalCovariates = optionalCovariates.size() > 0; // Do we have optional covariates in this key?
|
||||
int addOptionalCovariates = hasOptionalCovariates ? 2 : 0; // If we have optional covariates at all, add two to the size of the array (to acommodate the covariate and the id)
|
||||
final Object[] key = new Object[requiredCovariates.size() + addOptionalCovariates + 1]; // Reserve enough space for the required covariates, optional covariate, id and eventType
|
||||
|
||||
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
|
||||
if( vals.length != requestedCovariates.size() + 4 ) { // +4 because of ErrorModel, nObservations, nMismatch, and Qempirical
|
||||
throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
|
||||
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
|
||||
int indexCovariateValue = key.length - 3; // In the order of keys, the optional covariate comes right after the required covariates
|
||||
int indexCovariateID = key.length - 2; // followed by the covariate ID
|
||||
int indexEventType = key.length - 1; // and the event type
|
||||
|
||||
addKeysToArray(key, vals, requiredCovariates, 0); // Add the required covariates keys
|
||||
|
||||
if (hasOptionalCovariates) {
|
||||
key[indexCovariateID] = Short.parseShort(vals[indexCovariateID]); // Add the optional covariate ID
|
||||
Covariate covariate = optionalCovariates.get((Short) key[indexCovariateID]); // Get the covariate object for this ID
|
||||
key[indexCovariateValue] = covariate.getValue(vals[indexCovariateValue]); // Add the optional covariate value, given the ID
|
||||
}
|
||||
key[indexEventType] = EventType.eventFrom(vals[indexEventType]); // Add the event type
|
||||
|
||||
final Object[] key = new Object[requestedCovariates.size()];
|
||||
Covariate cov;
|
||||
int iii;
|
||||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
||||
cov = requestedCovariates.get( iii );
|
||||
key[iii] = cov.getValue( vals[iii] );
|
||||
}
|
||||
final String modelString = vals[iii++];
|
||||
final RecalDataManager.BaseRecalibrationType errorModel = CovariateKeySet.getErrorModelFromString(modelString);
|
||||
int datumIndex = key.length; // The recal datum starts at the end of the key (after the event type)
|
||||
long count = Long.parseLong(vals[datumIndex]); // Number of observations
|
||||
long errors = Long.parseLong(vals[datumIndex + 1]); // Number of errors observed
|
||||
double reportedQual = Double.parseDouble(vals[1]); // The reported Q score --> todo -- I don't like having the Q score hard coded in vals[1]. Generalize it!
|
||||
final RecalDatum datum = new RecalDatum(count, errors, reportedQual, 0.0); // Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||
|
||||
// Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||
final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
|
||||
dataManager.addToAllTables( key, datum, QualityUtils.MIN_USABLE_Q_SCORE, errorModel ); //BUGBUG: used to be Q5 now is Q6, probably doesn't matter
|
||||
addToAllTables(key, datum); // Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
}
|
||||
|
||||
public void recalibrateRead( final GATKSAMRecord read ) {
|
||||
/**
|
||||
* Add the given mapping to all of the collapsed hash tables
|
||||
*
|
||||
* @param key The list of comparables that is the key for this mapping
|
||||
* @param fullDatum The RecalDatum which is the data for this mapping
|
||||
*/
|
||||
private void addToAllTables(final Object[] key, final RecalDatum fullDatum) {
|
||||
int nHashes = requiredCovariates.size(); // We will always need one hash per required covariate
|
||||
if (optionalCovariates.size() > 0) // If we do have optional covariates
|
||||
nHashes += 1; // we will need one extra hash table with the optional covariate encoded in the key set on top of the required covariates
|
||||
|
||||
|
||||
for (int hashIndex = 0; hashIndex < nHashes; hashIndex++) {
|
||||
HashMap<BitSet, RecalDatum> table; // object to hold the hash table we are going to manipulate
|
||||
if (hashIndex >= collapsedHashes.size()) { // if we haven't yet created the collapsed hash table for this index, create it now!
|
||||
table = new HashMap<BitSet, RecalDatum>();
|
||||
collapsedHashes.add(table); // Because this is the only place where we add tables to the ArrayList, they will always be in the order we want.
|
||||
}
|
||||
else
|
||||
table = collapsedHashes.get(hashIndex); // if the table has been previously created, just assign it to the "table" object for manipulation
|
||||
|
||||
int copyTo = hashIndex + 1; // this will copy the covariates up to the index of the one we are including now (1 for RG, 2 for QS,...)
|
||||
if (copyTo > requiredCovariates.size()) // only in the case where we have optional covariates we need to increase the size of the array
|
||||
copyTo = requiredCovariates.size() + 2; // if we have optional covarites, add the optional covariate and it's id to the size of the key
|
||||
Object[] tableKey = new Object[copyTo + 1]; // create a new array that will hold as many keys as hashIndex (1 for RG hash, 2 for QualityScore hash, 3 for covariate hash plus the event type
|
||||
System.arraycopy(key, 0, tableKey, 0, copyTo); // copy the keys for the corresponding covariates into the tableKey.
|
||||
tableKey[tableKey.length-1] = key[key.length - 1]; // add the event type. The event type is always the last key, on both key sets.
|
||||
|
||||
BitSet hashKey = keyManagers.get(hashIndex).bitSetFromKey(tableKey); // Add bitset key with fullDatum to the appropriate hash
|
||||
RecalDatum datum = table.get(hashKey);
|
||||
if (datum == null)
|
||||
datum = fullDatum;
|
||||
else if (hashIndex == 0) // Special case for the ReadGroup covariate
|
||||
datum.combine(fullDatum);
|
||||
else
|
||||
datum.increment(fullDatum);
|
||||
table.put(hashKey, datum);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score
|
||||
* that will be used in the sequential calculation in TableRecalibrationWalker
|
||||
*
|
||||
* @param smoothing The smoothing parameter that goes into empirical quality score calculation
|
||||
*/
|
||||
private void generateEmpiricalQualities(final int smoothing) {
|
||||
for (final HashMap<BitSet, RecalDatum> table : collapsedHashes)
|
||||
for (final RecalDatum datum : table.values())
|
||||
datum.calcCombinedEmpiricalQuality(smoothing, QualityUtils.MAX_QUAL_SCORE);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
public void recalibrateRead(final GATKSAMRecord read) {
|
||||
//compute all covariate values for this read
|
||||
RecalDataManager.computeCovariates(read, requestedCovariates);
|
||||
final CovariateKeySet covariateKeySet = RecalDataManager.getAllCovariateValuesFor( read );
|
||||
final ReadCovariates readCovariates = RecalDataManager.covariateKeySetFrom(read);
|
||||
|
||||
for( final RecalDataManager.BaseRecalibrationType errorModel : RecalDataManager.BaseRecalibrationType.values() ) {
|
||||
final byte[] originalQuals = read.getBaseQualities( errorModel );
|
||||
for (final EventType errorModel : EventType.values()) {
|
||||
final byte[] originalQuals = read.getBaseQualities(errorModel);
|
||||
final byte[] recalQuals = originalQuals.clone();
|
||||
|
||||
// For each base in the read
|
||||
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
|
||||
|
||||
final Object[] fullCovariateKeyWithErrorMode = covariateKeySet.getKeySet(offset, errorModel);
|
||||
final Object[] fullCovariateKey = Arrays.copyOfRange(fullCovariateKeyWithErrorMode, 0, fullCovariateKeyWithErrorMode.length-1); // need to strip off the error mode which was appended to the list of covariates
|
||||
|
||||
// BUGBUG: This caching seems to put the entire key set into memory which negates the benefits of storing the delta delta tables?
|
||||
//Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKeyWithErrorMode);
|
||||
//if( qualityScore == null ) {
|
||||
final byte qualityScore = performSequentialQualityCalculation( errorModel, fullCovariateKey );
|
||||
// qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKeyWithErrorMode);
|
||||
//}
|
||||
|
||||
for (int offset = 0; offset < read.getReadLength(); offset++) {
|
||||
final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel);
|
||||
final byte qualityScore = performSequentialQualityCalculation(keySet, errorModel);
|
||||
recalQuals[offset] = qualityScore;
|
||||
}
|
||||
|
||||
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
|
||||
read.setBaseQualities( recalQuals, errorModel );
|
||||
|
||||
preserveQScores(originalQuals, recalQuals); // Overwrite the work done if original quality score is too low
|
||||
read.setBaseQualities(recalQuals, errorModel);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Implements a serial recalibration of the reads using the combinational table.
|
||||
* First, we perform a positional recalibration, and then a subsequent dinuc correction.
|
||||
*
|
||||
* Given the full recalibration table, we perform the following preprocessing steps:
|
||||
*
|
||||
* - calculate the global quality score shift across all data [DeltaQ]
|
||||
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
|
||||
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
|
||||
* - The final shift equation is:
|
||||
* - calculate the global quality score shift across all data [DeltaQ]
|
||||
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
|
||||
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
|
||||
* - The final shift equation is:
|
||||
*
|
||||
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
|
||||
*
|
||||
* todo -- I extremely dislike the way all this math is hardcoded... should rethink the data structures for this method in particular.
|
||||
*
|
||||
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
|
||||
* @param key The list of Comparables that were calculated from the covariates
|
||||
* @param errorModel the event type
|
||||
* @return A recalibrated quality score as a byte
|
||||
*/
|
||||
private byte performSequentialQualityCalculation( final RecalDataManager.BaseRecalibrationType errorModel, final Object... key ) {
|
||||
|
||||
final byte qualFromRead = (byte)Integer.parseInt(key[1].toString());
|
||||
final Object[] readGroupCollapsedKey = new Object[1];
|
||||
final Object[] qualityScoreCollapsedKey = new Object[2];
|
||||
final Object[] covariateCollapsedKey = new Object[3];
|
||||
|
||||
private byte performSequentialQualityCalculation(BitSet[] key, EventType errorModel) {
|
||||
final byte qualFromRead = (byte) BitSetUtils.shortFrom(key[1]);
|
||||
|
||||
final int readGroupKeyIndex = 0;
|
||||
final int qualKeyIndex = 1;
|
||||
final int covariatesKeyIndex = 2;
|
||||
|
||||
// The global quality shift (over the read group only)
|
||||
readGroupCollapsedKey[0] = key[0];
|
||||
final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0, errorModel).get( readGroupCollapsedKey ));
|
||||
List<BitSet> bitKeys = keyManagers.get(readGroupKeyIndex).bitSetsFromAllKeys(key, errorModel);
|
||||
if (bitKeys.size() > 1)
|
||||
throw new ReviewedStingException("There should only be one key for the RG collapsed table, something went wrong here");
|
||||
|
||||
final RecalDatum globalRecalDatum = collapsedHashes.get(readGroupKeyIndex).get(bitKeys.get(0));
|
||||
double globalDeltaQ = 0.0;
|
||||
if( globalRecalDatum != null ) {
|
||||
if (globalRecalDatum != null) {
|
||||
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
|
||||
final double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
|
||||
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
|
||||
}
|
||||
|
||||
// The shift in quality between reported and empirical
|
||||
qualityScoreCollapsedKey[0] = key[0];
|
||||
qualityScoreCollapsedKey[1] = key[1];
|
||||
final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1, errorModel).get( qualityScoreCollapsedKey ));
|
||||
bitKeys = keyManagers.get(qualKeyIndex).bitSetsFromAllKeys(key, errorModel);
|
||||
if (bitKeys.size() > 1)
|
||||
throw new ReviewedStingException("There should only be one key for the Qual collapsed table, something went wrong here");
|
||||
|
||||
final RecalDatum qReportedRecalDatum = collapsedHashes.get(qualKeyIndex).get(bitKeys.get(0));
|
||||
double deltaQReported = 0.0;
|
||||
if( qReportedRecalDatum != null ) {
|
||||
if (qReportedRecalDatum != null) {
|
||||
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
|
||||
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
||||
}
|
||||
|
||||
// The shift in quality due to each covariate by itself in turn
|
||||
bitKeys = keyManagers.get(covariatesKeyIndex).bitSetsFromAllKeys(key, errorModel);
|
||||
double deltaQCovariates = 0.0;
|
||||
double deltaQCovariateEmpirical;
|
||||
covariateCollapsedKey[0] = key[0];
|
||||
covariateCollapsedKey[1] = key[1];
|
||||
for( int iii = 2; iii < key.length; iii++ ) {
|
||||
covariateCollapsedKey[2] = key[iii]; // The given covariate
|
||||
final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii, errorModel).get( covariateCollapsedKey ));
|
||||
if( covariateRecalDatum != null ) {
|
||||
for (BitSet k : bitKeys) {
|
||||
final RecalDatum covariateRecalDatum = collapsedHashes.get(covariatesKeyIndex).get(k);
|
||||
if (covariateRecalDatum != null) {
|
||||
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
|
||||
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
|
||||
deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported));
|
||||
}
|
||||
}
|
||||
|
||||
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
|
||||
return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE );
|
||||
return QualityUtils.boundQual((int) Math.round(newQuality), QualityUtils.MAX_QUAL_SCORE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
|
||||
*
|
||||
* @param originalQuals The list of original base quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
*/
|
||||
private void preserveQScores( final byte[] originalQuals, final byte[] recalQuals ) {
|
||||
for( int iii = 0; iii < recalQuals.length; iii++ ) {
|
||||
if( originalQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ) { //BUGBUG: used to be Q5 now is Q6, probably doesn't matter
|
||||
private void preserveQScores(final byte[] originalQuals, final byte[] recalQuals) {
|
||||
for (int iii = 0; iii < recalQuals.length; iii++) {
|
||||
if (originalQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE) { //BUGBUG: used to be Q5 now is Q6, probably doesn't matter
|
||||
recalQuals[iii] = originalQuals[iii];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Shared functionality to add keys
|
||||
*
|
||||
* @param array the target array we are creating the keys in
|
||||
* @param keys the actual keys we're using as a source
|
||||
* @param covariateList the covariate list to loop through
|
||||
* @param keyIndex the index in the keys and the arrays objects to run from
|
||||
*/
|
||||
private void addKeysToArray(final Object[] array, final String[] keys, List<Covariate> covariateList, int keyIndex) {
|
||||
for (Covariate covariate : covariateList) {
|
||||
array[keyIndex] = covariate.getValue(keys[keyIndex]);
|
||||
keyIndex++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.sam;
|
|||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
|
@ -233,7 +234,17 @@ public class ArtificialSAMUtils {
|
|||
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar);
|
||||
}
|
||||
|
||||
public static GATKSAMRecord createArtificialRead(Cigar cigar) {
|
||||
int length = cigar.getReadLength();
|
||||
byte [] base = {'A'};
|
||||
byte [] qual = {30};
|
||||
byte [] bases = Utils.arrayFromArrayWithLength(base, length);
|
||||
byte [] quals = Utils.arrayFromArrayWithLength(qual, length);
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, quals, cigar.toString());
|
||||
}
|
||||
|
||||
|
||||
public final static List<GATKSAMRecord> createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
|
||||
GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen);
|
||||
GATKSAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen);
|
||||
|
|
@ -361,10 +372,10 @@ public class ArtificialSAMUtils {
|
|||
final GATKSAMRecord left = pair.get(0);
|
||||
final GATKSAMRecord right = pair.get(1);
|
||||
|
||||
pileupElements.add(new PileupElement(left, pos - leftStart, false, false, false, false));
|
||||
pileupElements.add(new PileupElement(left, pos - leftStart, false, false, false, false, false, false));
|
||||
|
||||
if (pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd()) {
|
||||
pileupElements.add(new PileupElement(right, pos - rightStart, false, false, false, false));
|
||||
pileupElements.add(new PileupElement(right, pos - rightStart, false, false, false, false, false, false));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@
|
|||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager;
|
||||
import org.broadinstitute.sting.gatk.walkers.bqsr.EventType;
|
||||
import org.broadinstitute.sting.utils.NGSPlatform;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
|
|
@ -165,7 +165,7 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
/**
|
||||
* Setters and Accessors for base insertion and base deletion quality scores
|
||||
*/
|
||||
public void setBaseQualities( final byte[] quals, final RecalDataManager.BaseRecalibrationType errorModel ) {
|
||||
public void setBaseQualities( final byte[] quals, final EventType errorModel ) {
|
||||
switch( errorModel ) {
|
||||
case BASE_SUBSTITUTION:
|
||||
setBaseQualities(quals);
|
||||
|
|
@ -181,7 +181,7 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
}
|
||||
}
|
||||
|
||||
public byte[] getBaseQualities( final RecalDataManager.BaseRecalibrationType errorModel ) {
|
||||
public byte[] getBaseQualities( final EventType errorModel ) {
|
||||
switch( errorModel ) {
|
||||
case BASE_SUBSTITUTION:
|
||||
return getBaseQualities();
|
||||
|
|
@ -204,7 +204,7 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
quals = new byte[getBaseQualities().length];
|
||||
Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will
|
||||
// be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
|
||||
setBaseQualities(quals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION);
|
||||
setBaseQualities(quals, EventType.BASE_INSERTION);
|
||||
}
|
||||
return quals;
|
||||
}
|
||||
|
|
@ -213,9 +213,9 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
byte[] quals = SAMUtils.fastqToPhred( getStringAttribute( BQSR_BASE_DELETION_QUALITIES ) );
|
||||
if( quals == null ) {
|
||||
quals = new byte[getBaseQualities().length];
|
||||
Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will
|
||||
// be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
|
||||
setBaseQualities(quals, RecalDataManager.BaseRecalibrationType.BASE_DELETION);
|
||||
Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will
|
||||
// be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
|
||||
setBaseQualities(quals, EventType.BASE_DELETION);
|
||||
}
|
||||
return quals;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.sam;
|
|||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
|
@ -495,7 +496,7 @@ public class ReadUtils {
|
|||
/**
|
||||
* Is a base inside a read?
|
||||
*
|
||||
* @param read the read to evaluate
|
||||
* @param read the read to evaluate
|
||||
* @param referenceCoordinate the reference coordinate of the base to test
|
||||
* @return true if it is inside the read, false otherwise.
|
||||
*/
|
||||
|
|
@ -541,9 +542,9 @@ public class ReadUtils {
|
|||
*
|
||||
* See getCoverageDistributionOfRead for information on how the coverage is calculated.
|
||||
*
|
||||
* @param list the list of reads covering the region
|
||||
* @param list the list of reads covering the region
|
||||
* @param startLocation the first reference coordinate of the region (inclusive)
|
||||
* @param stopLocation the last reference coordinate of the region (inclusive)
|
||||
* @param stopLocation the last reference coordinate of the region (inclusive)
|
||||
* @return an array with the coverage of each position from startLocation to stopLocation
|
||||
*/
|
||||
public static int [] getCoverageDistributionOfReads(List<GATKSAMRecord> list, int startLocation, int stopLocation) {
|
||||
|
|
@ -563,9 +564,9 @@ public class ReadUtils {
|
|||
* Note: This function counts DELETIONS as coverage (since the main purpose is to downsample
|
||||
* reads for variant regions, and deletions count as variants)
|
||||
*
|
||||
* @param read the read to get the coverage distribution of
|
||||
* @param read the read to get the coverage distribution of
|
||||
* @param startLocation the first reference coordinate of the region (inclusive)
|
||||
* @param stopLocation the last reference coordinate of the region (inclusive)
|
||||
* @param stopLocation the last reference coordinate of the region (inclusive)
|
||||
* @return an array with the coverage of each position from startLocation to stopLocation
|
||||
*/
|
||||
public static int [] getCoverageDistributionOfRead(GATKSAMRecord read, int startLocation, int stopLocation) {
|
||||
|
|
@ -611,9 +612,9 @@ public class ReadUtils {
|
|||
* Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with value==true meaning it contributes to the coverage.
|
||||
* Example: Read => {true, true, false, ... false}
|
||||
*
|
||||
* @param readList the list of reads to generate the association mappings
|
||||
* @param readList the list of reads to generate the association mappings
|
||||
* @param startLocation the first reference coordinate of the region (inclusive)
|
||||
* @param stopLocation the last reference coordinate of the region (inclusive)
|
||||
* @param stopLocation the last reference coordinate of the region (inclusive)
|
||||
* @return the two hashmaps described above
|
||||
*/
|
||||
public static Pair<HashMap<Integer, HashSet<GATKSAMRecord>> , HashMap<GATKSAMRecord, Boolean[]>> getBothReadToLociMappings (List<GATKSAMRecord> readList, int startLocation, int stopLocation) {
|
||||
|
|
@ -622,7 +623,6 @@ public class ReadUtils {
|
|||
HashMap<Integer, HashSet<GATKSAMRecord>> locusToReadMap = new HashMap<Integer, HashSet<GATKSAMRecord>>(2*(stopLocation - startLocation + 1), 0.5f);
|
||||
HashMap<GATKSAMRecord, Boolean[]> readToLocusMap = new HashMap<GATKSAMRecord, Boolean[]>(2*readList.size(), 0.5f);
|
||||
|
||||
|
||||
for (int i = startLocation; i <= stopLocation; i++)
|
||||
locusToReadMap.put(i, new HashSet<GATKSAMRecord>()); // Initialize the locusToRead map with empty lists
|
||||
|
||||
|
|
@ -631,7 +631,7 @@ public class ReadUtils {
|
|||
|
||||
int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation);
|
||||
|
||||
for (int i=0; i<readCoverage.length; i++) {
|
||||
for (int i = 0; i < readCoverage.length; i++) {
|
||||
int refLocation = i + startLocation;
|
||||
if (readCoverage[i] > 0) {
|
||||
// Update the hash for this locus
|
||||
|
|
@ -649,6 +649,55 @@ public class ReadUtils {
|
|||
return new Pair<HashMap<Integer, HashSet<GATKSAMRecord>>, HashMap<GATKSAMRecord, Boolean[]>>(locusToReadMap, readToLocusMap);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create random read qualities
|
||||
*
|
||||
* @param length the length of the read
|
||||
* @return an array with randomized base qualities between 0 and 50
|
||||
*/
|
||||
public static byte[] createRandomReadQuals(int length) {
|
||||
Random random = GenomeAnalysisEngine.getRandomGenerator();
|
||||
byte[] quals = new byte[length];
|
||||
for (int i = 0; i < length; i++)
|
||||
quals[i] = (byte) random.nextInt(50);
|
||||
return quals;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create random read qualities
|
||||
*
|
||||
* @param length the length of the read
|
||||
* @param allowNs whether or not to allow N's in the read
|
||||
* @return an array with randomized bases (A-N) with equal probability
|
||||
*/
|
||||
public static byte[] createRandomReadBases(int length, boolean allowNs) {
|
||||
Random random = GenomeAnalysisEngine.getRandomGenerator();
|
||||
int numberOfBases = allowNs ? 5 : 4;
|
||||
byte[] bases = new byte[length];
|
||||
for (int i = 0; i < length; i++) {
|
||||
switch (random.nextInt(numberOfBases)) {
|
||||
case 0:
|
||||
bases[i] = 'A';
|
||||
break;
|
||||
case 1:
|
||||
bases[i] = 'C';
|
||||
break;
|
||||
case 2:
|
||||
bases[i] = 'G';
|
||||
break;
|
||||
case 3:
|
||||
bases[i] = 'T';
|
||||
break;
|
||||
case 4:
|
||||
bases[i] = 'N';
|
||||
break;
|
||||
default:
|
||||
throw new ReviewedStingException("Something went wrong, this is just impossible");
|
||||
}
|
||||
}
|
||||
return bases;
|
||||
}
|
||||
|
||||
public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) {
|
||||
String[] sequenceRecordNames = new String[sequenceDictionary.size()];
|
||||
int sequenceRecordIndex = 0;
|
||||
|
|
@ -656,4 +705,5 @@ public class ReadUtils {
|
|||
sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName();
|
||||
return Arrays.deepToString(sequenceRecordNames);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,13 +25,10 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.jgrapht.util.MathUtil;
|
||||
|
||||
import java.util.EnumMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class GenotypeLikelihoods {
|
||||
public static final boolean CAP_PLS = false;
|
||||
|
|
@ -201,4 +198,118 @@ public class GenotypeLikelihoods {
|
|||
|
||||
return s.toString();
|
||||
}
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Static conversion utilities, going from GL/PL index to allele index and vice versa.
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
/*
|
||||
* Class representing the 2 alleles (or rather their indexes into VariantContext.getAllele()) corresponding to a specific PL index.
|
||||
* Note that the reference allele is always index=0.
|
||||
*/
|
||||
public static class GenotypeLikelihoodsAllelePair {
|
||||
public final int alleleIndex1, alleleIndex2;
|
||||
|
||||
public GenotypeLikelihoodsAllelePair(final int alleleIndex1, final int alleleIndex2) {
|
||||
this.alleleIndex1 = alleleIndex1;
|
||||
this.alleleIndex2 = alleleIndex2;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles
|
||||
*/
|
||||
private static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = new GenotypeLikelihoodsAllelePair[]{ new GenotypeLikelihoodsAllelePair(0, 0) };
|
||||
|
||||
private static void calculatePLcache(final int minIndex) {
|
||||
// how many alternate alleles do we need to calculate for?
|
||||
int altAlleles = 0;
|
||||
int numLikelihoods = 1;
|
||||
while ( numLikelihoods <= minIndex ) {
|
||||
altAlleles++;
|
||||
numLikelihoods += altAlleles + 1;
|
||||
}
|
||||
|
||||
PLIndexToAlleleIndex = new GenotypeLikelihoodsAllelePair[numLikelihoods];
|
||||
|
||||
// for all possible combinations of 2 alleles
|
||||
for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) {
|
||||
for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) {
|
||||
PLIndexToAlleleIndex[calculatePLindex(allele1, allele2)] = new GenotypeLikelihoodsAllelePair(allele1, allele2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// how many likelihoods are associated with the given number of alternate alleles?
|
||||
public static int calculateNumLikelihoods(int numAltAlleles) {
|
||||
int numLikelihoods = 1;
|
||||
for ( int i = 1; i <= numAltAlleles; i++ )
|
||||
numLikelihoods += i + 1;
|
||||
return numLikelihoods;
|
||||
}
|
||||
|
||||
// As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j.
|
||||
// In other words, for biallelic sites the ordering is: AA,AB,BB; for triallelic sites the ordering is: AA,AB,BB,AC,BC,CC, etc."
|
||||
// Assumes that allele1Index < allele2Index
|
||||
public static int calculatePLindex(final int allele1Index, final int allele2Index) {
|
||||
return (allele2Index * (allele2Index+1) / 2) + allele1Index;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the allele index pair for the given PL
|
||||
*
|
||||
* @param PLindex the PL index
|
||||
* @return the allele index pair
|
||||
*/
|
||||
public static GenotypeLikelihoodsAllelePair getAllelePair(final int PLindex) {
|
||||
// make sure that we've cached enough data
|
||||
if ( PLindex >= PLIndexToAlleleIndex.length )
|
||||
calculatePLcache(PLindex);
|
||||
|
||||
return PLIndexToAlleleIndex[PLindex];
|
||||
}
|
||||
|
||||
// An index conversion from the deprecated PL ordering to the new VCF-based ordering for up to 3 alternate alleles
|
||||
protected static int[] PLindexConversion = new int[]{0, 1, 3, 6, 2, 4, 7, 5, 8, 9};
|
||||
|
||||
/**
|
||||
* get the allele index pair for the given PL using the deprecated PL ordering:
|
||||
* AA,AB,AC,AD,BB,BC,BD,CC,CD,DD instead of AA,AB,BB,AC,BC,CC,AD,BD,CD,DD.
|
||||
* Although it's painful to keep this conversion around, our DiploidSNPGenotypeLikelihoods class uses the deprecated
|
||||
* ordering and I know with certainty that external users have built code on top of it; changing it now would
|
||||
* cause a whole lot of heartache for our collaborators, so for now at least there's a standard conversion method.
|
||||
* This method assumes at most 3 alternate alleles.
|
||||
* TODO -- address this issue at the source by updating DiploidSNPGenotypeLikelihoods.
|
||||
*
|
||||
* @param PLindex the PL index
|
||||
* @return the allele index pair
|
||||
*/
|
||||
public static GenotypeLikelihoodsAllelePair getAllelePairUsingDeprecatedOrdering(final int PLindex) {
|
||||
final int convertedIndex = PLindexConversion[PLindex];
|
||||
|
||||
// make sure that we've cached enough data
|
||||
if ( convertedIndex >= PLIndexToAlleleIndex.length )
|
||||
calculatePLcache(convertedIndex);
|
||||
|
||||
return PLIndexToAlleleIndex[convertedIndex];
|
||||
}
|
||||
|
||||
/**
|
||||
* get the PL indexes (AA, AB, BB) for the given allele pair; assumes allele1Index <= allele2Index.
|
||||
*
|
||||
* @param allele1Index the index in VariantContext.getAllele() of the first allele
|
||||
* @param allele2Index the index in VariantContext.getAllele() of the second allele
|
||||
* @return the PL indexes
|
||||
*/
|
||||
public static int[] getPLIndecesOfAlleles(final int allele1Index, final int allele2Index) {
|
||||
|
||||
final int[] indexes = new int[3];
|
||||
indexes[0] = calculatePLindex(allele1Index, allele1Index);
|
||||
indexes[1] = calculatePLindex(allele1Index, allele2Index);
|
||||
indexes[2] = calculatePLindex(allele2Index, allele2Index);
|
||||
return indexes;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -656,12 +656,21 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
return alleles.get(i+1);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param other VariantContext whose alleles to compare against
|
||||
* @return true if this VariantContext has the same alleles (both ref and alts) as other,
|
||||
* regardless of ordering. Otherwise returns false.
|
||||
*/
|
||||
public boolean hasSameAllelesAs ( final VariantContext other ) {
|
||||
return hasSameAlternateAllelesAs(other) && other.getReference().equals(getReference(), false);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param other VariantContext whose alternate alleles to compare against
|
||||
* @return true if this VariantContext has the same alternate alleles as other,
|
||||
* regardless of ordering. Otherwise returns false.
|
||||
*/
|
||||
public boolean hasSameAlternateAllelesAs ( VariantContext other ) {
|
||||
public boolean hasSameAlternateAllelesAs ( final VariantContext other ) {
|
||||
List<Allele> thisAlternateAlleles = getAlternateAlleles();
|
||||
List<Allele> otherAlternateAlleles = other.getAlternateAlleles();
|
||||
|
||||
|
|
@ -1246,40 +1255,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
return best;
|
||||
}
|
||||
|
||||
public int[] getGLIndecesOfAllele(Allele inputAllele) {
|
||||
public int[] getGLIndecesOfAlternateAllele(Allele targetAllele) {
|
||||
|
||||
// TODO -- this information is cached statically by the UnifiedGenotyperEngine; pull it out into a common utils class for all to use
|
||||
|
||||
int[] idxVector = new int[3];
|
||||
int numAlleles = this.getAlleles().size();
|
||||
|
||||
int idxDiag = numAlleles;
|
||||
int incr = numAlleles - 1;
|
||||
int k=1;
|
||||
for (Allele a: getAlternateAlleles()) {
|
||||
// multi-allelic approximation, part 1: Ideally
|
||||
// for each alt allele compute marginal (suboptimal) posteriors -
|
||||
// compute indices for AA,AB,BB for current allele - genotype likelihoods are a linear vector that can be thought of
|
||||
// as a row-wise upper triangular matrix of likelihoods.
|
||||
// So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC.
|
||||
// 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD
|
||||
|
||||
int idxAA = 0;
|
||||
int idxAB = k++;
|
||||
// yy is always element on the diagonal.
|
||||
// 2 alleles: BBelement 2
|
||||
// 3 alleles: BB element 3. CC element 5
|
||||
// 4 alleles:
|
||||
int idxBB = idxDiag;
|
||||
|
||||
if (a.equals(inputAllele)) {
|
||||
idxVector[0] = idxAA;
|
||||
idxVector[1] = idxAB;
|
||||
idxVector[2] = idxBB;
|
||||
int index = 1;
|
||||
for ( Allele allele : getAlternateAlleles() ) {
|
||||
if ( allele.equals(targetAllele) )
|
||||
break;
|
||||
}
|
||||
idxDiag += incr--;
|
||||
index++;
|
||||
}
|
||||
return idxVector;
|
||||
|
||||
return GenotypeLikelihoods.getPLIndecesOfAlleles(0, index);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ import com.google.java.contract.*;
|
|||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
|
|
@ -344,6 +345,21 @@ public class VariantContextBuilder {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells us that the resulting VariantContext should have the specified location
|
||||
* @param loc
|
||||
* @return
|
||||
*/
|
||||
@Requires({"loc.getContig() != null", "loc.getStart() >= 0", "loc.getStop() >= 0"})
|
||||
public VariantContextBuilder loc(final GenomeLoc loc) {
|
||||
this.contig = loc.getContig();
|
||||
this.start = loc.getStart();
|
||||
this.stop = loc.getStop();
|
||||
toValidate.add(VariantContext.Validation.ALLELES);
|
||||
toValidate.add(VariantContext.Validation.REF_PADDING);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells us that the resulting VariantContext should have the specified contig chr
|
||||
* @param contig
|
||||
|
|
|
|||
|
|
@ -458,7 +458,7 @@ public class VariantContextUtils {
|
|||
|
||||
/**
|
||||
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
|
||||
* If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
|
||||
* If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
|
||||
* the sample name
|
||||
*
|
||||
* @param genomeLocParser loc parser
|
||||
|
|
@ -492,11 +492,11 @@ public class VariantContextUtils {
|
|||
if ( genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE )
|
||||
verifyUniqueSampleNames(unsortedVCs);
|
||||
|
||||
List<VariantContext> prepaddedVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions);
|
||||
final List<VariantContext> prepaddedVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions);
|
||||
// Make sure all variant contexts are padded with reference base in case of indels if necessary
|
||||
List<VariantContext> VCs = new ArrayList<VariantContext>();
|
||||
final List<VariantContext> VCs = new ArrayList<VariantContext>();
|
||||
|
||||
for (VariantContext vc : prepaddedVCs) {
|
||||
for (final VariantContext vc : prepaddedVCs) {
|
||||
// also a reasonable place to remove filtered calls, if needed
|
||||
if ( ! filteredAreUncalled || vc.isNotFiltered() )
|
||||
VCs.add(createVariantContextWithPaddedAlleles(vc, false));
|
||||
|
|
@ -531,7 +531,7 @@ public class VariantContextUtils {
|
|||
|
||||
// cycle through and add info from the other VCs, making sure the loc/reference matches
|
||||
|
||||
for ( VariantContext vc : VCs ) {
|
||||
for ( final VariantContext vc : VCs ) {
|
||||
if ( loc.getStart() != vc.getStart() ) // || !first.getReference().equals(vc.getReference()) )
|
||||
throw new ReviewedStingException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString());
|
||||
|
||||
|
|
@ -581,13 +581,13 @@ public class VariantContextUtils {
|
|||
}
|
||||
}
|
||||
|
||||
for (Map.Entry<String, Object> p : vc.getAttributes().entrySet()) {
|
||||
for (final Map.Entry<String, Object> p : vc.getAttributes().entrySet()) {
|
||||
String key = p.getKey();
|
||||
// if we don't like the key already, don't go anywhere
|
||||
if ( ! inconsistentAttributes.contains(key) ) {
|
||||
boolean alreadyFound = attributes.containsKey(key);
|
||||
Object boundValue = attributes.get(key);
|
||||
boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4);
|
||||
final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4);
|
||||
|
||||
if ( alreadyFound && ! boundValue.equals(p.getValue()) && ! boundIsMissingValue ) {
|
||||
// we found the value but we're inconsistent, put it in the exclude list
|
||||
|
|
@ -604,7 +604,7 @@ public class VariantContextUtils {
|
|||
|
||||
// if we have more alternate alleles in the merged VC than in one or more of the
|
||||
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF
|
||||
for ( VariantContext vc : VCs ) {
|
||||
for ( final VariantContext vc : VCs ) {
|
||||
if (vc.alleles.size() == 1)
|
||||
continue;
|
||||
if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) {
|
||||
|
|
@ -634,11 +634,11 @@ public class VariantContextUtils {
|
|||
setValue = MERGE_INTERSECTION;
|
||||
else if ( nFiltered == VCs.size() ) // everything was filtered out
|
||||
setValue = MERGE_FILTER_IN_ALL;
|
||||
else if ( variantSources.isEmpty() ) // everyone was reference
|
||||
else if ( variantSources.isEmpty() ) // everyone was reference
|
||||
setValue = MERGE_REF_IN_ALL;
|
||||
else {
|
||||
LinkedHashSet<String> s = new LinkedHashSet<String>();
|
||||
for ( VariantContext vc : VCs )
|
||||
final LinkedHashSet<String> s = new LinkedHashSet<String>();
|
||||
for ( final VariantContext vc : VCs )
|
||||
if ( vc.isVariant() )
|
||||
s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() );
|
||||
setValue = Utils.join("-", s);
|
||||
|
|
@ -663,7 +663,7 @@ public class VariantContextUtils {
|
|||
builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes);
|
||||
|
||||
// Trim the padded bases of all alleles if necessary
|
||||
VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make());
|
||||
final VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make());
|
||||
if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged);
|
||||
return merged;
|
||||
}
|
||||
|
|
@ -724,7 +724,7 @@ public class VariantContextUtils {
|
|||
|
||||
Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
|
||||
|
||||
for (Allele a : inputVC.getAlleles()) {
|
||||
for (final Allele a : inputVC.getAlleles()) {
|
||||
if (a.isSymbolic()) {
|
||||
alleles.add(a);
|
||||
originalToTrimmedAlleleMap.put(a, a);
|
||||
|
|
@ -741,11 +741,9 @@ public class VariantContextUtils {
|
|||
// example: mixed records such as {TA*,TGA,TG}
|
||||
boolean hasNullAlleles = false;
|
||||
|
||||
for (Allele a: originalToTrimmedAlleleMap.values()) {
|
||||
for (final Allele a: originalToTrimmedAlleleMap.values()) {
|
||||
if (a.isNull())
|
||||
hasNullAlleles = true;
|
||||
if (a.isReference())
|
||||
refAllele = a;
|
||||
}
|
||||
|
||||
if (!hasNullAlleles)
|
||||
|
|
@ -755,7 +753,7 @@ public class VariantContextUtils {
|
|||
|
||||
List<Allele> originalAlleles = genotype.getAlleles();
|
||||
List<Allele> trimmedAlleles = new ArrayList<Allele>();
|
||||
for ( Allele a : originalAlleles ) {
|
||||
for ( final Allele a : originalAlleles ) {
|
||||
if ( a.isCalled() )
|
||||
trimmedAlleles.add(originalToTrimmedAlleleMap.get(a));
|
||||
else
|
||||
|
|
@ -837,7 +835,6 @@ public class VariantContextUtils {
|
|||
public AlleleMapper(Map<Allele, Allele> map) { this.map = map; }
|
||||
public boolean needsRemapping() { return this.map != null; }
|
||||
public Collection<Allele> values() { return map != null ? map.values() : vc.getAlleles(); }
|
||||
|
||||
public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; }
|
||||
|
||||
public List<Allele> remap(List<Allele> as) {
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@ import java.util.*;
|
|||
|
||||
public class WalkerTest extends BaseTest {
|
||||
private static final boolean ENABLE_PHONE_HOME_FOR_TESTS = false;
|
||||
private static final boolean ENABLE_ON_THE_FLY_CHECK_FOR_VCF_INDEX = false;
|
||||
|
||||
@BeforeMethod
|
||||
public void initializeRandomGenerator() {
|
||||
|
|
@ -58,6 +59,9 @@ public class WalkerTest extends BaseTest {
|
|||
}
|
||||
|
||||
public void maybeValidateSupplementaryFile(final String name, final File resultFile) {
|
||||
if ( !ENABLE_ON_THE_FLY_CHECK_FOR_VCF_INDEX )
|
||||
return;
|
||||
|
||||
File indexFile = Tribble.indexFile(resultFile);
|
||||
//System.out.println("Putative index file is " + indexFile);
|
||||
if ( indexFile.exists() ) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,60 @@
|
|||
package org.broadinstitute.sting.gatk.filters;
|
||||
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
/**
|
||||
* Checks that the Bad Cigar filter works for all kinds of wonky cigars
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 3/20/12
|
||||
*/
|
||||
public class BadCigarFilterUnitTest {
|
||||
|
||||
BadCigarFilter filter;
|
||||
|
||||
@BeforeClass
|
||||
public void init() {
|
||||
filter = new BadCigarFilter();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWonkyCigars () {
|
||||
byte[] bases = {'A', 'A', 'A', 'A'};
|
||||
byte[] quals = {30, 30, 30, 30};
|
||||
GATKSAMRecord read;
|
||||
// starting with multiple deletions
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2D4M");
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M2D"); // ending with multiple deletions
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "3M1I1D"); // adjacent indels AND ends in deletion
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1D2M"); // adjacent indels I->D
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1D2I1M"); // adjacent indels D->I
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I2M1D"); // ends in single deletion with insertion in the middle
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M1D"); // ends in single deletion
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1D4M"); // starts with single deletion
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2M1D1D2M"); // adjacent D's
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
|
||||
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1I1M"); // adjacent I's
|
||||
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||
}
|
||||
}
|
||||
|
|
@ -80,11 +80,15 @@ public class GATKReportUnitTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testSimpleGATKReport() {
|
||||
GATKReport report = GATKReport.newSimpleReport("TableName", "a", "b", "Roger", "is", "Awesome");
|
||||
report.addRow("a", 'F', 12, 23.45, true);
|
||||
report.addRow("ans", '3', 24.5, 456L, 2345);
|
||||
report.addRow("hi", null, null, "", 2.3);
|
||||
// Create a new simple GATK report named "TableName" with columns: Roger, is, and Awesome
|
||||
GATKReport report = GATKReport.newSimpleReport("TableName", "Roger", "is", "Awesome");
|
||||
|
||||
// Add data to simple GATK report
|
||||
report.addRow( 12, 23.45, true);
|
||||
report.addRow("ans", '3', 24.5);
|
||||
report.addRow("hi", "", 2.3);
|
||||
|
||||
// Print the report to console
|
||||
//report.print(System.out);
|
||||
|
||||
try {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,44 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.activeregionqc;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Tests CountReadsInActiveRegions
|
||||
*/
|
||||
public class CountReadsInActiveRegionsIntegrationTest extends WalkerTest {
|
||||
@Test
|
||||
public void basicTest() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T CountReadsInActiveRegions -R " + b37KGReference + " -I " + b37GoodNA12878BAM + " -L 20:10,000,000-10,200,000 -o %s",
|
||||
1,
|
||||
Arrays.asList("fcd581aa6befe85c7297509fa7b34edf"));
|
||||
executeTest("CountReadsInActiveRegions:", spec);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author Mauricio Carneiro
|
||||
* @since 3/7/12
|
||||
*/
|
||||
public class BQSRGathererUnitTest {
|
||||
RecalibrationArgumentCollection RAC;
|
||||
|
||||
private static File recal1 = new File("public/testdata/exampleCSV.csv");
|
||||
private static File recal2 = new File("public/testdata/exampleCSV.2.csv");
|
||||
|
||||
@Test(enabled = false)
|
||||
public void testCombineTwoFiles() {
|
||||
BQSRGatherer gatherer = new BQSRGatherer();
|
||||
List<File> recalFiles = new LinkedList<File> ();
|
||||
File output = new File("foo.csv");
|
||||
|
||||
recalFiles.add(recal1);
|
||||
recalFiles.add(recal2);
|
||||
gatherer.gather(recalFiles, output);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,9 +1,11 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.clipping.ClippingRepresentation;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
|
@ -12,37 +14,13 @@ import java.util.BitSet;
|
|||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* Short one line description of the walker.
|
||||
*
|
||||
* <p>
|
||||
* [Long description of the walker]
|
||||
* </p>
|
||||
*
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* [Description of the Input]
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* [Description of the Output]
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T [walker name]
|
||||
* </pre>
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 3/1/12
|
||||
*/
|
||||
public class ContextCovariateUnitTest {
|
||||
ContextCovariate covariate;
|
||||
RecalibrationArgumentCollection RAC;
|
||||
Random random;
|
||||
Random random;
|
||||
|
||||
@BeforeClass
|
||||
public void init() {
|
||||
|
|
@ -55,49 +33,33 @@ public class ContextCovariateUnitTest {
|
|||
|
||||
@Test(enabled = true)
|
||||
public void testSimpleContexts() {
|
||||
byte [] quals = createRandomReadQuals(101);
|
||||
byte [] bbases = createRandomReadBases(101);
|
||||
String bases = stringFrom(bbases);
|
||||
byte[] quals = ReadUtils.createRandomReadQuals(10000);
|
||||
byte[] bbases = ReadUtils.createRandomReadBases(10000, true);
|
||||
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
|
||||
GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS);
|
||||
CovariateValues values = covariate.getValues(read);
|
||||
verifyCovariateArray((BitSet []) values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, bases);
|
||||
verifyCovariateArray((BitSet []) values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, bases);
|
||||
verifyCovariateArray((BitSet []) values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, bases);
|
||||
verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases()));
|
||||
verifyCovariateArray(values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases()));
|
||||
verifyCovariateArray(values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases()));
|
||||
}
|
||||
|
||||
|
||||
private void verifyCovariateArray(BitSet[] values, int contextSize, String bases) {
|
||||
for (int i=0; i<values.length; i++) {
|
||||
if (i >= contextSize)
|
||||
Assert.assertEquals(MathUtils.dnaFrom(values[i]), bases.substring(i-contextSize, i));
|
||||
else
|
||||
Assert.assertNull(values[i]);
|
||||
for (int i = 0; i < values.length; i++) {
|
||||
String expectedContext = null;
|
||||
if (i >= contextSize) {
|
||||
String context = bases.substring(i - contextSize, i);
|
||||
if (!context.contains("N"))
|
||||
expectedContext = context;
|
||||
}
|
||||
Assert.assertEquals(covariate.keyFromBitSet(values[i]), expectedContext);
|
||||
}
|
||||
}
|
||||
|
||||
private String stringFrom(byte [] array) {
|
||||
private String stringFrom(byte[] array) {
|
||||
String s = "";
|
||||
for (byte value : array)
|
||||
s += (char) value;
|
||||
return s;
|
||||
}
|
||||
|
||||
private byte [] createRandomReadQuals(int length) {
|
||||
byte [] quals = new byte[length];
|
||||
for (int i=0; i<length; i++)
|
||||
quals[i] = (byte) random.nextInt(50);
|
||||
return quals;
|
||||
}
|
||||
|
||||
private byte [] createRandomReadBases(int length) {
|
||||
byte [] bases = new byte[length];
|
||||
for (int i=0; i<length; i++) {
|
||||
switch(random.nextInt(4)) {
|
||||
case 0: bases[i] = 'A'; break;
|
||||
case 1: bases[i] = 'C'; break;
|
||||
case 2: bases[i] = 'G'; break;
|
||||
case 3: bases[i] = 'T'; break;
|
||||
}
|
||||
}
|
||||
return bases;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,59 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* @author Mauricio Carneiro
|
||||
* @since 3/1/12
|
||||
*/
|
||||
public class CycleCovariateUnitTest {
|
||||
CycleCovariate covariate;
|
||||
RecalibrationArgumentCollection RAC;
|
||||
Random random;
|
||||
|
||||
@BeforeClass
|
||||
public void init() {
|
||||
RAC = new RecalibrationArgumentCollection();
|
||||
covariate = new CycleCovariate();
|
||||
random = GenomeAnalysisEngine.getRandomGenerator();
|
||||
covariate.initialize(RAC);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testSimpleCycles() {
|
||||
short readLength = 10;
|
||||
byte[] quals = ReadUtils.createRandomReadQuals(readLength);
|
||||
byte[] bbases = ReadUtils.createRandomReadBases(readLength, true);
|
||||
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
|
||||
read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID"));
|
||||
read.getReadGroup().setPlatform("illumina");
|
||||
|
||||
CovariateValues values = covariate.getValues(read);
|
||||
verifyCovariateArray(values.getMismatches(), (short) 1, (short) 1);
|
||||
|
||||
read.setReadNegativeStrandFlag(true);
|
||||
values = covariate.getValues(read);
|
||||
verifyCovariateArray(values.getMismatches(), readLength, (short) -1);
|
||||
|
||||
}
|
||||
|
||||
private void verifyCovariateArray(BitSet[] values, short init, short increment) {
|
||||
for (short i = 0; i < values.length; i++) {
|
||||
short actual = Short.decode(covariate.keyFromBitSet(values[i]));
|
||||
int expected = init + (increment * i);
|
||||
// System.out.println(String.format("%d: %d, %d", i, actual, expected));
|
||||
Assert.assertEquals(actual, expected);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -50,8 +50,8 @@ public class DiffObjectsIntegrationTest extends WalkerTest {
|
|||
|
||||
@DataProvider(name = "data")
|
||||
public Object[][] createData() {
|
||||
new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "dac62fcd25e1052bf18b5707700dda7e");
|
||||
new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "e10c48dd294fb257802d4e73bb50580d");
|
||||
new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "dba5eab2b9587c1062721b164e4fd9a6");
|
||||
new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "de35c93450b46db5fc5516af3c55d62a");
|
||||
return TestParams.getTests(TestParams.class);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -27,8 +27,8 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
|
|||
BB1 = new double[]{-20.0, -20.0, 0.0};
|
||||
AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0};
|
||||
AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0};
|
||||
AC2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0};
|
||||
BB2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0};
|
||||
AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0};
|
||||
BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0};
|
||||
BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0};
|
||||
CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,8 +5,10 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
|||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
// ********************************************************************************** //
|
||||
|
|
@ -60,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultipleSNPAlleles() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1,
|
||||
Arrays.asList("5af005255240a2186f04cb50851b8b6f"));
|
||||
Arrays.asList("0de4aeed6a52f08ed86a7642c812478b"));
|
||||
executeTest("test Multiple SNP alleles", spec);
|
||||
}
|
||||
|
||||
|
|
@ -277,52 +279,53 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testWithIndelAllelesPassedIn1() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
|
||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("9cd08dc412a007933381e9c76c073899"));
|
||||
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1);
|
||||
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWithIndelAllelesPassedIn2() {
|
||||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
|
||||
+ validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
|
||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("5ef1f007d3ef77c1b8f31e5e036eff53"));
|
||||
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2);
|
||||
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWithIndelAllelesPassedIn3() {
|
||||
public void testMultiSampleIndels() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
|
||||
Arrays.asList("52340d578a708fa709b69ce48987bc9d"));
|
||||
List<File> result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
|
||||
|
||||
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation +
|
||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1,
|
||||
Arrays.asList("2609675a356f2dfc86f8a1d911210978"));
|
||||
executeTest("test MultiSample Pilot2 indels with complicated records", spec3);
|
||||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
|
||||
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
|
||||
Arrays.asList("9566c7abef5ee5829a516d90445b347f"));
|
||||
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWithIndelAllelesPassedIn4() {
|
||||
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
|
||||
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
|
||||
Arrays.asList("4fdd8da77167881b71b3547da5c13f94"));
|
||||
executeTest("test MultiSample Phase1 indels with complicated records", spec4);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWithIndelAllelesPassedIn5() {
|
||||
public void testGGAwithNoEvidenceInReads() {
|
||||
final String vcf = "small.indel.test.vcf";
|
||||
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + vcf + " -I " + validationDataLocation +
|
||||
"NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1,
|
||||
Arrays.asList("7d069596597aee5e0d562964036141eb"));
|
||||
executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec4);
|
||||
executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing SnpEff
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Test
|
||||
public void testSnpEffAnnotationRequestedWithoutRodBinding() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
|
|||
|
|
@ -110,7 +110,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
|
|||
" -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" +
|
||||
" -genotypeMergeOptions UNIQUIFY -L 1"),
|
||||
1,
|
||||
Arrays.asList("ab72f4bfb16d3894942149173a087647"));
|
||||
Arrays.asList("ee43a558fd3faeaa447acab89f0001d5"));
|
||||
executeTest("threeWayWithRefs", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,75 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* @author Mauricio Carneiro
|
||||
* @since 3/5/12
|
||||
*/
|
||||
|
||||
public class BitSetUtilsUnitTest {
|
||||
private static int RANDOM_NUMBERS_TO_TRY = 87380;
|
||||
private static Random random;
|
||||
|
||||
@BeforeClass
|
||||
public void init() {
|
||||
random = GenomeAnalysisEngine.getRandomGenerator();
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testLongBitSet() {
|
||||
long[] numbers = {0L, 1L, 428L, 65536L, 239847L, 4611686018427387903L, Long.MAX_VALUE, Long.MIN_VALUE, -1L, -2L, -7L, -128L, -65536L, -100000L};
|
||||
for (long n : numbers)
|
||||
Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(n)), n);
|
||||
|
||||
for (int i = 0; i < RANDOM_NUMBERS_TO_TRY; i++) {
|
||||
long n = random.nextLong();
|
||||
Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(n)), n); // Because class Random uses a seed with only 48 bits, this algorithm will not return all possible long values.
|
||||
}
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testShortBitSet() {
|
||||
short[] numbers = {0, 1, 428, 25934, 23847, 16168, Short.MAX_VALUE, Short.MIN_VALUE, -1, -2, -7, -128, -12312, -31432};
|
||||
for (long n : numbers)
|
||||
Assert.assertEquals(BitSetUtils.shortFrom(BitSetUtils.bitSetFrom(n)), n);
|
||||
|
||||
for (int i = 0; i < RANDOM_NUMBERS_TO_TRY; i++) {
|
||||
short n = (short) random.nextInt();
|
||||
Assert.assertEquals(BitSetUtils.shortFrom(BitSetUtils.bitSetFrom(n)), n);
|
||||
}
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testDNAAndBitSetConversion() {
|
||||
String[] dna = {"AGGTGTTGT", "CCCCCCCCCCCCCC", "GGGGGGGGGGGGGG", "TTTTTTTTTTTTTT", "GTAGACCGATCTCAGCTAGT", "AACGTCAATGCAGTCAAGTCAGACGTGGGTT", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"};
|
||||
|
||||
// Test all contexts of size 1-8.
|
||||
for (long n = 0; n < RANDOM_NUMBERS_TO_TRY; n++)
|
||||
Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(BitSetUtils.dnaFrom(BitSetUtils.bitSetFrom(n)))), n);
|
||||
|
||||
// Test the special cases listed in the dna array
|
||||
for (String d : dna)
|
||||
Assert.assertEquals(BitSetUtils.dnaFrom(BitSetUtils.bitSetFrom(d)), d);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testNumberOfBitsToRepresent() {
|
||||
Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(0), 0); // Make sure 0 elements need 0 bits to be represented
|
||||
Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(1), 1); // Make sure 1 element needs 1 bit to be represented
|
||||
Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(3), 2); // Make sure 3 elements need 2 bit to be represented
|
||||
|
||||
for (int i = 1; i < 63; i++) { // Can't test i == 63 because n1 is a negative number
|
||||
long n1 = 1L << i;
|
||||
long n2 = Math.abs(random.nextLong()) % n1;
|
||||
long n3 = n1 | n2;
|
||||
Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(n3), (n3 == n1) ? i : i + 1);
|
||||
Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(n1), i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -25,7 +25,6 @@
|
|||
|
||||
package org.broadinstitute.sting.utils;
|
||||
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
|
|
@ -131,7 +130,8 @@ public class MathUtilsUnitTest extends BaseTest {
|
|||
int[] numbers = {1, 2, 4, 5, 3, 128, 25678, -24};
|
||||
MathUtils.RunningAverage r = new MathUtils.RunningAverage();
|
||||
|
||||
for (int i = 0; i < numbers.length; i++) r.add((double) numbers[i]);
|
||||
for (int i = 0; i < numbers.length; i++)
|
||||
r.add((double) numbers[i]);
|
||||
|
||||
Assert.assertEquals((long) numbers.length, r.observationCount());
|
||||
Assert.assertTrue(r.mean() - 3224.625 < 2e-10);
|
||||
|
|
@ -223,37 +223,14 @@ public class MathUtilsUnitTest extends BaseTest {
|
|||
return set.isEmpty();
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testIntAndBitSetConversion() {
|
||||
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(428)), 428);
|
||||
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(239847)), 239847);
|
||||
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(12726)), 12726);
|
||||
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(0)), 0);
|
||||
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(1)), 1);
|
||||
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(65536)), 65536);
|
||||
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(Long.MAX_VALUE)), Long.MAX_VALUE);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testDNAAndBitSetConversion() {
|
||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("ACGT")), "ACGT");
|
||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AGGTGTTGT")), "AGGTGTTGT");
|
||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("A")), "A");
|
||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("C")), "C");
|
||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("G")), "G");
|
||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("T")), "T");
|
||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("CC")), "CC");
|
||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AA")), "AA");
|
||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AAAA")), "AAAA");
|
||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("CCCCCCCCCCCCCC")), "CCCCCCCCCCCCCC");
|
||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("GGGGGGGGGGGGGG")), "GGGGGGGGGGGGGG");
|
||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("TTTTTTTTTTTTTT")), "TTTTTTTTTTTTTT");
|
||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("GTAGACCGATCTCAGCTAGT")), "GTAGACCGATCTCAGCTAGT");
|
||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AACGTCAATGCAGTCAAGTCAGACGTGGGTT")), "AACGTCAATGCAGTCAAGTCAGACGTGGGTT"); // testing max precision (length == 31)
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testApproximateLog10SumLog10() {
|
||||
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0}), 0.0, 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-5.15}), -5.15, 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {130.0}), 130.0, 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.145}), -0.145, 1e-3);
|
||||
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 1e-3);
|
||||
|
|
@ -266,55 +243,57 @@ public class MathUtilsUnitTest extends BaseTest {
|
|||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, Double.NEGATIVE_INFINITY), -0.12345, 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, Double.NEGATIVE_INFINITY), -15.7654, 1e-3);
|
||||
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-0.12345, -0.23456}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), 1e-3);
|
||||
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, -1.0, -2.5}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0, -2.5}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), 1e-3);
|
||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), 1e-3);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNormalizeFromLog10() {
|
||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{0.0, 0.0, -1.0, -1.1, -7.8}, false, true), new double[]{0.0, 0.0, -1.0, -1.1, -7.8}));
|
||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -1.0, -1.0, -1.1, -7.8}, false, true), new double[]{0.0, 0.0, 0.0, -0.1, -6.8}));
|
||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-10.0, -7.8, -10.5, -1.1, -10.0}, false, true), new double[]{-8.9, -6.7, -9.4, 0.0, -8.9}));
|
||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {0.0, 0.0, -1.0, -1.1, -7.8}, false, true), new double[] {0.0, 0.0, -1.0, -1.1, -7.8}));
|
||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -1.0, -1.0, -1.1, -7.8}, false, true), new double[] {0.0, 0.0, 0.0, -0.1, -6.8}));
|
||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-10.0, -7.8, -10.5, -1.1, -10.0}, false, true), new double[] {-8.9, -6.7, -9.4, 0.0, -8.9}));
|
||||
|
||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -1.0, -1.0, -1.0}), new double[]{0.25, 0.25, 0.25, 0.25}));
|
||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -3.0, -1.0, -1.0}), new double[]{0.1 * 1.0 / 0.301, 0.001 * 1.0 / 0.301, 0.1 * 1.0 / 0.301, 0.1 * 1.0 / 0.301}));
|
||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -3.0, -1.0, -2.0}), new double[]{0.1 * 1.0 / 0.211, 0.001 * 1.0 / 0.211, 0.1 * 1.0 / 0.211, 0.01 * 1.0 / 0.211}));
|
||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -1.0, -1.0, -1.0}), new double[] {0.25, 0.25, 0.25, 0.25}));
|
||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -3.0, -1.0, -1.0}), new double[] {0.1 * 1.0 / 0.301, 0.001 * 1.0 / 0.301, 0.1 * 1.0 / 0.301, 0.1 * 1.0 / 0.301}));
|
||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -3.0, -1.0, -2.0}), new double[] {0.1 * 1.0 / 0.211, 0.001 * 1.0 / 0.211, 0.1 * 1.0 / 0.211, 0.01 * 1.0 / 0.211}));
|
||||
}
|
||||
|
||||
/**
|
||||
* Private function used by testNormalizeFromLog10()
|
||||
*/
|
||||
private boolean compareDoubleArrays(double[] b1, double[] b2) {
|
||||
if( b1.length != b2.length ) {
|
||||
if (b1.length != b2.length) {
|
||||
return false; // sanity check
|
||||
}
|
||||
|
||||
for( int i=0; i < b1.length; i++ ){
|
||||
if ( MathUtils.compareDoubles(b1[i], b2[i]) != 0 )
|
||||
for (int i = 0; i < b1.length; i++) {
|
||||
if (MathUtils.compareDoubles(b1[i], b2[i]) != 0)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,66 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 3/21/12
|
||||
*/
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Basic unit test for QualityUtils class
|
||||
*/
|
||||
public class QualityUtilsUnitTest extends BaseTest {
|
||||
@BeforeClass
|
||||
public void init() {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testQualCaches() {
|
||||
Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 20), 0.01, 1e-6);
|
||||
Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 20), -2.0, 1e-6);
|
||||
Assert.assertEquals(QualityUtils.qualToProb((byte) 20), 0.99, 1e-6);
|
||||
Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 20), -0.0043648054, 1e-6);
|
||||
|
||||
Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 30), 0.001, 1e-6);
|
||||
Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 30), -3.0, 1e-6);
|
||||
Assert.assertEquals(QualityUtils.qualToProb((byte) 30), 0.999, 1e-6);
|
||||
Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 30), -0.000434511774, 1e-6);
|
||||
|
||||
Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 40), 0.0001, 1e-6);
|
||||
Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 40), -4.0, 1e-6);
|
||||
Assert.assertEquals(QualityUtils.qualToProb((byte) 40), 0.9999, 1e-6);
|
||||
Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 40), -4.34316198e-5, 1e-6);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,148 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
// our package
|
||||
package org.broadinstitute.sting.utils.activeregion;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.recalibration.QualQuantizer;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.lang.reflect.Array;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class ActivityProfileUnitTest extends BaseTest {
|
||||
private GenomeLocParser genomeLocParser;
|
||||
private GenomeLoc startLoc;
|
||||
|
||||
@BeforeClass
|
||||
public void init() throws FileNotFoundException {
|
||||
// sequence
|
||||
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
|
||||
genomeLocParser = new GenomeLocParser(seq);
|
||||
startLoc = genomeLocParser.createGenomeLoc("chr1", 1, 1, 100);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Basic tests Provider
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private class BasicActivityProfileTestProvider extends TestDataProvider {
|
||||
List<Double> probs;
|
||||
List<ActiveRegion> expectedRegions;
|
||||
int extension = 0;
|
||||
GenomeLoc regionStart = startLoc;
|
||||
|
||||
public BasicActivityProfileTestProvider(final List<Double> probs, final List<ActiveRegion> expectedRegions) {
|
||||
super(BasicActivityProfileTestProvider.class);
|
||||
this.probs = probs;
|
||||
this.expectedRegions = expectedRegions;
|
||||
setName(getName());
|
||||
}
|
||||
|
||||
public BasicActivityProfileTestProvider(final List<Double> probs, boolean startActive, int ... startsAndStops) {
|
||||
super(BasicActivityProfileTestProvider.class);
|
||||
this.probs = probs;
|
||||
this.expectedRegions = toRegions(startActive, startsAndStops);
|
||||
setName(getName());
|
||||
}
|
||||
|
||||
private String getName() {
|
||||
return String.format("probs=%s expectedRegions=%s", Utils.join(",", probs), Utils.join(",", expectedRegions));
|
||||
}
|
||||
|
||||
private List<ActiveRegion> toRegions(boolean isActive, int[] startsAndStops) {
|
||||
List<ActiveRegion> l = new ArrayList<ActiveRegion>();
|
||||
for ( int i = 0; i < startsAndStops.length - 1; i++) {
|
||||
int start = regionStart.getStart() + startsAndStops[i];
|
||||
int end = regionStart.getStart() + startsAndStops[i+1] - 1;
|
||||
GenomeLoc activeLoc = genomeLocParser.createGenomeLoc(regionStart.getContig(), start, end);
|
||||
ActiveRegion r = new ActiveRegion(activeLoc, isActive, genomeLocParser, extension);
|
||||
l.add(r);
|
||||
isActive = ! isActive;
|
||||
}
|
||||
return l;
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "BasicActivityProfileTestProvider")
|
||||
public Object[][] makeQualIntervalTestProvider() {
|
||||
new BasicActivityProfileTestProvider(Arrays.asList(1.0), true, 0, 1);
|
||||
new BasicActivityProfileTestProvider(Arrays.asList(1.0, 0.0), true, 0, 1, 2);
|
||||
new BasicActivityProfileTestProvider(Arrays.asList(0.0, 1.0), false, 0, 1, 2);
|
||||
new BasicActivityProfileTestProvider(Arrays.asList(1.0, 0.0, 1.0), true, 0, 1, 2, 3);
|
||||
new BasicActivityProfileTestProvider(Arrays.asList(1.0, 1.0, 1.0), true, 0, 3);
|
||||
|
||||
return BasicActivityProfileTestProvider.getTests(BasicActivityProfileTestProvider.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BasicActivityProfileTestProvider")
|
||||
public void testBasicActivityProfile(BasicActivityProfileTestProvider cfg) {
|
||||
ActivityProfile profile = new ActivityProfile(genomeLocParser, false);
|
||||
|
||||
Assert.assertEquals(profile.parser, genomeLocParser);
|
||||
|
||||
for ( int i = 0; i < cfg.probs.size(); i++ ) {
|
||||
double p = cfg.probs.get(i);
|
||||
GenomeLoc loc = genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart() + i, cfg.regionStart.getStart() + i);
|
||||
profile.add(loc, p);
|
||||
}
|
||||
Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() ));
|
||||
|
||||
Assert.assertEquals(profile.size(), cfg.probs.size());
|
||||
Assert.assertEquals(profile.isActiveList, cfg.probs);
|
||||
|
||||
assertRegionsAreEqual(profile.createActiveRegions(0), cfg.expectedRegions);
|
||||
}
|
||||
|
||||
private void assertRegionsAreEqual(List<ActiveRegion> actual, List<ActiveRegion> expected) {
|
||||
Assert.assertEquals(actual.size(), expected.size());
|
||||
for ( int i = 0; i < actual.size(); i++ ) {
|
||||
Assert.assertTrue(actual.get(i).equalExceptReads(expected.get(i)));
|
||||
}
|
||||
}
|
||||
|
||||
// todo -- test extensions
|
||||
}
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
package org.broadinstitute.sting.utils.recalibration;
|
||||
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Unit tests for on-the-fly recalibration.
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 3/16/12
|
||||
*/
|
||||
public class BaseRecalibrationUnitTest {
|
||||
|
||||
@Test(enabled=true)
|
||||
public void testReadingCSV() {
|
||||
File csv = new File("public/testdata/exampleCSV.csv");
|
||||
BaseRecalibration baseRecalibration = new BaseRecalibration(csv);
|
||||
System.out.println("Success");
|
||||
}
|
||||
}
|
||||
|
|
@ -42,8 +42,8 @@ public class GATKSAMRecordUnitTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testReducedReadPileupElement() {
|
||||
PileupElement readp = new PileupElement(read, 0, false, false, false, false);
|
||||
PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false, false);
|
||||
PileupElement readp = new PileupElement(read, 0, false, false, false, false, false, false);
|
||||
PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false, false, false, false);
|
||||
|
||||
Assert.assertFalse(readp.getRead().isReducedRead());
|
||||
|
||||
|
|
|
|||
|
|
@ -117,7 +117,7 @@ public class GenotypeLikelihoodsUnitTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testgetQualFromLikelihoods(){
|
||||
public void testgetQualFromLikelihoods() {
|
||||
double[] likelihoods = new double[]{-1, 0, -2};
|
||||
// qual values we expect for each possible "best" genotype
|
||||
double[] expectedQuals = new double[]{-0.04100161, -1, -0.003930294};
|
||||
|
|
@ -134,4 +134,33 @@ public class GenotypeLikelihoodsUnitTest {
|
|||
Assert.assertEquals(v1[i], v2[i], 1e-6);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCalculatePLindex(){
|
||||
int counter = 0;
|
||||
for ( int i = 0; i <= 3; i++ ) {
|
||||
for ( int j = i; j <= 3; j++ ) {
|
||||
Assert.assertEquals(GenotypeLikelihoods.calculatePLindex(i, j), GenotypeLikelihoods.PLindexConversion[counter++], "PL index of alleles " + i + "," + j + " was not calculated correctly");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetAllelePair(){
|
||||
allelePairTest(0, 0, 0);
|
||||
allelePairTest(1, 0, 1);
|
||||
allelePairTest(2, 1, 1);
|
||||
allelePairTest(3, 0, 2);
|
||||
allelePairTest(4, 1, 2);
|
||||
allelePairTest(5, 2, 2);
|
||||
allelePairTest(6, 0, 3);
|
||||
allelePairTest(7, 1, 3);
|
||||
allelePairTest(8, 2, 3);
|
||||
allelePairTest(9, 3, 3);
|
||||
}
|
||||
|
||||
private void allelePairTest(int PLindex, int allele1, int allele2) {
|
||||
Assert.assertEquals(GenotypeLikelihoods.getAllelePair(PLindex).alleleIndex1, allele1, "allele index " + allele1 + " from PL index " + PLindex + " was not calculated correctly");
|
||||
Assert.assertEquals(GenotypeLikelihoods.getAllelePair(PLindex).alleleIndex2, allele2, "allele index " + allele2 + " from PL index " + PLindex + " was not calculated correctly");
|
||||
}
|
||||
}
|
||||
|
|
@ -236,6 +236,16 @@ public class VariantContextUnitTest extends BaseTest {
|
|||
Assert.assertEquals(vc.getSampleNames().size(), 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMatchingAlleles() {
|
||||
List<Allele> alleles = Arrays.asList(ATCref, del);
|
||||
VariantContext vc = new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, alleles).referenceBaseForIndel((byte)'A').make();
|
||||
VariantContext vc2 = new VariantContextBuilder("test2", delLoc, delLocStart+12, delLocStop+12, alleles).referenceBaseForIndel((byte)'A').make();
|
||||
|
||||
Assert.assertTrue(vc.hasSameAllelesAs(vc2));
|
||||
Assert.assertTrue(vc.hasSameAlternateAllelesAs(vc2));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreatingInsertionVariantContext() {
|
||||
List<Allele> alleles = Arrays.asList(delRef, ATC);
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue