Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
b02ef95bcf
|
|
@ -613,7 +613,7 @@ public class GenomeAnalysisEngine {
|
||||||
*/
|
*/
|
||||||
protected GenomeLocSortedSet loadIntervals( List<IntervalBinding<Feature>> argList, IntervalSetRule rule ) {
|
protected GenomeLocSortedSet loadIntervals( List<IntervalBinding<Feature>> argList, IntervalSetRule rule ) {
|
||||||
|
|
||||||
List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>(0);
|
List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>();
|
||||||
for ( IntervalBinding intervalBinding : argList ) {
|
for ( IntervalBinding intervalBinding : argList ) {
|
||||||
List<GenomeLoc> intervals = intervalBinding.getIntervals(this);
|
List<GenomeLoc> intervals = intervalBinding.getIntervals(this);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -40,17 +40,26 @@ public class BadCigarFilter extends ReadFilter {
|
||||||
|
|
||||||
public boolean filterOut(final SAMRecord rec) {
|
public boolean filterOut(final SAMRecord rec) {
|
||||||
Cigar c = rec.getCigar();
|
Cigar c = rec.getCigar();
|
||||||
boolean lastElementWasIndel = false;
|
boolean previousElementWasIndel = false;
|
||||||
for ( CigarElement ce : c.getCigarElements() ) {
|
CigarOperator lastOp = c.getCigarElement(0).getOperator();
|
||||||
if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) {
|
|
||||||
if ( lastElementWasIndel )
|
if (lastOp == CigarOperator.D) // filter out reads starting with deletion
|
||||||
return true;
|
return true;
|
||||||
lastElementWasIndel = true;
|
|
||||||
} else {
|
for (CigarElement ce : c.getCigarElements()) {
|
||||||
lastElementWasIndel = false;
|
CigarOperator op = ce.getOperator();
|
||||||
|
if (op == CigarOperator.D || op == CigarOperator.I) {
|
||||||
|
if (previousElementWasIndel)
|
||||||
|
return true; // filter out reads with adjacent I/D
|
||||||
|
|
||||||
|
previousElementWasIndel = true;
|
||||||
}
|
}
|
||||||
|
else // this is a regular base (match/mismatch/hard or soft clip)
|
||||||
|
previousElementWasIndel = false; // reset the previous element
|
||||||
|
|
||||||
|
lastOp = op;
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return lastOp == CigarOperator.D;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -179,6 +179,11 @@ public class LocusIteratorByState extends LocusIterator {
|
||||||
return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement );
|
return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public CigarElement peekBackwardOnGenome() {
|
||||||
|
return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public CigarOperator stepForwardOnGenome() {
|
public CigarOperator stepForwardOnGenome() {
|
||||||
// we enter this method with readOffset = index of the last processed base on the read
|
// we enter this method with readOffset = index of the last processed base on the read
|
||||||
// (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion
|
// (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion
|
||||||
|
|
@ -194,7 +199,7 @@ public class LocusIteratorByState extends LocusIterator {
|
||||||
return stepForwardOnGenome();
|
return stepForwardOnGenome();
|
||||||
} else {
|
} else {
|
||||||
if (curElement != null && curElement.getOperator() == CigarOperator.D)
|
if (curElement != null && curElement.getOperator() == CigarOperator.D)
|
||||||
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString());
|
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads ending in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar");
|
||||||
|
|
||||||
// Reads that contain indels model the genomeOffset as the following base in the reference. Because
|
// Reads that contain indels model the genomeOffset as the following base in the reference. Because
|
||||||
// we fall into this else block only when indels end the read, increment genomeOffset such that the
|
// we fall into this else block only when indels end the read, increment genomeOffset such that the
|
||||||
|
|
@ -231,7 +236,7 @@ public class LocusIteratorByState extends LocusIterator {
|
||||||
// we see insertions only once, when we step right onto them; the position on the read is scrolled
|
// we see insertions only once, when we step right onto them; the position on the read is scrolled
|
||||||
// past the insertion right after that
|
// past the insertion right after that
|
||||||
if (eventDelayedFlag > 1)
|
if (eventDelayedFlag > 1)
|
||||||
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
|
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
|
||||||
insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength());
|
insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength());
|
||||||
eventLength = curElement.getLength();
|
eventLength = curElement.getLength();
|
||||||
eventStart = readOffset;
|
eventStart = readOffset;
|
||||||
|
|
@ -244,13 +249,13 @@ public class LocusIteratorByState extends LocusIterator {
|
||||||
break;
|
break;
|
||||||
case D: // deletion w.r.t. the reference
|
case D: // deletion w.r.t. the reference
|
||||||
if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
|
if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
|
||||||
throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString());
|
throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads starting in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar");
|
||||||
if (generateExtendedEvents) {
|
if (generateExtendedEvents) {
|
||||||
if (cigarElementCounter == 1) {
|
if (cigarElementCounter == 1) {
|
||||||
// generate an extended event only if we just stepped into the deletion (i.e. don't
|
// generate an extended event only if we just stepped into the deletion (i.e. don't
|
||||||
// generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!)
|
// generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!)
|
||||||
if (eventDelayedFlag > 1)
|
if (eventDelayedFlag > 1)
|
||||||
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
|
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
|
||||||
eventLength = curElement.getLength();
|
eventLength = curElement.getLength();
|
||||||
eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only
|
eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only
|
||||||
eventStart = readOffset;
|
eventStart = readOffset;
|
||||||
|
|
@ -401,24 +406,24 @@ public class LocusIteratorByState extends LocusIterator {
|
||||||
|
|
||||||
while (iterator.hasNext()) {
|
while (iterator.hasNext()) {
|
||||||
final SAMRecordState state = iterator.next();
|
final SAMRecordState state = iterator.next();
|
||||||
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
|
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
|
||||||
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
|
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
|
||||||
final int readOffset = state.getReadOffset(); // the base offset on this read
|
final int readOffset = state.getReadOffset(); // the base offset on this read
|
||||||
final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began.
|
final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began.
|
||||||
final int eventLength = state.getEventLength();
|
final int eventLength = state.getEventLength();
|
||||||
|
|
||||||
if (op == CigarOperator.N) // N's are never added to any pileup
|
if (op == CigarOperator.N) // N's are never added to any pileup
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref
|
if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref
|
||||||
size++;
|
size++;
|
||||||
ExtendedEventPileupElement pileupElement;
|
ExtendedEventPileupElement pileupElement;
|
||||||
if (state.getEventBases() == null) { // Deletion event
|
if (state.getEventBases() == null) { // Deletion event
|
||||||
nDeletions++;
|
nDeletions++;
|
||||||
maxDeletionLength = Math.max(maxDeletionLength, state.getEventLength());
|
maxDeletionLength = Math.max(maxDeletionLength, state.getEventLength());
|
||||||
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength);
|
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength);
|
||||||
}
|
}
|
||||||
else { // Insertion event
|
else { // Insertion event
|
||||||
nInsertions++;
|
nInsertions++;
|
||||||
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength, state.getEventBases());
|
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength, state.getEventBases());
|
||||||
}
|
}
|
||||||
|
|
@ -442,10 +447,10 @@ public class LocusIteratorByState extends LocusIterator {
|
||||||
if (indelPile.size() != 0)
|
if (indelPile.size() != 0)
|
||||||
fullExtendedEventPileup.put(sample, new ReadBackedExtendedEventPileupImpl(loc, indelPile, size, maxDeletionLength, nInsertions, nDeletions, nMQ0Reads));
|
fullExtendedEventPileup.put(sample, new ReadBackedExtendedEventPileupImpl(loc, indelPile, size, maxDeletionLength, nInsertions, nDeletions, nMQ0Reads));
|
||||||
}
|
}
|
||||||
hasExtendedEvents = false; // we are done with extended events prior to current ref base
|
hasExtendedEvents = false; // we are done with extended events prior to current ref base
|
||||||
nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled);
|
nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled);
|
||||||
}
|
}
|
||||||
else { // this is a regular event pileup (not extended)
|
else { // this is a regular event pileup (not extended)
|
||||||
GenomeLoc location = getLocation();
|
GenomeLoc location = getLocation();
|
||||||
Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
|
Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
|
||||||
boolean hasBeenSampled = false;
|
boolean hasBeenSampled = false;
|
||||||
|
|
@ -454,27 +459,34 @@ public class LocusIteratorByState extends LocusIterator {
|
||||||
List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
|
List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
|
||||||
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
|
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
|
||||||
|
|
||||||
size = 0; // number of elements in this sample's pileup
|
size = 0; // number of elements in this sample's pileup
|
||||||
nDeletions = 0; // number of deletions in this sample's pileup
|
nDeletions = 0; // number of deletions in this sample's pileup
|
||||||
nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
|
nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
|
||||||
|
|
||||||
while (iterator.hasNext()) {
|
while (iterator.hasNext()) {
|
||||||
final SAMRecordState state = iterator.next(); // state object with the read/offset information
|
final SAMRecordState state = iterator.next(); // state object with the read/offset information
|
||||||
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
|
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
|
||||||
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
|
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
|
||||||
final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
|
final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
|
||||||
final CigarOperator nextOp = nextElement.getOperator();
|
final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element
|
||||||
final int readOffset = state.getReadOffset(); // the base offset on this read
|
final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator
|
||||||
|
final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator
|
||||||
|
final int readOffset = state.getReadOffset(); // the base offset on this read
|
||||||
|
|
||||||
|
final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION;
|
||||||
|
final boolean isAfterDeletion = lastOp == CigarOperator.DELETION;
|
||||||
|
final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION;
|
||||||
|
final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION;
|
||||||
|
final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart());
|
||||||
|
|
||||||
int nextElementLength = nextElement.getLength();
|
int nextElementLength = nextElement.getLength();
|
||||||
|
|
||||||
if (op == CigarOperator.N) // N's are never added to any pileup
|
if (op == CigarOperator.N) // N's are never added to any pileup
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (op == CigarOperator.D) {
|
if (op == CigarOperator.D) {
|
||||||
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
|
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
|
||||||
pile.add(new PileupElement(read, readOffset, true, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()),
|
pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
|
||||||
null,nextOp == CigarOperator.D? nextElementLength:-1));
|
|
||||||
size++;
|
size++;
|
||||||
nDeletions++;
|
nDeletions++;
|
||||||
if (read.getMappingQuality() == 0)
|
if (read.getMappingQuality() == 0)
|
||||||
|
|
@ -484,11 +496,10 @@ public class LocusIteratorByState extends LocusIterator {
|
||||||
else {
|
else {
|
||||||
if (!filterBaseInRead(read, location.getStart())) {
|
if (!filterBaseInRead(read, location.getStart())) {
|
||||||
String insertedBaseString = null;
|
String insertedBaseString = null;
|
||||||
if (nextOp == CigarOperator.I) {
|
if (nextOp == CigarOperator.I)
|
||||||
insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength()));
|
insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength()));
|
||||||
}
|
|
||||||
pile.add(new PileupElement(read, readOffset, false, nextOp == CigarOperator.D, nextOp == CigarOperator.I, nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()),
|
pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength));
|
||||||
insertedBaseString,nextElementLength));
|
|
||||||
size++;
|
size++;
|
||||||
if (read.getMappingQuality() == 0)
|
if (read.getMappingQuality() == 0)
|
||||||
nMQ0Reads++;
|
nMQ0Reads++;
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
package org.broadinstitute.sting.gatk.iterators;
|
package org.broadinstitute.sting.gatk.iterators;
|
||||||
|
|
||||||
import net.sf.samtools.SAMFormatException;
|
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
import net.sf.samtools.util.CloseableIterator;
|
import net.sf.samtools.util.CloseableIterator;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
|
@ -23,7 +22,7 @@ public class MalformedBAMErrorReformatingIterator implements CloseableIterator<S
|
||||||
public boolean hasNext() {
|
public boolean hasNext() {
|
||||||
try {
|
try {
|
||||||
return this.it.hasNext();
|
return this.it.hasNext();
|
||||||
} catch ( SAMFormatException e ) {
|
} catch ( RuntimeException e ) { // we need to catch RuntimeExceptions here because the Picard code is throwing them (among SAMFormatExceptions) sometimes
|
||||||
throw new UserException.MalformedBAM(source, e.getMessage());
|
throw new UserException.MalformedBAM(source, e.getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -31,7 +30,7 @@ public class MalformedBAMErrorReformatingIterator implements CloseableIterator<S
|
||||||
public SAMRecord next() {
|
public SAMRecord next() {
|
||||||
try {
|
try {
|
||||||
return it.next();
|
return it.next();
|
||||||
} catch ( SAMFormatException e ) {
|
} catch ( RuntimeException e ) { // we need to catch RuntimeExceptions here because the Picard code is throwing them (among SAMFormatExceptions) sometimes
|
||||||
throw new UserException.MalformedBAM(source, e.getMessage());
|
throw new UserException.MalformedBAM(source, e.getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -418,7 +418,7 @@ public class RefMetaDataTracker {
|
||||||
* with the current site as a RODRecordList List object. If no data track with specified name is available,
|
* with the current site as a RODRecordList List object. If no data track with specified name is available,
|
||||||
* returns defaultValue wrapped as RODRecordList object. NOTE: if defaultValue is null, it will be wrapped up
|
* returns defaultValue wrapped as RODRecordList object. NOTE: if defaultValue is null, it will be wrapped up
|
||||||
* with track name set to 'name' and location set to null; otherwise the wrapper object will have name and
|
* with track name set to 'name' and location set to null; otherwise the wrapper object will have name and
|
||||||
* location set to defaultValue.getName() and defaultValue.getLocation(), respectively (use caution,
|
* location set to defaultValue.getID() and defaultValue.getLocation(), respectively (use caution,
|
||||||
* defaultValue.getLocation() may be not equal to what RODRecordList's location would be expected to be otherwise:
|
* defaultValue.getLocation() may be not equal to what RODRecordList's location would be expected to be otherwise:
|
||||||
* for instance, on locus traversal, location is usually expected to be a single base we are currently looking at,
|
* for instance, on locus traversal, location is usually expected to be a single base we are currently looking at,
|
||||||
* regardless of the presence of "extended" RODs overlapping with that location).
|
* regardless of the presence of "extended" RODs overlapping with that location).
|
||||||
|
|
|
||||||
|
|
@ -132,7 +132,7 @@ public class FeatureManager {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return the FeatureDescriptor with getName().equals(name)
|
* Return the FeatureDescriptor with getID().equals(name)
|
||||||
*
|
*
|
||||||
* @param name
|
* @param name
|
||||||
* @return A FeatureDescriptor or null if none is found
|
* @return A FeatureDescriptor or null if none is found
|
||||||
|
|
|
||||||
|
|
@ -41,10 +41,10 @@ import java.util.TreeMap;
|
||||||
public class GATKReport {
|
public class GATKReport {
|
||||||
public static final String GATKREPORT_HEADER_PREFIX = "#:GATKReport.";
|
public static final String GATKREPORT_HEADER_PREFIX = "#:GATKReport.";
|
||||||
public static final GATKReportVersion LATEST_REPORT_VERSION = GATKReportVersion.V1_0;
|
public static final GATKReportVersion LATEST_REPORT_VERSION = GATKReportVersion.V1_0;
|
||||||
public static final String SEPARATOR = ":";
|
private static final String SEPARATOR = ":";
|
||||||
private GATKReportVersion version = LATEST_REPORT_VERSION;
|
private GATKReportVersion version = LATEST_REPORT_VERSION;
|
||||||
|
|
||||||
private TreeMap<String, GATKReportTable> tables = new TreeMap<String, GATKReportTable>();
|
private final TreeMap<String, GATKReportTable> tables = new TreeMap<String, GATKReportTable>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new, empty GATKReport.
|
* Create a new, empty GATKReport.
|
||||||
|
|
@ -70,6 +70,15 @@ public class GATKReport {
|
||||||
loadReport(file);
|
loadReport(file);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new GATK report from GATK report tables
|
||||||
|
* @param tables Any number of tables that you want to add to the report
|
||||||
|
*/
|
||||||
|
public GATKReport(GATKReportTable... tables) {
|
||||||
|
for( GATKReportTable table: tables)
|
||||||
|
addTable(table);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Load a GATKReport file from disk
|
* Load a GATKReport file from disk
|
||||||
*
|
*
|
||||||
|
|
@ -202,10 +211,6 @@ public class GATKReport {
|
||||||
return version;
|
return version;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setVersion(GATKReportVersion version) {
|
|
||||||
this.version = version;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns whether or not the two reports have the same format, from columns, to tables, to reports, and everything
|
* Returns whether or not the two reports have the same format, from columns, to tables, to reports, and everything
|
||||||
* in between. This does not check if the data inside is the same. This is the check to see if the two reports are
|
* in between. This does not check if the data inside is the same. This is the check to see if the two reports are
|
||||||
|
|
|
||||||
|
|
@ -199,7 +199,7 @@ public class GATKReportColumn extends TreeMap<Object, Object> {
|
||||||
defaultValue.equals(that.defaultValue) );
|
defaultValue.equals(that.defaultValue) );
|
||||||
}
|
}
|
||||||
|
|
||||||
protected boolean equals(GATKReportColumn that) {
|
boolean equals(GATKReportColumn that) {
|
||||||
if ( !this.keySet().equals(that.keySet()) ) {
|
if ( !this.keySet().equals(that.keySet()) ) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2011, The Broad Institute
|
* Copyright (c) 2012, The Broad Institute
|
||||||
*
|
*
|
||||||
* Permission is hereby granted, free of charge, to any person
|
* Permission is hereby granted, free of charge, to any person
|
||||||
* obtaining a copy of this software and associated documentation
|
* obtaining a copy of this software and associated documentation
|
||||||
|
|
@ -29,8 +29,8 @@ package org.broadinstitute.sting.gatk.report;
|
||||||
*/
|
*/
|
||||||
public class GATKReportColumnFormat {
|
public class GATKReportColumnFormat {
|
||||||
public static enum Alignment { LEFT, RIGHT }
|
public static enum Alignment { LEFT, RIGHT }
|
||||||
public int width;
|
private final int width;
|
||||||
public Alignment alignment;
|
private final Alignment alignment;
|
||||||
|
|
||||||
public GATKReportColumnFormat(int width, Alignment alignment) {
|
public GATKReportColumnFormat(int width, Alignment alignment) {
|
||||||
this.width = width;
|
this.width = width;
|
||||||
|
|
|
||||||
|
|
@ -24,13 +24,15 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.report;
|
package org.broadinstitute.sting.gatk.report;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tracks a linked list of GATKReportColumn in order by name.
|
* Tracks a linked list of GATKReportColumn in order by name.
|
||||||
*/
|
*/
|
||||||
public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> implements Iterable<GATKReportColumn> {
|
public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> implements Iterable<GATKReportColumn> {
|
||||||
private List<String> columnNames = new ArrayList<String>();
|
private final List<String> columnNames = new ArrayList<String>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the column by index
|
* Returns the column by index
|
||||||
|
|
@ -43,9 +45,12 @@ public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> i
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public GATKReportColumn remove(Object key) {
|
public GATKReportColumn remove(Object columnName) {
|
||||||
columnNames.remove(key);
|
if ( !(columnName instanceof String) ) {
|
||||||
return super.remove(key);
|
throw new ReviewedStingException("The column name must be a String!");
|
||||||
|
}
|
||||||
|
columnNames.remove(columnName.toString());
|
||||||
|
return super.remove(columnName);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
@ -85,7 +90,7 @@ public class GATKReportColumns extends LinkedHashMap<String, GATKReportColumn> i
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected boolean equals(GATKReportColumns that) {
|
boolean equals(GATKReportColumns that) {
|
||||||
for (Map.Entry<String, GATKReportColumn> pair : entrySet()) {
|
for (Map.Entry<String, GATKReportColumn> pair : entrySet()) {
|
||||||
// Make sure that every column is the same, we know that the # of columns
|
// Make sure that every column is the same, we know that the # of columns
|
||||||
// is the same from isSameFormat()
|
// is the same from isSameFormat()
|
||||||
|
|
|
||||||
|
|
@ -67,7 +67,7 @@ public enum GATKReportDataType {
|
||||||
*/
|
*/
|
||||||
String("%[Ss]");
|
String("%[Ss]");
|
||||||
|
|
||||||
public final String dataTypeString;
|
private final String dataTypeString;
|
||||||
|
|
||||||
private GATKReportDataType(String dataTypeString) {
|
private GATKReportDataType(String dataTypeString) {
|
||||||
this.dataTypeString = dataTypeString;
|
this.dataTypeString = dataTypeString;
|
||||||
|
|
@ -189,7 +189,7 @@ public enum GATKReportDataType {
|
||||||
* @param obj The input string
|
* @param obj The input string
|
||||||
* @return an object that matches the data type.
|
* @return an object that matches the data type.
|
||||||
*/
|
*/
|
||||||
protected Object Parse(Object obj) {
|
Object Parse(Object obj) {
|
||||||
if (obj instanceof String) {
|
if (obj instanceof String) {
|
||||||
String str = obj.toString();
|
String str = obj.toString();
|
||||||
switch (this) {
|
switch (this) {
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,27 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.report;
|
package org.broadinstitute.sting.gatk.report;
|
||||||
|
|
||||||
import org.broadinstitute.sting.commandline.Gatherer;
|
import org.broadinstitute.sting.commandline.Gatherer;
|
||||||
|
|
@ -8,13 +32,6 @@ import java.io.FileNotFoundException;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
|
||||||
* Created by IntelliJ IDEA.
|
|
||||||
* User: roger
|
|
||||||
* Date: 1/9/12
|
|
||||||
* Time: 11:17 PM
|
|
||||||
* To change this template use File | Settings | File Templates.
|
|
||||||
*/
|
|
||||||
public class GATKReportGatherer extends Gatherer {
|
public class GATKReportGatherer extends Gatherer {
|
||||||
@Override
|
@Override
|
||||||
public void gather(List<File> inputs, File output) {
|
public void gather(List<File> inputs, File output) {
|
||||||
|
|
|
||||||
|
|
@ -34,97 +34,14 @@ import java.util.*;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/**
|
|
||||||
* A data structure that allows data to be collected over the course of a walker's computation, then have that data
|
|
||||||
* written to a PrintStream such that it's human-readable, AWK-able, and R-friendly (given that you load it using the
|
|
||||||
* GATKReport loader module).
|
|
||||||
* <p/>
|
|
||||||
* The goal of this object is to use the same data structure for both accumulating data during a walker's computation
|
|
||||||
* and emitting that data to a file for easy analysis in R (or any other program/language that can take in a table of
|
|
||||||
* results). Thus, all of the infrastructure below is designed simply to make printing the following as easy as
|
|
||||||
* possible:
|
|
||||||
* <p/>
|
|
||||||
* ##:GATKReport.v0.1 ErrorRatePerCycle : The error rate per sequenced position in the reads
|
|
||||||
* cycle errorrate.61PA8.7 qualavg.61PA8.7
|
|
||||||
* 0 0.007451835696110506 25.474613284804366
|
|
||||||
* 1 0.002362777171937477 29.844949954504095
|
|
||||||
* 2 9.087604507451836E-4 32.87590975254731
|
|
||||||
* 3 5.452562704471102E-4 34.498999090081895
|
|
||||||
* 4 9.087604507451836E-4 35.14831665150137
|
|
||||||
* 5 5.452562704471102E-4 36.07223435225619
|
|
||||||
* 6 5.452562704471102E-4 36.1217248908297
|
|
||||||
* 7 5.452562704471102E-4 36.1910480349345
|
|
||||||
* 8 5.452562704471102E-4 36.00345705967977
|
|
||||||
* <p/>
|
|
||||||
* Here, we have a GATKReport table - a well-formatted, easy to read representation of some tabular data. Every single
|
|
||||||
* table has this same GATKReport.v0.1 header, which permits multiple files from different sources to be cat-ed
|
|
||||||
* together, which makes it very easy to pull tables from different programs into R via a single file.
|
|
||||||
* <p/>
|
|
||||||
* ------------
|
|
||||||
* Definitions:
|
|
||||||
* <p/>
|
|
||||||
* Table info:
|
|
||||||
* The first line, structured as
|
|
||||||
* ##:<report version> <table name> : <table description>
|
|
||||||
* <p/>
|
|
||||||
* Table header:
|
|
||||||
* The second line, specifying a unique name for each column in the table.
|
|
||||||
* <p/>
|
|
||||||
* The first column mentioned in the table header is the "primary key" column - a column that provides the unique
|
|
||||||
* identifier for each row in the table. Once this column is created, any element in the table can be referenced by
|
|
||||||
* the row-column coordinate, i.e. "primary key"-"column name" coordinate.
|
|
||||||
* <p/>
|
|
||||||
* When a column is added to a table, a default value must be specified (usually 0). This is the initial value for
|
|
||||||
* an element in a column. This permits operations like increment() and decrement() to work properly on columns that
|
|
||||||
* are effectively counters for a particular event.
|
|
||||||
* <p/>
|
|
||||||
* Finally, the display property for each column can be set during column creation. This is useful when a given
|
|
||||||
* column stores an intermediate result that will be used later on, perhaps to calculate the value of another column.
|
|
||||||
* In these cases, it's obviously necessary to store the value required for further computation, but it's not
|
|
||||||
* necessary to actually print the intermediate column.
|
|
||||||
* <p/>
|
|
||||||
* Table body:
|
|
||||||
* The values of the table itself.
|
|
||||||
* <p/>
|
|
||||||
* ---------------
|
|
||||||
* Implementation:
|
|
||||||
* <p/>
|
|
||||||
* The implementation of this table has two components:
|
|
||||||
* 1. A TreeSet<Object> that stores all the values ever specified for the primary key. Any get() operation that
|
|
||||||
* refers to an element where the primary key object does not exist will result in its implicit creation. I
|
|
||||||
* haven't yet decided if this is a good idea...
|
|
||||||
* <p/>
|
|
||||||
* 2. A HashMap<String, GATKReportColumn> that stores a mapping from column name to column contents. Each
|
|
||||||
* GATKReportColumn is effectively a map (in fact, GATKReportColumn extends TreeMap<Object, Object>) between
|
|
||||||
* primary key and the column value. This means that, given N columns, the primary key information is stored
|
|
||||||
* N+1 times. This is obviously wasteful and can likely be handled much more elegantly in future implementations.
|
|
||||||
* <p/>
|
|
||||||
* ------------------------------
|
|
||||||
* Element and column operations:
|
|
||||||
* <p/>
|
|
||||||
* In addition to simply getting and setting values, this object also permits some simple operations to be applied to
|
|
||||||
* individual elements or to whole columns. For instance, an element can be easily incremented without the hassle of
|
|
||||||
* calling get(), incrementing the obtained value by 1, and then calling set() with the new value. Also, some vector
|
|
||||||
* operations are supported. For instance, two whole columns can be divided and have the result be set to a third
|
|
||||||
* column. This is especially useful when aggregating counts in two intermediate columns that will eventually need to
|
|
||||||
* be manipulated row-by-row to compute the final column.
|
|
||||||
* <p/>
|
|
||||||
* Note: I've made no attempt whatsoever to make these operations efficient. Right now, some of the methods check the
|
|
||||||
* type of the stored object using an instanceof call and attempt to do the right thing. Others cast the contents of
|
|
||||||
* the cell to a Number, call the Number.toDouble() method and compute a result. This is clearly not the ideal design,
|
|
||||||
* but at least the prototype contained herein works.
|
|
||||||
*
|
|
||||||
* @author Kiran Garimella
|
|
||||||
* @author Khalid Shakir
|
|
||||||
*/
|
|
||||||
public class GATKReportTable {
|
public class GATKReportTable {
|
||||||
/**
|
/**
|
||||||
* REGEX that matches any table with an invalid name
|
* REGEX that matches any table with an invalid name
|
||||||
*/
|
*/
|
||||||
public static final String INVALID_TABLE_NAME_REGEX = "[^a-zA-Z0-9_\\-\\.]";
|
public static final String INVALID_TABLE_NAME_REGEX = "[^a-zA-Z0-9_\\-\\.]";
|
||||||
public static final String GATKTABLE_HEADER_PREFIX = "#:GATKTable";
|
private static final String GATKTABLE_HEADER_PREFIX = "#:GATKTable";
|
||||||
public static final String SEPARATOR = ":";
|
private static final String SEPARATOR = ":";
|
||||||
public static final String ENDLINE = ":;";
|
private static final String ENDLINE = ":;";
|
||||||
|
|
||||||
private String tableName;
|
private String tableName;
|
||||||
private String tableDescription;
|
private String tableDescription;
|
||||||
|
|
@ -418,8 +335,8 @@ public class GATKReportTable {
|
||||||
* output file), and the format string used to display the data.
|
* output file), and the format string used to display the data.
|
||||||
*
|
*
|
||||||
* @param columnName the name of the column
|
* @param columnName the name of the column
|
||||||
* @param defaultValue if true - the column will be displayed; if false - the column will be hidden
|
* @param defaultValue the default value of a blank cell
|
||||||
* @param display
|
* @param display if true - the column will be displayed; if false - the column will be hidden
|
||||||
* @param format the format string used to display data
|
* @param format the format string used to display data
|
||||||
*/
|
*/
|
||||||
public void addColumn(String columnName, Object defaultValue, boolean display, String format) {
|
public void addColumn(String columnName, Object defaultValue, boolean display, String format) {
|
||||||
|
|
@ -429,12 +346,6 @@ public class GATKReportTable {
|
||||||
columns.put(columnName, new GATKReportColumn(columnName, defaultValue, display, format));
|
columns.put(columnName, new GATKReportColumn(columnName, defaultValue, display, format));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public GATKReportVersion getVersion() {
|
|
||||||
return GATKReport.LATEST_REPORT_VERSION;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if the requested element exists, and if not, create it.
|
* Check if the requested element exists, and if not, create it.
|
||||||
*
|
*
|
||||||
|
|
@ -508,8 +419,7 @@ public class GATKReportTable {
|
||||||
value = newValue;
|
value = newValue;
|
||||||
|
|
||||||
if (column.getDataType().equals(GATKReportDataType.fromObject(value)) ||
|
if (column.getDataType().equals(GATKReportDataType.fromObject(value)) ||
|
||||||
column.getDataType().equals(GATKReportDataType.Unknown) ||
|
column.getDataType().equals(GATKReportDataType.Unknown) )
|
||||||
value == null)
|
|
||||||
columns.get(columnName).put(primaryKey, value);
|
columns.get(columnName).put(primaryKey, value);
|
||||||
else
|
else
|
||||||
throw new ReviewedStingException(String.format("Tried to add an object of type: %s to a column of type: %s",
|
throw new ReviewedStingException(String.format("Tried to add an object of type: %s to a column of type: %s",
|
||||||
|
|
@ -795,7 +705,7 @@ public class GATKReportTable {
|
||||||
*
|
*
|
||||||
* @return the width of the primary key column
|
* @return the width of the primary key column
|
||||||
*/
|
*/
|
||||||
public int getPrimaryKeyColumnWidth() {
|
int getPrimaryKeyColumnWidth() {
|
||||||
int maxWidth = getPrimaryKeyName().length();
|
int maxWidth = getPrimaryKeyName().length();
|
||||||
|
|
||||||
for (Object primaryKey : primaryKeyColumn) {
|
for (Object primaryKey : primaryKeyColumn) {
|
||||||
|
|
@ -814,7 +724,7 @@ public class GATKReportTable {
|
||||||
*
|
*
|
||||||
* @param out the PrintStream to which the table should be written
|
* @param out the PrintStream to which the table should be written
|
||||||
*/
|
*/
|
||||||
public void write(PrintStream out) {
|
void write(PrintStream out) {
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Table header:
|
* Table header:
|
||||||
|
|
@ -912,7 +822,7 @@ public class GATKReportTable {
|
||||||
*
|
*
|
||||||
* @param input Another GATK table
|
* @param input Another GATK table
|
||||||
*/
|
*/
|
||||||
protected void combineWith(GATKReportTable input) {
|
void combineWith(GATKReportTable input) {
|
||||||
/*
|
/*
|
||||||
* This function is different from addRowsFrom because we will add the ability to sum,average, etc rows
|
* This function is different from addRowsFrom because we will add the ability to sum,average, etc rows
|
||||||
* TODO: Add other combining algorithms
|
* TODO: Add other combining algorithms
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2011, The Broad Institute
|
* Copyright (c) 2012, The Broad Institute
|
||||||
*
|
*
|
||||||
* Permission is hereby granted, free of charge, to any person
|
* Permission is hereby granted, free of charge, to any person
|
||||||
* obtaining a copy of this software and associated documentation
|
* obtaining a copy of this software and associated documentation
|
||||||
|
|
@ -50,7 +50,7 @@ public enum GATKReportVersion {
|
||||||
*/
|
*/
|
||||||
V1_0("v1.0");
|
V1_0("v1.0");
|
||||||
|
|
||||||
public final String versionString;
|
private final String versionString;
|
||||||
|
|
||||||
private GATKReportVersion(String versionString) {
|
private GATKReportVersion(String versionString) {
|
||||||
this.versionString = versionString;
|
this.versionString = versionString;
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
package org.broadinstitute.sting.gatk.traversals;
|
package org.broadinstitute.sting.gatk.traversals;
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.gatk.WalkerManager;
|
import org.broadinstitute.sting.gatk.WalkerManager;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
|
@ -10,6 +11,7 @@ import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||||
|
import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
|
||||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
|
||||||
|
|
@ -42,38 +44,31 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
||||||
logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));
|
logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));
|
||||||
|
|
||||||
final LocusView locusView = getLocusView( walker, dataProvider );
|
final LocusView locusView = getLocusView( walker, dataProvider );
|
||||||
final GenomeLocSortedSet initialIntervals = engine.getIntervals(); // BUGBUG: unfortunate inefficiency that needs to be removed
|
final GenomeLocSortedSet initialIntervals = engine.getIntervals();
|
||||||
|
|
||||||
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
|
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
|
||||||
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
|
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
|
||||||
|
|
||||||
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
|
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
|
||||||
|
|
||||||
int minStart = Integer.MAX_VALUE;
|
int minStart = Integer.MAX_VALUE;
|
||||||
final ArrayList<Double> isActiveList = new ArrayList<Double>();
|
ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
|
||||||
GenomeLoc firstIsActiveStart = null;
|
|
||||||
|
|
||||||
//ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
|
ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
|
||||||
ReferenceOrderedView referenceOrderedDataView = null;
|
|
||||||
if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
|
|
||||||
referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider );
|
|
||||||
else
|
|
||||||
referenceOrderedDataView = (RodLocusView)locusView;
|
|
||||||
|
|
||||||
// We keep processing while the next reference location is within the interval
|
// We keep processing while the next reference location is within the interval
|
||||||
GenomeLoc prevLoc = null;
|
GenomeLoc prevLoc = null;
|
||||||
while( locusView.hasNext() ) {
|
while( locusView.hasNext() ) {
|
||||||
final AlignmentContext locus = locusView.next();
|
final AlignmentContext locus = locusView.next();
|
||||||
GenomeLoc location = locus.getLocation();
|
GenomeLoc location = locus.getLocation();
|
||||||
|
|
||||||
if(prevLoc != null) {
|
if(prevLoc != null) {
|
||||||
for(int iii = prevLoc.getStart() + 1; iii < location.getStart(); iii++ ) {
|
// fill in the active / inactive labels from the stop of the previous location to the start of this location
|
||||||
|
// TODO refactor to separate function
|
||||||
|
for(int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++ ) {
|
||||||
final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii);
|
final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii);
|
||||||
if( initialIntervals == null || initialIntervals.overlaps( fakeLoc ) ) {
|
if( initialIntervals == null || initialIntervals.overlaps( fakeLoc ) ) {
|
||||||
final double isActiveProb = ( walker.presetActiveRegions == null ? 0.0 : ( walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 ) );
|
final double isActiveProb = ( walker.hasPresetActiveRegions() && walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 );
|
||||||
isActiveList.add( isActiveProb );
|
profile.add(fakeLoc, isActiveProb);
|
||||||
if( firstIsActiveStart == null ) {
|
|
||||||
firstIsActiveStart = fakeLoc;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -89,12 +84,8 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
||||||
|
|
||||||
// Call the walkers isActive function for this locus and add them to the list to be integrated later
|
// Call the walkers isActive function for this locus and add them to the list to be integrated later
|
||||||
if( initialIntervals == null || initialIntervals.overlaps( location ) ) {
|
if( initialIntervals == null || initialIntervals.overlaps( location ) ) {
|
||||||
final double isActiveProb = ( walker.presetActiveRegions == null ? walker.isActive( tracker, refContext, locus )
|
final double isActiveProb = walkerActiveProb(walker, tracker, refContext, locus, location);
|
||||||
: ( walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0 ) );
|
profile.add(location, isActiveProb);
|
||||||
isActiveList.add( isActiveProb );
|
|
||||||
if( firstIsActiveStart == null ) {
|
|
||||||
firstIsActiveStart = location;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Grab all the previously unseen reads from this pileup and add them to the massive read list
|
// Grab all the previously unseen reads from this pileup and add them to the massive read list
|
||||||
|
|
@ -103,52 +94,100 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
||||||
if( !myReads.contains(read) ) {
|
if( !myReads.contains(read) ) {
|
||||||
myReads.add(read);
|
myReads.add(read);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
|
||||||
|
// which active regions in the work queue are now safe to process
|
||||||
|
minStart = Math.min(minStart, read.getAlignmentStart());
|
||||||
}
|
}
|
||||||
|
|
||||||
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
|
|
||||||
// which active regions in the work queue are now safe to process
|
|
||||||
if( !locusView.hasNext() ) {
|
|
||||||
for( final PileupElement p : locus.getBasePileup() ) {
|
|
||||||
final GATKSAMRecord read = p.getRead();
|
|
||||||
if( !myReads.contains(read) ) {
|
|
||||||
myReads.add(read);
|
|
||||||
}
|
|
||||||
if( read.getAlignmentStart() < minStart ) { minStart = read.getAlignmentStart(); }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
prevLoc = location;
|
prevLoc = location;
|
||||||
|
|
||||||
printProgress(dataProvider.getShard(), locus.getLocation());
|
printProgress(dataProvider.getShard(), locus.getLocation());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Take the individual isActive calls and integrate them into contiguous active regions and
|
// Take the individual isActive calls and integrate them into contiguous active regions and
|
||||||
// add these blocks of work to the work queue
|
// add these blocks of work to the work queue
|
||||||
final ArrayList<ActiveRegion> activeRegions = integrateActiveList( isActiveList, firstIsActiveStart, activeRegionExtension, walker.presetActiveRegions != null );
|
// band-pass filter the list of isActive probabilities and turn into active regions
|
||||||
logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
final ActivityProfile bandPassFiltered = profile.bandPassFilter();
|
||||||
if( walker.activeRegionOutStream == null ) {
|
final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension );
|
||||||
workQueue.addAll( activeRegions );
|
|
||||||
} else { // Just want to output the active regions to a file, not actually process them
|
|
||||||
for( final ActiveRegion activeRegion : activeRegions ) {
|
|
||||||
if( activeRegion.isActive ) {
|
|
||||||
walker.activeRegionOutStream.println( activeRegion.getLocation() );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
|
// add active regions to queue of regions to process
|
||||||
while( workQueue.peek() != null && (workQueue.peek().getExtendedLoc().getStop() < minStart || !workQueue.peek().getExtendedLoc().getContig().equals(dataProvider.getLocus().getContig())) ) {
|
workQueue.addAll( activeRegions );
|
||||||
final ActiveRegion activeRegion = workQueue.remove();
|
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
||||||
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
|
|
||||||
}
|
// now go and process all of the active regions
|
||||||
|
sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
|
||||||
}
|
}
|
||||||
|
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Special function called in LinearMicroScheduler to empty out the work queue. Ugly for now but will be cleaned up when we push this functionality more into the engine
|
|
||||||
public T endTraversal( final Walker<M,T> walker, T sum) {
|
// --------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// simple utility functions
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
private final double walkerActiveProb(final ActiveRegionWalker<M,T> walker,
|
||||||
|
final RefMetaDataTracker tracker, final ReferenceContext refContext,
|
||||||
|
final AlignmentContext locus, final GenomeLoc location) {
|
||||||
|
if ( walker.hasPresetActiveRegions() ) {
|
||||||
|
return walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0;
|
||||||
|
} else {
|
||||||
|
return walker.isActive( tracker, refContext, locus );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private ReferenceOrderedView getReferenceOrderedView( final ActiveRegionWalker<M,T> walker,
|
||||||
|
final LocusShardDataProvider dataProvider,
|
||||||
|
final LocusView locusView) {
|
||||||
|
if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
|
||||||
|
return new ManagingReferenceOrderedView( dataProvider );
|
||||||
|
else
|
||||||
|
return (RodLocusView)locusView;
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// code to handle processing active regions
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
private T processActiveRegions( final ActiveRegionWalker<M,T> walker, T sum, final int minStart, final String currentContig ) {
|
||||||
|
if( walker.activeRegionOutStream != null ) {
|
||||||
|
writeActiveRegionsToStream(walker);
|
||||||
|
return sum;
|
||||||
|
} else {
|
||||||
|
return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write out each active region to the walker activeRegionOutStream
|
||||||
|
*
|
||||||
|
* @param walker
|
||||||
|
*/
|
||||||
|
private void writeActiveRegionsToStream( final ActiveRegionWalker<M,T> walker ) {
|
||||||
|
// Just want to output the active regions to a file, not actually process them
|
||||||
|
for( final ActiveRegion activeRegion : workQueue ) {
|
||||||
|
if( activeRegion.isActive ) {
|
||||||
|
walker.activeRegionOutStream.println( activeRegion.getLocation() );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private T callWalkerMapOnActiveRegions( final ActiveRegionWalker<M,T> walker, T sum, final int minStart, final String currentContig ) {
|
||||||
|
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
|
||||||
|
// TODO can implement parallel traversal here
|
||||||
while( workQueue.peek() != null ) {
|
while( workQueue.peek() != null ) {
|
||||||
final ActiveRegion activeRegion = workQueue.remove();
|
final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
|
||||||
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, (ActiveRegionWalker<M,T>) walker );
|
if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) {
|
||||||
|
final ActiveRegion activeRegion = workQueue.remove();
|
||||||
|
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return sum;
|
return sum;
|
||||||
|
|
@ -193,6 +232,12 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
||||||
return walker.reduce( x, sum );
|
return walker.reduce( x, sum );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// engine interaction code
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the best view of loci for this walker given the available data.
|
* Gets the best view of loci for this walker given the available data.
|
||||||
* @param walker walker to interrogate.
|
* @param walker walker to interrogate.
|
||||||
|
|
@ -211,48 +256,11 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
||||||
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
|
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
|
||||||
}
|
}
|
||||||
|
|
||||||
// band-pass filter the list of isActive probabilities and turn into active regions
|
/**
|
||||||
private ArrayList<ActiveRegion> integrateActiveList( final ArrayList<Double> activeList, final GenomeLoc firstIsActiveStart, final int activeRegionExtension, final boolean presetRegions ) {
|
* Special function called in LinearMicroScheduler to empty out the work queue.
|
||||||
|
* Ugly for now but will be cleaned up when we push this functionality more into the engine
|
||||||
final double ACTIVE_PROB_THRESHOLD = 0.2; // BUGBUG: needs to be set-able by the walker author
|
*/
|
||||||
final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
|
public T endTraversal( final Walker<M,T> walker, T sum) {
|
||||||
if( activeList.size() == 0 ) {
|
return processActiveRegions((ActiveRegionWalker<M,T>)walker, sum, Integer.MAX_VALUE, null);
|
||||||
return returnList;
|
|
||||||
} else if( activeList.size() == 1 ) {
|
|
||||||
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart(), firstIsActiveStart.getStart()),
|
|
||||||
activeList.get(0) > ACTIVE_PROB_THRESHOLD, engine.getGenomeLocParser(), activeRegionExtension ) );
|
|
||||||
return returnList;
|
|
||||||
} else {
|
|
||||||
final Double[] activeProbArray = activeList.toArray(new Double[activeList.size()]);
|
|
||||||
final double[] filteredProbArray = new double[activeProbArray.length];
|
|
||||||
final int FILTER_SIZE = ( presetRegions ? 0 : 50 ); // BUGBUG: needs to be set-able by the walker author
|
|
||||||
final int MAX_ACTIVE_REGION = ( presetRegions ? 16001 : 425 ); // BUGBUG: needs to be set-able by the walker author
|
|
||||||
for( int iii = 0; iii < activeProbArray.length; iii++ ) {
|
|
||||||
double maxVal = 0;
|
|
||||||
for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(activeList.size(), iii+FILTER_SIZE+1); jjj++ ) {
|
|
||||||
if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; }
|
|
||||||
}
|
|
||||||
filteredProbArray[iii] = maxVal;
|
|
||||||
}
|
|
||||||
|
|
||||||
boolean curStatus = filteredProbArray[0] > ACTIVE_PROB_THRESHOLD;
|
|
||||||
int curStart = 0;
|
|
||||||
for(int iii = 1; iii < filteredProbArray.length; iii++ ) {
|
|
||||||
final boolean thisStatus = filteredProbArray[iii] > ACTIVE_PROB_THRESHOLD;
|
|
||||||
if( curStatus != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) {
|
|
||||||
returnList.add( new ActiveRegion(
|
|
||||||
engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (iii-1)),
|
|
||||||
curStatus, engine.getGenomeLocParser(), activeRegionExtension ) );
|
|
||||||
curStatus = thisStatus;
|
|
||||||
curStart = iii;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if( curStart != filteredProbArray.length-1 ) {
|
|
||||||
returnList.add( new ActiveRegion(
|
|
||||||
engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (filteredProbArray.length-1)),
|
|
||||||
curStatus, engine.getGenomeLocParser(), activeRegionExtension ) );
|
|
||||||
}
|
|
||||||
return returnList;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -45,6 +45,10 @@ public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<Map
|
||||||
|
|
||||||
public GenomeLocSortedSet presetActiveRegions = null;
|
public GenomeLocSortedSet presetActiveRegions = null;
|
||||||
|
|
||||||
|
public boolean hasPresetActiveRegions() {
|
||||||
|
return presetActiveRegions != null;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
if( activeRegionBindings == null ) { return; }
|
if( activeRegionBindings == null ) { return; }
|
||||||
|
|
|
||||||
|
|
@ -42,7 +42,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
|
||||||
if (!vc.isBiallelic()) {
|
if (!vc.isBiallelic()) {
|
||||||
// for non-bliallelic case, do test with most common alt allele.
|
// for non-bliallelic case, do test with most common alt allele.
|
||||||
// Get then corresponding indeces in GL vectors to retrieve GL of AA,AB and BB.
|
// Get then corresponding indeces in GL vectors to retrieve GL of AA,AB and BB.
|
||||||
int[] idxVector = vc.getGLIndecesOfAllele(vc.getAltAlleleWithHighestAlleleCount());
|
int[] idxVector = vc.getGLIndecesOfAlternateAllele(vc.getAltAlleleWithHighestAlleleCount());
|
||||||
idxAA = idxVector[0];
|
idxAA = idxVector[0];
|
||||||
idxAB = idxVector[1];
|
idxAB = idxVector[1];
|
||||||
idxBB = idxVector[2];
|
idxBB = idxVector[2];
|
||||||
|
|
|
||||||
|
|
@ -31,8 +31,10 @@ public class LowMQ extends InfoFieldAnnotation {
|
||||||
double total = 0;
|
double total = 0;
|
||||||
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() )
|
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() )
|
||||||
{
|
{
|
||||||
ReadBackedPileup pileup = sample.getValue().getBasePileup();
|
if ( !sample.getValue().hasBasePileup() )
|
||||||
for (PileupElement p : pileup )
|
continue;
|
||||||
|
|
||||||
|
for ( PileupElement p : sample.getValue().getBasePileup() )
|
||||||
{
|
{
|
||||||
if ( p.getMappingQual() == 0 ) { mq0 += 1; }
|
if ( p.getMappingQual() == 0 ) { mq0 += 1; }
|
||||||
if ( p.getMappingQual() <= 10 ) { mq10 += 1; }
|
if ( p.getMappingQual() <= 10 ) { mq10 += 1; }
|
||||||
|
|
|
||||||
|
|
@ -240,7 +240,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
||||||
for ( VCFHeaderLine line : VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(expression.binding.getName())) ) {
|
for ( VCFHeaderLine line : VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(expression.binding.getName())) ) {
|
||||||
if ( line instanceof VCFInfoHeaderLine ) {
|
if ( line instanceof VCFInfoHeaderLine ) {
|
||||||
VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line;
|
VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line;
|
||||||
if ( infoline.getName().equals(expression.fieldName) ) {
|
if ( infoline.getID().equals(expression.fieldName) ) {
|
||||||
targetHeaderLine = infoline;
|
targetHeaderLine = infoline;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,124 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2011 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.commandline.Gatherer;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatumOptimized;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* User: carneiro
|
||||||
|
* Date: 3/29/11
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
public class BQSRGatherer extends Gatherer {
|
||||||
|
|
||||||
|
/////////////////////////////
|
||||||
|
// Private Member Variables
|
||||||
|
/////////////////////////////
|
||||||
|
private static final String EOF_MARKER = "EOF";
|
||||||
|
|
||||||
|
private HashMap<String, RecalDatumOptimized> dataMap = new HashMap<String, RecalDatumOptimized>();
|
||||||
|
|
||||||
|
|
||||||
|
private void addCSVData (String line) {
|
||||||
|
String[] covariates = line.split(",");
|
||||||
|
String key = "";
|
||||||
|
RecalDatumOptimized values;
|
||||||
|
|
||||||
|
for (int i = 0; i < covariates.length-3; i++)
|
||||||
|
key += covariates[i] + ",";
|
||||||
|
|
||||||
|
if (covariates.length < 3)
|
||||||
|
throw new ReviewedStingException("Line only has 1 covariate : " + line);
|
||||||
|
|
||||||
|
values = new RecalDatumOptimized(Long.parseLong(covariates[covariates.length - 3]), Long.parseLong(covariates[covariates.length - 2]));
|
||||||
|
|
||||||
|
RecalDatumOptimized currentValues = dataMap.get(key);
|
||||||
|
if (currentValues == null)
|
||||||
|
dataMap.put(key, values);
|
||||||
|
else
|
||||||
|
currentValues.increment(values);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void gather(List<File> inputs, File output) {
|
||||||
|
PrintStream o;
|
||||||
|
try {
|
||||||
|
o = new PrintStream(output);
|
||||||
|
} catch ( FileNotFoundException e) {
|
||||||
|
throw new UserException("File to be output by CountCovariates Gather function was not found");
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean sawEOF = false;
|
||||||
|
boolean printedHeader = false;
|
||||||
|
|
||||||
|
// Read input files
|
||||||
|
for ( File RECAL_FILE : inputs) {
|
||||||
|
try {
|
||||||
|
for ( String line : new XReadLines(RECAL_FILE) ) {
|
||||||
|
if ( EOF_MARKER.equals(line) ) {
|
||||||
|
sawEOF = true; // sanity check
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
else if(line.startsWith("#")) {
|
||||||
|
if (!printedHeader)
|
||||||
|
o.println(line);
|
||||||
|
}
|
||||||
|
|
||||||
|
else // Found a line of data
|
||||||
|
addCSVData(line); // Parse the line and add the data to the HashMap
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch ( FileNotFoundException e ) {
|
||||||
|
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( !sawEOF ) {
|
||||||
|
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted!";
|
||||||
|
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
|
||||||
|
}
|
||||||
|
printedHeader = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write output file from dataMap
|
||||||
|
for(Map.Entry<String, RecalDatumOptimized> entry : dataMap.entrySet())
|
||||||
|
o.println(entry.getKey() + entry.getValue().outputToCSV());
|
||||||
|
o.println("EOF");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,284 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.BitSetUtils;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.BitSet;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class provides all the functionality for the BitSet representation of the keys to the hash table of BQSR
|
||||||
|
*
|
||||||
|
* It also handles the event type "covariate" which is not exactly a covariate, but is added as a key to the hashmap. The Key Manager will
|
||||||
|
* add the event type as a bitset to the end of the covariate bitset key. This way, it won't get int the way of masking the information
|
||||||
|
* out of the key for the actual covariates, and having the covariates handle it. The key manager handles the event type.
|
||||||
|
*
|
||||||
|
* The keys represented by this key manager will always have the same order:
|
||||||
|
*
|
||||||
|
* RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariate1, OptionalCovariateID, EventType
|
||||||
|
* RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariate2, OptionalCovariateID, EventType
|
||||||
|
* ...
|
||||||
|
* RequiredCovariate1, RequiredCovariate2, ..., RequiredCovariateN, OptionalCovariateN, OptionalCovariateID, EventType
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Note that Optional Covariates are optional, and the Key Manager should operate without them if necessary.
|
||||||
|
*
|
||||||
|
* @author Mauricio Carneiro
|
||||||
|
* @since 3/6/12
|
||||||
|
*/
|
||||||
|
public class BQSRKeyManager {
|
||||||
|
private List<RequiredCovariateInfo> requiredCovariates;
|
||||||
|
private List<OptionalCovariateInfo> optionalCovariates;
|
||||||
|
|
||||||
|
private int nRequiredBits; // Number of bits used to represent the required covariates
|
||||||
|
private int nOptionalBits; // Number of bits used to represent the standard covaraites
|
||||||
|
private int nOptionalIDBits; // Number of bits used to represent the optional covariates IDs
|
||||||
|
private int totalNumberOfBits; // Sum of all of the above plus the event bits
|
||||||
|
|
||||||
|
private BitSet optionalCovariateMask; // Standard mask for optional covariates bitset
|
||||||
|
private BitSet optionalCovariateIDMask; // Standard mask for optional covariates order bitset
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initializes the KeyManager with the total number of covariates to use
|
||||||
|
*
|
||||||
|
* @param requiredCovariates the ordered list of required covariates
|
||||||
|
* @param optionalCovariates the ordered list of optional covariates
|
||||||
|
*/
|
||||||
|
public BQSRKeyManager(List<Covariate> requiredCovariates, List<Covariate> optionalCovariates) {
|
||||||
|
this.requiredCovariates = new ArrayList<RequiredCovariateInfo>(requiredCovariates.size()); // initialize the required covariates list
|
||||||
|
this.optionalCovariates = new ArrayList<OptionalCovariateInfo>(optionalCovariates.size()); // initialize the optional covariates list (size may be 0, it's okay)
|
||||||
|
|
||||||
|
nRequiredBits = 0;
|
||||||
|
for (Covariate required : requiredCovariates) { // create a list of required covariates with the extra information for key management
|
||||||
|
int nBits = required.numberOfBits(); // number of bits used by this covariate
|
||||||
|
BitSet mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate
|
||||||
|
this.requiredCovariates.add(new RequiredCovariateInfo(nRequiredBits, nBits, mask, required)); // Create an object for this required covariate
|
||||||
|
nRequiredBits += nBits;
|
||||||
|
}
|
||||||
|
|
||||||
|
short i = 0;
|
||||||
|
nOptionalBits = 0;
|
||||||
|
for (Covariate optional : optionalCovariates) {
|
||||||
|
int nBits = optional.numberOfBits(); // number of bits used by this covariate
|
||||||
|
nOptionalBits = Math.max(nOptionalBits, nBits); // optional covariates are represented by the number of bits needed by biggest covariate
|
||||||
|
BitSet optionalID = BitSetUtils.bitSetFrom(i); // calculate the optional covariate ID for this covariate
|
||||||
|
this.optionalCovariates.add(new OptionalCovariateInfo(optionalID, optional)); // optional covariates have standardized mask and number of bits, so no need to store in the RequiredCovariateInfo object
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
|
nOptionalIDBits = BitSetUtils.numberOfBitsToRepresent(optionalCovariates.size()); // number of bits used to represent the covariate ID
|
||||||
|
optionalCovariateMask = genericMask(nRequiredBits, nOptionalBits); // the generic mask to extract optional covariate bits from the combined bitset
|
||||||
|
optionalCovariateIDMask = genericMask(nRequiredBits + nOptionalBits, nOptionalIDBits); // the generic mask to extract optional covariate ID bits from the combined bitset
|
||||||
|
totalNumberOfBits = nRequiredBits + nOptionalBits + nOptionalIDBits + bitsInEventType(); // total number of bits used in the final key
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generates one key per optional covariate.
|
||||||
|
*
|
||||||
|
* Keys include all required covariates, the standard covariate and the event type.
|
||||||
|
*
|
||||||
|
* Example allKeys:
|
||||||
|
* RG, QUAL, CYCLE, CONTEXT
|
||||||
|
*
|
||||||
|
* List of BitSets returned by this example (given eventType):
|
||||||
|
* RG, QUAL, CYCLE, EVENT
|
||||||
|
* RG, QUAL, CONTEXT, EVENT
|
||||||
|
*
|
||||||
|
* Note: If there are no optional covariates, only one bitset key will be returned with all the required covariates and the event type
|
||||||
|
*
|
||||||
|
* @param allKeys The keys in bitset representation for each covariate
|
||||||
|
* @param eventType The type of event described by this keyset (e.g. mismatches, insertions, deletions)
|
||||||
|
* @return one key in bitset representation per covariate
|
||||||
|
*/
|
||||||
|
public List<BitSet> bitSetsFromAllKeys(BitSet[] allKeys, EventType eventType) {
|
||||||
|
List<BitSet> allBitSets = new LinkedList<BitSet>(); // Generate one key per optional covariate
|
||||||
|
|
||||||
|
BitSet eventBitSet = BitSetUtils.bitSetFrom(eventType.index); // create a bitset with the event type
|
||||||
|
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // Location in the bit set to add the event type bits
|
||||||
|
|
||||||
|
int covariateIndex = 0;
|
||||||
|
BitSet requiredKey = new BitSet(nRequiredBits); // This will be a bitset holding all the required keys, to replicate later on
|
||||||
|
for (RequiredCovariateInfo infoRequired : requiredCovariates)
|
||||||
|
addBitSetToKeyAtLocation(requiredKey, allKeys[covariateIndex++], infoRequired.bitsBefore); // Add all the required covariates to the key set
|
||||||
|
|
||||||
|
for (OptionalCovariateInfo infoOptional : optionalCovariates) {
|
||||||
|
BitSet covariateKey = allKeys[covariateIndex++]; // get the bitset from all keys
|
||||||
|
if (covariateKey == null)
|
||||||
|
continue; // do not add nulls to the final set of keys.
|
||||||
|
|
||||||
|
BitSet optionalKey = new BitSet(totalNumberOfBits); // create a new key for this optional covariate
|
||||||
|
optionalKey.or(requiredKey); // import all the required covariates
|
||||||
|
addBitSetToKeyAtLocation(optionalKey, covariateKey, nRequiredBits); // add the optional covariate right after the required covariates
|
||||||
|
addBitSetToKeyAtLocation(optionalKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite
|
||||||
|
addBitSetToKeyAtLocation(optionalKey, eventBitSet, eventTypeBitIndex); // Add the event type
|
||||||
|
allBitSets.add(optionalKey); // add this key to the list of keys
|
||||||
|
}
|
||||||
|
|
||||||
|
if (optionalCovariates.size() == 0) { // special case when we have no optional covariates, add the event type to the required key (our only key)
|
||||||
|
addBitSetToKeyAtLocation(requiredKey, eventBitSet, eventTypeBitIndex); // Add the event type
|
||||||
|
allBitSets.add(requiredKey); // add this key to the list of keys
|
||||||
|
}
|
||||||
|
|
||||||
|
return allBitSets;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generates one bitset key for the covariates represented in Object[] key
|
||||||
|
*
|
||||||
|
* The covariates will have the actual objects produced by the covariates (probably read from the recalibration data file)
|
||||||
|
* and will contain all required covariates and one (or none) optional covariates. Therefore, the product is one bitset key, not many.
|
||||||
|
*
|
||||||
|
* Example key:
|
||||||
|
* RG, QUAL, CYCLE, CYCLE_ID, EventType
|
||||||
|
*
|
||||||
|
* @param key list of objects produced by the required covariates followed by one or zero optional covariates.
|
||||||
|
* @return a bitset key representing these objects. Bitset encryption is done using the covariate's interface.
|
||||||
|
*/
|
||||||
|
public BitSet bitSetFromKey(Object[] key) {
|
||||||
|
BitSet bitSetKey = new BitSet(totalNumberOfBits);
|
||||||
|
|
||||||
|
int requiredCovariate = 0;
|
||||||
|
for (RequiredCovariateInfo infoRequired : requiredCovariates) {
|
||||||
|
BitSet covariateBitSet = infoRequired.covariate.bitSetFromKey(key[requiredCovariate++]); // create a bitset from the object key provided using the required covariate's interface
|
||||||
|
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, infoRequired.bitsBefore); // add it to the bitset key
|
||||||
|
}
|
||||||
|
|
||||||
|
if (optionalCovariates.size() > 0) {
|
||||||
|
int optionalCovariate = requiredCovariates.size(); // the optional covariate index in the key array
|
||||||
|
int covariateIDIndex = optionalCovariate + 1; // the optional covariate ID index is right after the optional covariate's
|
||||||
|
int covariateID = (Short) key[covariateIDIndex]; // get the optional covariate id
|
||||||
|
OptionalCovariateInfo infoOptional = optionalCovariates.get(covariateID); // so we can get the optional covariate information
|
||||||
|
|
||||||
|
BitSet covariateBitSet = infoOptional.covariate.bitSetFromKey(key[optionalCovariate]); // convert the optional covariate key into a bitset using the covariate's interface
|
||||||
|
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, nRequiredBits); // add the optional covariate right after the required covariates
|
||||||
|
addBitSetToKeyAtLocation(bitSetKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite
|
||||||
|
}
|
||||||
|
|
||||||
|
int eventIndex = key.length - 1; // the event type is always the last key
|
||||||
|
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // location in the bit set to add the event type bits
|
||||||
|
BitSet eventBitSet = bitSetFromEvent((EventType) key[eventIndex]); // get the bit set representation of the event type
|
||||||
|
addBitSetToKeyAtLocation(bitSetKey, eventBitSet, eventTypeBitIndex); // add the event type
|
||||||
|
|
||||||
|
return bitSetKey;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generates a key set of objects from a combined bitset key.
|
||||||
|
*
|
||||||
|
* Masks out each covariate independently and decodes their values (Object) into a keyset
|
||||||
|
*
|
||||||
|
* @param key the bitset representation of the keys
|
||||||
|
* @return an object array with the values for each key
|
||||||
|
*/
|
||||||
|
public List<Object> keySetFrom(BitSet key) {
|
||||||
|
List<Object> objectKeys = new ArrayList<Object>();
|
||||||
|
for (RequiredCovariateInfo info : requiredCovariates) {
|
||||||
|
BitSet covariateBitSet = extractBitSetFromKey(key, info.mask, info.bitsBefore); // get the covariate's bitset
|
||||||
|
objectKeys.add(info.covariate.keyFromBitSet(covariateBitSet)); // convert the bitset to object using covariate's interface
|
||||||
|
}
|
||||||
|
|
||||||
|
if (optionalCovariates.size() > 0) {
|
||||||
|
BitSet covBitSet = extractBitSetFromKey(key, optionalCovariateMask, nRequiredBits); // mask out the covariate bit set
|
||||||
|
BitSet idbs = extractBitSetFromKey(key, optionalCovariateIDMask, nRequiredBits + nOptionalBits);// mask out the covariate order (to identify which covariate this is)
|
||||||
|
short id = BitSetUtils.shortFrom(idbs); // covert the id bitset into a short
|
||||||
|
Covariate covariate = optionalCovariates.get(id).covariate; // get the corresponding optional covariate object
|
||||||
|
objectKeys.add(covariate.keyFromBitSet(covBitSet)); // add the optional covariate to the key set
|
||||||
|
objectKeys.add(id); // add the covariate id
|
||||||
|
}
|
||||||
|
objectKeys.add(eventFromBitSet(key)); // add the event type object to the key set
|
||||||
|
|
||||||
|
return objectKeys;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Translates a masked bitset into a bitset starting at 0
|
||||||
|
*
|
||||||
|
* @param key the masked out bitset
|
||||||
|
* @param n the number of bits to chop
|
||||||
|
* @return a translated bitset starting at 0 for the covariate machinery to decode
|
||||||
|
*/
|
||||||
|
private BitSet chopNBitsFrom(BitSet key, int n) {
|
||||||
|
BitSet choppedKey = new BitSet();
|
||||||
|
for (int i = key.nextSetBit(0); i >= 0; i = key.nextSetBit(i + 1))
|
||||||
|
choppedKey.set(i - n); // Set every bit translocated to the beginning of the BitSet
|
||||||
|
return choppedKey;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a mask for the requested covariate to extract the relevant bitset from a combined bitset key
|
||||||
|
*
|
||||||
|
* @param leadingBits the index of the covariate in the ordered covariate list
|
||||||
|
* @param nBits the number of bits needed by the Covariate to represent its values in BitSet form
|
||||||
|
* @return the bitset relevant to the covariate
|
||||||
|
*/
|
||||||
|
|
||||||
|
private BitSet genericMask(int leadingBits, int nBits) {
|
||||||
|
BitSet mask = new BitSet(leadingBits + nBits);
|
||||||
|
mask.set(leadingBits, leadingBits + nBits);
|
||||||
|
return mask;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Decodes the event type (enum) from the full bitset key
|
||||||
|
*
|
||||||
|
* @param fullKey the full key of all covariates + event type
|
||||||
|
* @return the decoded event type.
|
||||||
|
*/
|
||||||
|
private EventType eventFromBitSet(BitSet fullKey) {
|
||||||
|
BitSet eventKey = new BitSet();
|
||||||
|
int firstBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits;
|
||||||
|
for (int i = fullKey.nextSetBit(firstBitIndex); i >= 0; i = fullKey.nextSetBit(i + 1))
|
||||||
|
eventKey.set(i - firstBitIndex);
|
||||||
|
return EventType.eventFrom(BitSetUtils.shortFrom(eventKey));
|
||||||
|
}
|
||||||
|
|
||||||
|
private BitSet bitSetFromEvent(EventType eventType) {
|
||||||
|
return BitSetUtils.bitSetFrom(eventType.index);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int bitsInEventType() {
|
||||||
|
return BitSetUtils.numberOfBitsToRepresent(EventType.values().length);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addBitSetToKeyAtLocation(BitSet key, BitSet bitSet, int location) {
|
||||||
|
for (int j = bitSet.nextSetBit(0); j >= 0; j = bitSet.nextSetBit(j + 1))
|
||||||
|
key.set(j + location); // translate the bits set in the key to their corresponding position in the full key
|
||||||
|
}
|
||||||
|
|
||||||
|
private BitSet extractBitSetFromKey (BitSet key, BitSet mask, int leadingBits) {
|
||||||
|
BitSet bitSet = (BitSet) key.clone();
|
||||||
|
bitSet.and(mask);
|
||||||
|
return chopNBitsFrom(bitSet, leadingBits);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Aggregate information for each Covariate
|
||||||
|
*/
|
||||||
|
class RequiredCovariateInfo {
|
||||||
|
public int bitsBefore; // number of bits before this covariate in the combined bitset key
|
||||||
|
public int nBits; // number of bits used by this covariate (cached access to covariate.nBits())
|
||||||
|
public BitSet mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits )
|
||||||
|
public Covariate covariate; // this allows reverse lookup of the Covariates in order
|
||||||
|
|
||||||
|
RequiredCovariateInfo(int bitsBefore, int nBits, BitSet mask, Covariate covariate) {
|
||||||
|
this.bitsBefore = bitsBefore;
|
||||||
|
this.nBits = nBits;
|
||||||
|
this.mask = mask;
|
||||||
|
this.covariate = covariate;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class OptionalCovariateInfo {
|
||||||
|
public BitSet covariateID; // cache the covariate ID
|
||||||
|
public Covariate covariate;
|
||||||
|
|
||||||
|
OptionalCovariateInfo(BitSet covariateID, Covariate covariate) {
|
||||||
|
this.covariateID = covariateID;
|
||||||
|
this.covariate = covariate;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -26,7 +26,9 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.BaseUtils;
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
import org.broadinstitute.sting.utils.BitSetUtils;
|
||||||
|
import org.broadinstitute.sting.utils.clipping.ClippingRepresentation;
|
||||||
|
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
|
||||||
|
|
@ -43,7 +45,12 @@ public class ContextCovariate implements StandardCovariate {
|
||||||
|
|
||||||
private int mismatchesContextSize;
|
private int mismatchesContextSize;
|
||||||
private int insertionsContextSize;
|
private int insertionsContextSize;
|
||||||
private int deletionsContextSize;
|
private int deletionsContextSize;
|
||||||
|
|
||||||
|
private final BitSet NO_CONTEXT_BITSET = BitSetUtils.bitSetFrom(-1L);
|
||||||
|
// protected final String NO_CONTEXT_VALUE = "N"; // protected so we can UNIT TEST it
|
||||||
|
|
||||||
|
private byte LOW_QUAL_TAIL;
|
||||||
|
|
||||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||||
@Override
|
@Override
|
||||||
|
|
@ -52,18 +59,22 @@ public class ContextCovariate implements StandardCovariate {
|
||||||
insertionsContextSize = RAC.INSERTIONS_CONTEXT_SIZE;
|
insertionsContextSize = RAC.INSERTIONS_CONTEXT_SIZE;
|
||||||
deletionsContextSize = RAC.DELETIONS_CONTEXT_SIZE;
|
deletionsContextSize = RAC.DELETIONS_CONTEXT_SIZE;
|
||||||
|
|
||||||
|
LOW_QUAL_TAIL = RAC.LOW_QUAL_TAIL;
|
||||||
|
|
||||||
if (mismatchesContextSize <= 0 || insertionsContextSize <= 0 || deletionsContextSize <= 0)
|
if (mismatchesContextSize <= 0 || insertionsContextSize <= 0 || deletionsContextSize <= 0)
|
||||||
throw new UserException(String.format("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Insertions: %d Deletions:%d", mismatchesContextSize, insertionsContextSize, deletionsContextSize));
|
throw new UserException(String.format("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Insertions: %d Deletions:%d", mismatchesContextSize, insertionsContextSize, deletionsContextSize));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public CovariateValues getValues(final GATKSAMRecord read) {
|
public CovariateValues getValues(GATKSAMRecord read) {
|
||||||
int l = read.getReadLength();
|
int l = read.getReadLength();
|
||||||
BitSet[] mismatches = new BitSet[l];
|
BitSet[] mismatches = new BitSet[l];
|
||||||
BitSet[] insertions = new BitSet[l];
|
BitSet[] insertions = new BitSet[l];
|
||||||
BitSet[] deletions = new BitSet[l];
|
BitSet[] deletions = new BitSet[l];
|
||||||
|
|
||||||
|
read = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); // Write N's over the low quality tail of the reads to avoid adding them into the context
|
||||||
|
|
||||||
final boolean negativeStrand = read.getReadNegativeStrandFlag();
|
final boolean negativeStrand = read.getReadNegativeStrandFlag();
|
||||||
byte[] bases = read.getReadBases();
|
byte[] bases = read.getReadBases();
|
||||||
if (negativeStrand)
|
if (negativeStrand)
|
||||||
|
|
@ -72,7 +83,7 @@ public class ContextCovariate implements StandardCovariate {
|
||||||
for (int i = 0; i < read.getReadLength(); i++) {
|
for (int i = 0; i < read.getReadLength(); i++) {
|
||||||
mismatches[i] = contextWith(bases, i, mismatchesContextSize);
|
mismatches[i] = contextWith(bases, i, mismatchesContextSize);
|
||||||
insertions[i] = contextWith(bases, i, insertionsContextSize);
|
insertions[i] = contextWith(bases, i, insertionsContextSize);
|
||||||
deletions[i] = contextWith(bases, i, deletionsContextSize);
|
deletions[i] = contextWith(bases, i, deletionsContextSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (negativeStrand) {
|
if (negativeStrand) {
|
||||||
|
|
@ -89,24 +100,41 @@ public class ContextCovariate implements StandardCovariate {
|
||||||
return str;
|
return str;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String keyFromBitSet(BitSet key) {
|
||||||
|
if (key == null) // this can only happen in test routines because we do not propagate null keys to the csv file
|
||||||
|
return null;
|
||||||
|
|
||||||
|
return BitSetUtils.dnaFrom(key);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BitSet bitSetFromKey(Object key) {
|
||||||
|
return BitSetUtils.bitSetFrom((String) key);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int numberOfBits() {
|
||||||
|
return Long.bitCount(-1L);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* calculates the context of a base independent of the covariate mode
|
* calculates the context of a base independent of the covariate mode (mismatch, insertion or deletion)
|
||||||
*
|
*
|
||||||
* @param bases the bases in the read to build the context from
|
* @param bases the bases in the read to build the context from
|
||||||
* @param offset the position in the read to calculate the context for
|
* @param offset the position in the read to calculate the context for
|
||||||
* @param contextSize context size to use building the context
|
* @param contextSize context size to use building the context
|
||||||
* @return
|
* @return the bitSet representing the Context
|
||||||
*/
|
*/
|
||||||
private BitSet contextWith(byte [] bases, int offset, int contextSize) {
|
private BitSet contextWith(byte[] bases, int offset, int contextSize) {
|
||||||
if (offset < contextSize)
|
BitSet result = null;
|
||||||
return null;
|
if (offset >= contextSize) {
|
||||||
|
String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset));
|
||||||
String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset));
|
if (!context.contains("N"))
|
||||||
if (context.contains("N"))
|
result = BitSetUtils.bitSetFrom(context);
|
||||||
return null;
|
}
|
||||||
|
return result;
|
||||||
return MathUtils.bitSetFrom(context);
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reverses the given array in place.
|
* Reverses the given array in place.
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
|
||||||
|
import java.util.BitSet;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2009 The Broad Institute
|
* Copyright (c) 2009 The Broad Institute
|
||||||
*
|
*
|
||||||
|
|
@ -53,7 +55,40 @@ public interface Covariate {
|
||||||
*/
|
*/
|
||||||
public CovariateValues getValues(GATKSAMRecord read);
|
public CovariateValues getValues(GATKSAMRecord read);
|
||||||
|
|
||||||
public Object getValue(String str); // Used to get the covariate's value from input csv file during on-the-fly recalibration
|
/**
|
||||||
|
* Used to get the covariate's value from input csv file during on-the-fly recalibration
|
||||||
|
*
|
||||||
|
* @param str the key in string type (read from the csv)
|
||||||
|
* @return the key in its correct type.
|
||||||
|
*/
|
||||||
|
public Object getValue(String str);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts the bitset representation of the key (used internally for table indexing) to String format for file output.
|
||||||
|
*
|
||||||
|
* @param key the bitset representation of the key
|
||||||
|
* @return a string representation of the key
|
||||||
|
*/
|
||||||
|
public String keyFromBitSet(BitSet key);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts a key into a bitset
|
||||||
|
*
|
||||||
|
* Only necessary for on-the-fly recalibration when you have the object, but need to store it in memory in bitset format. For counting covariates
|
||||||
|
* the getValues method already returns all values in BitSet format.
|
||||||
|
*
|
||||||
|
* @param key the object corresponding to the covariate
|
||||||
|
* @return a bitset representation of the object
|
||||||
|
*/
|
||||||
|
public BitSet bitSetFromKey(Object key);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Each covariate should determine how many bits are necessary to encode its data
|
||||||
|
*
|
||||||
|
* @return The number of bits used to represent the values of this covariate.
|
||||||
|
*/
|
||||||
|
public int numberOfBits();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
interface RequiredCovariate extends Covariate {}
|
interface RequiredCovariate extends Covariate {}
|
||||||
|
|
|
||||||
|
|
@ -1,88 +0,0 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The object temporarily held by a read that describes all of its covariates.
|
|
||||||
*
|
|
||||||
* In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap
|
|
||||||
*
|
|
||||||
* @author Mauricio Carneiro
|
|
||||||
* @since 2/8/12
|
|
||||||
*/
|
|
||||||
public class CovariateKeySet {
|
|
||||||
private Object[][] mismatchesKeySet;
|
|
||||||
private Object[][] insertionsKeySet;
|
|
||||||
private Object[][] deletionsKeySet;
|
|
||||||
|
|
||||||
private int nextCovariateIndex;
|
|
||||||
|
|
||||||
private static String mismatchesCovariateName = "M";
|
|
||||||
private static String insertionsCovariateName = "I";
|
|
||||||
private static String deletionsCovariateName = "D";
|
|
||||||
|
|
||||||
public CovariateKeySet(int readLength, int numberOfCovariates) {
|
|
||||||
numberOfCovariates++; // +1 because we are adding the mismatch covariate (to comply with the molten table format)
|
|
||||||
this.mismatchesKeySet = new Object[readLength][numberOfCovariates];
|
|
||||||
this.insertionsKeySet = new Object[readLength][numberOfCovariates];
|
|
||||||
this.deletionsKeySet = new Object[readLength][numberOfCovariates];
|
|
||||||
initializeCovariateKeySet(this.mismatchesKeySet, mismatchesCovariateName);
|
|
||||||
initializeCovariateKeySet(this.insertionsKeySet, insertionsCovariateName);
|
|
||||||
initializeCovariateKeySet(this.deletionsKeySet, deletionsCovariateName);
|
|
||||||
this.nextCovariateIndex = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void addCovariate(CovariateValues covariate) {
|
|
||||||
transposeCovariateValues(mismatchesKeySet, covariate.getMismatches());
|
|
||||||
transposeCovariateValues(insertionsKeySet, covariate.getInsertions());
|
|
||||||
transposeCovariateValues(deletionsKeySet, covariate.getDeletions());
|
|
||||||
nextCovariateIndex++;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static RecalDataManager.BaseRecalibrationType getErrorModelFromString(final String modelString) {
|
|
||||||
if (modelString.equals(mismatchesCovariateName))
|
|
||||||
return RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION;
|
|
||||||
else if (modelString.equals(insertionsCovariateName))
|
|
||||||
return RecalDataManager.BaseRecalibrationType.BASE_INSERTION;
|
|
||||||
else if (modelString.equals(deletionsCovariateName))
|
|
||||||
return RecalDataManager.BaseRecalibrationType.BASE_DELETION;
|
|
||||||
throw new ReviewedStingException("Unrecognized Base Recalibration model string: " + modelString);
|
|
||||||
}
|
|
||||||
|
|
||||||
public Object[] getKeySet(final int readPosition, final RecalDataManager.BaseRecalibrationType errorModel) {
|
|
||||||
switch (errorModel) {
|
|
||||||
case BASE_SUBSTITUTION:
|
|
||||||
return getMismatchesKeySet(readPosition);
|
|
||||||
case BASE_INSERTION:
|
|
||||||
return getInsertionsKeySet(readPosition);
|
|
||||||
case BASE_DELETION:
|
|
||||||
return getDeletionsKeySet(readPosition);
|
|
||||||
default:
|
|
||||||
throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public Object[] getMismatchesKeySet(int readPosition) {
|
|
||||||
return mismatchesKeySet[readPosition];
|
|
||||||
}
|
|
||||||
|
|
||||||
public Object[] getInsertionsKeySet(int readPosition) {
|
|
||||||
return insertionsKeySet[readPosition];
|
|
||||||
}
|
|
||||||
|
|
||||||
public Object[] getDeletionsKeySet(int readPosition) {
|
|
||||||
return deletionsKeySet[readPosition];
|
|
||||||
}
|
|
||||||
|
|
||||||
private void transposeCovariateValues (Object [][] keySet, Object [] covariateValues) {
|
|
||||||
for (int i=0; i<covariateValues.length; i++)
|
|
||||||
keySet[i][nextCovariateIndex] = covariateValues[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
private void initializeCovariateKeySet (Object[][] keySet, String covariateName) {
|
|
||||||
int readLength = keySet.length;
|
|
||||||
int lastCovariateIndex = keySet[0].length - 1;
|
|
||||||
for (int i = 0; i < readLength; i++)
|
|
||||||
keySet[i][lastCovariateIndex] = covariateName;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,5 +1,7 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
|
import java.util.BitSet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An object to hold the different covariate values for all bases in the read.
|
* An object to hold the different covariate values for all bases in the read.
|
||||||
*
|
*
|
||||||
|
|
@ -12,25 +14,25 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
* @since 2/8/12
|
* @since 2/8/12
|
||||||
*/
|
*/
|
||||||
public class CovariateValues {
|
public class CovariateValues {
|
||||||
private Object[] mismatches;
|
private BitSet[] mismatches;
|
||||||
private Object[] insertions;
|
private BitSet[] insertions;
|
||||||
private Object[] deletions;
|
private BitSet[] deletions;
|
||||||
|
|
||||||
public CovariateValues(Object[] mismatch, Object[] insertion, Object[] deletion) {
|
public CovariateValues(BitSet[] mismatch, BitSet[] insertion, BitSet[] deletion) {
|
||||||
this.mismatches = mismatch;
|
this.mismatches = mismatch;
|
||||||
this.insertions = insertion;
|
this.insertions = insertion;
|
||||||
this.deletions = deletion;
|
this.deletions = deletion;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Object[] getMismatches() {
|
public BitSet[] getMismatches() {
|
||||||
return mismatches;
|
return mismatches;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Object[] getInsertions() {
|
public BitSet[] getInsertions() {
|
||||||
return insertions;
|
return insertions;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Object[] getDeletions() {
|
public BitSet[] getDeletions() {
|
||||||
return deletions;
|
return deletions;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,12 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.BaseUtils;
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
import org.broadinstitute.sting.utils.BitSetUtils;
|
||||||
import org.broadinstitute.sting.utils.NGSPlatform;
|
import org.broadinstitute.sting.utils.NGSPlatform;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
|
||||||
|
import java.util.BitSet;
|
||||||
import java.util.EnumSet;
|
import java.util.EnumSet;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
@ -59,48 +61,25 @@ public class CycleCovariate implements StandardCovariate {
|
||||||
// Used to pick out the covariate's value from attributes of the read
|
// Used to pick out the covariate's value from attributes of the read
|
||||||
@Override
|
@Override
|
||||||
public CovariateValues getValues(final GATKSAMRecord read) {
|
public CovariateValues getValues(final GATKSAMRecord read) {
|
||||||
Integer [] cycles = new Integer[read.getReadLength()];
|
BitSet[] cycles = new BitSet[read.getReadLength()];
|
||||||
final NGSPlatform ngsPlatform = read.getNGSPlatform();
|
final NGSPlatform ngsPlatform = read.getNGSPlatform();
|
||||||
|
|
||||||
// Discrete cycle platforms
|
// Discrete cycle platforms
|
||||||
if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) {
|
if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) {
|
||||||
final int init;
|
final short init;
|
||||||
final int increment;
|
final short increment;
|
||||||
if (!read.getReadNegativeStrandFlag()) {
|
if (!read.getReadNegativeStrandFlag()) {
|
||||||
// Differentiate between first and second of pair.
|
init = 1;
|
||||||
// The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
|
increment = 1;
|
||||||
// to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair.
|
|
||||||
// Therefore the cycle covariate must differentiate between first and second of pair reads.
|
|
||||||
// This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because
|
|
||||||
// the current sequential model would consider the effects independently instead of jointly.
|
|
||||||
if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
|
|
||||||
//second of pair, positive strand
|
|
||||||
init = -1;
|
|
||||||
increment = -1;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
//first of pair, positive strand
|
|
||||||
init = 1;
|
|
||||||
increment = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
|
init = (short) read.getReadLength();
|
||||||
//second of pair, negative strand
|
increment = -1;
|
||||||
init = -read.getReadLength();
|
|
||||||
increment = 1;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
//first of pair, negative strand
|
|
||||||
init = read.getReadLength();
|
|
||||||
increment = -1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int cycle = init;
|
short cycle = init;
|
||||||
for (int i = 0; i < read.getReadLength(); i++) {
|
for (int i = 0; i < read.getReadLength(); i++) {
|
||||||
cycles[i] = cycle;
|
cycles[i] = BitSetUtils.bitSetFrom(cycle);
|
||||||
cycle += increment;
|
cycle += increment;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -119,7 +98,7 @@ public class CycleCovariate implements StandardCovariate {
|
||||||
// the current sequential model would consider the effects independently instead of jointly.
|
// the current sequential model would consider the effects independently instead of jointly.
|
||||||
final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag();
|
final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag();
|
||||||
|
|
||||||
int cycle = multiplyByNegative1 ? -1 : 1;
|
short cycle = multiplyByNegative1 ? (short) -1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms.
|
||||||
|
|
||||||
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
|
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
|
||||||
// For example, AAAAAAA was probably read in two flow cycles but here we count it as one
|
// For example, AAAAAAA was probably read in two flow cycles but here we count it as one
|
||||||
|
|
@ -127,19 +106,19 @@ public class CycleCovariate implements StandardCovariate {
|
||||||
int iii = 0;
|
int iii = 0;
|
||||||
while (iii < readLength) {
|
while (iii < readLength) {
|
||||||
while (iii < readLength && bases[iii] == (byte) 'T') {
|
while (iii < readLength && bases[iii] == (byte) 'T') {
|
||||||
cycles[iii] = cycle;
|
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||||
iii++;
|
iii++;
|
||||||
}
|
}
|
||||||
while (iii < readLength && bases[iii] == (byte) 'A') {
|
while (iii < readLength && bases[iii] == (byte) 'A') {
|
||||||
cycles[iii] = cycle;
|
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||||
iii++;
|
iii++;
|
||||||
}
|
}
|
||||||
while (iii < readLength && bases[iii] == (byte) 'C') {
|
while (iii < readLength && bases[iii] == (byte) 'C') {
|
||||||
cycles[iii] = cycle;
|
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||||
iii++;
|
iii++;
|
||||||
}
|
}
|
||||||
while (iii < readLength && bases[iii] == (byte) 'G') {
|
while (iii < readLength && bases[iii] == (byte) 'G') {
|
||||||
cycles[iii] = cycle;
|
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||||
iii++;
|
iii++;
|
||||||
}
|
}
|
||||||
if (iii < readLength) {
|
if (iii < readLength) {
|
||||||
|
|
@ -149,7 +128,7 @@ public class CycleCovariate implements StandardCovariate {
|
||||||
cycle++;
|
cycle++;
|
||||||
}
|
}
|
||||||
if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) {
|
if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) {
|
||||||
cycles[iii] = cycle;
|
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||||
iii++;
|
iii++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -159,19 +138,19 @@ public class CycleCovariate implements StandardCovariate {
|
||||||
int iii = readLength - 1;
|
int iii = readLength - 1;
|
||||||
while (iii >= 0) {
|
while (iii >= 0) {
|
||||||
while (iii >= 0 && bases[iii] == (byte) 'T') {
|
while (iii >= 0 && bases[iii] == (byte) 'T') {
|
||||||
cycles[iii] = cycle;
|
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||||
iii--;
|
iii--;
|
||||||
}
|
}
|
||||||
while (iii >= 0 && bases[iii] == (byte) 'A') {
|
while (iii >= 0 && bases[iii] == (byte) 'A') {
|
||||||
cycles[iii] = cycle;
|
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||||
iii--;
|
iii--;
|
||||||
}
|
}
|
||||||
while (iii >= 0 && bases[iii] == (byte) 'C') {
|
while (iii >= 0 && bases[iii] == (byte) 'C') {
|
||||||
cycles[iii] = cycle;
|
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||||
iii--;
|
iii--;
|
||||||
}
|
}
|
||||||
while (iii >= 0 && bases[iii] == (byte) 'G') {
|
while (iii >= 0 && bases[iii] == (byte) 'G') {
|
||||||
cycles[iii] = cycle;
|
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||||
iii--;
|
iii--;
|
||||||
}
|
}
|
||||||
if (iii >= 0) {
|
if (iii >= 0) {
|
||||||
|
|
@ -181,7 +160,7 @@ public class CycleCovariate implements StandardCovariate {
|
||||||
cycle++;
|
cycle++;
|
||||||
}
|
}
|
||||||
if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) {
|
if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) {
|
||||||
cycles[iii] = cycle;
|
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
|
||||||
iii--;
|
iii--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -192,13 +171,28 @@ public class CycleCovariate implements StandardCovariate {
|
||||||
else {
|
else {
|
||||||
throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid");
|
throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid");
|
||||||
}
|
}
|
||||||
|
|
||||||
return new CovariateValues(cycles, cycles, cycles);
|
return new CovariateValues(cycles, cycles, cycles);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Used to get the covariate's value from input csv file during on-the-fly recalibration
|
// Used to get the covariate's value from input csv file during on-the-fly recalibration
|
||||||
@Override
|
@Override
|
||||||
public final Object getValue(final String str) {
|
public final Object getValue(final String str) {
|
||||||
return Integer.parseInt(str);
|
return Short.parseShort(str);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String keyFromBitSet(BitSet key) {
|
||||||
|
return String.format("%d", BitSetUtils.shortFrom(key));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BitSet bitSetFromKey(Object key) {
|
||||||
|
return BitSetUtils.bitSetFrom((Short) key);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int numberOfBits() {
|
||||||
|
return BitSetUtils.numberOfBitsToRepresent(2 * Short.MAX_VALUE); // positive and negative
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -0,0 +1,43 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
public enum EventType {
|
||||||
|
BASE_SUBSTITUTION(0, "M"),
|
||||||
|
BASE_INSERTION(1, "I"),
|
||||||
|
BASE_DELETION(2, "D");
|
||||||
|
|
||||||
|
public int index;
|
||||||
|
public String representation;
|
||||||
|
|
||||||
|
private EventType(int index, String representation) {
|
||||||
|
this.index = index;
|
||||||
|
this.representation = representation;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static EventType eventFrom(int index) {
|
||||||
|
switch (index) {
|
||||||
|
case 0:
|
||||||
|
return BASE_SUBSTITUTION;
|
||||||
|
case 1:
|
||||||
|
return BASE_INSERTION;
|
||||||
|
case 2:
|
||||||
|
return BASE_DELETION;
|
||||||
|
default:
|
||||||
|
throw new ReviewedStingException(String.format("Event %d does not exist.", index));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static EventType eventFrom(String event) {
|
||||||
|
for (EventType eventType : EventType.values())
|
||||||
|
if (eventType.representation.equals(event))
|
||||||
|
return eventType;
|
||||||
|
|
||||||
|
throw new ReviewedStingException(String.format("Event %s does not exist.", event));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return representation;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1,7 +1,11 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.BitSetUtils;
|
||||||
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
|
||||||
|
import java.util.BitSet;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2009 The Broad Institute
|
* Copyright (c) 2009 The Broad Institute
|
||||||
*
|
*
|
||||||
|
|
@ -46,18 +50,18 @@ public class QualityScoreCovariate implements RequiredCovariate {
|
||||||
public CovariateValues getValues(final GATKSAMRecord read) {
|
public CovariateValues getValues(final GATKSAMRecord read) {
|
||||||
int readLength = read.getReadLength();
|
int readLength = read.getReadLength();
|
||||||
|
|
||||||
Integer [] mismatches = new Integer[readLength];
|
BitSet[] mismatches = new BitSet[readLength];
|
||||||
Integer [] insertions = new Integer[readLength];
|
BitSet[] insertions = new BitSet[readLength];
|
||||||
Integer [] deletions = new Integer[readLength];
|
BitSet[] deletions = new BitSet[readLength];
|
||||||
|
|
||||||
byte [] baseQualities = read.getBaseQualities();
|
byte[] baseQualities = read.getBaseQualities();
|
||||||
byte [] baseInsertionQualities = read.getBaseInsertionQualities();
|
byte[] baseInsertionQualities = read.getBaseInsertionQualities();
|
||||||
byte [] baseDeletionQualities = read.getBaseDeletionQualities();
|
byte[] baseDeletionQualities = read.getBaseDeletionQualities();
|
||||||
|
|
||||||
for (int i=0; i<baseQualities.length; i++) {
|
for (int i = 0; i < baseQualities.length; i++) {
|
||||||
mismatches[i] = (int) baseQualities[i];
|
mismatches[i] = BitSetUtils.bitSetFrom(baseQualities[i]);
|
||||||
insertions[i] = (int) baseInsertionQualities[i];
|
insertions[i] = BitSetUtils.bitSetFrom(baseInsertionQualities[i]);
|
||||||
deletions[i] = (int) baseDeletionQualities[i];
|
deletions[i] = BitSetUtils.bitSetFrom(baseDeletionQualities[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
return new CovariateValues(mismatches, insertions, deletions);
|
return new CovariateValues(mismatches, insertions, deletions);
|
||||||
|
|
@ -66,6 +70,21 @@ public class QualityScoreCovariate implements RequiredCovariate {
|
||||||
// Used to get the covariate's value from input csv file during on-the-fly recalibration
|
// Used to get the covariate's value from input csv file during on-the-fly recalibration
|
||||||
@Override
|
@Override
|
||||||
public final Object getValue(final String str) {
|
public final Object getValue(final String str) {
|
||||||
return Integer.parseInt(str);
|
return Byte.parseByte(str);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String keyFromBitSet(BitSet key) {
|
||||||
|
return String.format("%d", BitSetUtils.longFrom(key));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BitSet bitSetFromKey(Object key) {
|
||||||
|
return BitSetUtils.bitSetFrom((Byte) key);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int numberOfBits() {
|
||||||
|
return BitSetUtils.numberOfBitsToRepresent(QualityUtils.MAX_QUAL_SCORE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,65 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.util.BitSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The object temporarily held by a read that describes all of its covariates.
|
||||||
|
*
|
||||||
|
* In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap
|
||||||
|
*
|
||||||
|
* @author Mauricio Carneiro
|
||||||
|
* @since 2/8/12
|
||||||
|
*/
|
||||||
|
public class ReadCovariates {
|
||||||
|
private BitSet[][] mismatchesKeySet;
|
||||||
|
private BitSet[][] insertionsKeySet;
|
||||||
|
private BitSet[][] deletionsKeySet;
|
||||||
|
|
||||||
|
private int nextCovariateIndex;
|
||||||
|
|
||||||
|
public ReadCovariates(int readLength, int numberOfCovariates) {
|
||||||
|
this.mismatchesKeySet = new BitSet[readLength][numberOfCovariates];
|
||||||
|
this.insertionsKeySet = new BitSet[readLength][numberOfCovariates];
|
||||||
|
this.deletionsKeySet = new BitSet[readLength][numberOfCovariates];
|
||||||
|
this.nextCovariateIndex = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addCovariate(CovariateValues covariate) {
|
||||||
|
transposeCovariateValues(mismatchesKeySet, covariate.getMismatches());
|
||||||
|
transposeCovariateValues(insertionsKeySet, covariate.getInsertions());
|
||||||
|
transposeCovariateValues(deletionsKeySet, covariate.getDeletions());
|
||||||
|
nextCovariateIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
public BitSet[] getKeySet(final int readPosition, final EventType errorModel) {
|
||||||
|
switch (errorModel) {
|
||||||
|
case BASE_SUBSTITUTION:
|
||||||
|
return getMismatchesKeySet(readPosition);
|
||||||
|
case BASE_INSERTION:
|
||||||
|
return getInsertionsKeySet(readPosition);
|
||||||
|
case BASE_DELETION:
|
||||||
|
return getDeletionsKeySet(readPosition);
|
||||||
|
default:
|
||||||
|
throw new ReviewedStingException("Unrecognized Base Recalibration type: " + errorModel);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public BitSet[] getMismatchesKeySet(int readPosition) {
|
||||||
|
return mismatchesKeySet[readPosition];
|
||||||
|
}
|
||||||
|
|
||||||
|
public BitSet[] getInsertionsKeySet(int readPosition) {
|
||||||
|
return insertionsKeySet[readPosition];
|
||||||
|
}
|
||||||
|
|
||||||
|
public BitSet[] getDeletionsKeySet(int readPosition) {
|
||||||
|
return deletionsKeySet[readPosition];
|
||||||
|
}
|
||||||
|
|
||||||
|
private void transposeCovariateValues(BitSet[][] keySet, BitSet[] covariateValues) {
|
||||||
|
for (int i = 0; i < covariateValues.length; i++)
|
||||||
|
keySet[i][nextCovariateIndex] = covariateValues[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1,8 +1,10 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.BitSetUtils;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.BitSet;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
@ -39,7 +41,7 @@ import java.util.HashMap;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class ReadGroupCovariate implements RequiredCovariate {
|
public class ReadGroupCovariate implements RequiredCovariate {
|
||||||
|
|
||||||
private final HashMap<String, Short> readGroupLookupTable = new HashMap<String, Short>();
|
private final HashMap<String, Short> readGroupLookupTable = new HashMap<String, Short>();
|
||||||
private final HashMap<Short, String> readGroupReverseLookupTable = new HashMap<Short, String>();
|
private final HashMap<Short, String> readGroupReverseLookupTable = new HashMap<Short, String>();
|
||||||
private short nextId = 0;
|
private short nextId = 0;
|
||||||
|
|
@ -53,17 +55,9 @@ public class ReadGroupCovariate implements RequiredCovariate {
|
||||||
public CovariateValues getValues(final GATKSAMRecord read) {
|
public CovariateValues getValues(final GATKSAMRecord read) {
|
||||||
final int l = read.getReadLength();
|
final int l = read.getReadLength();
|
||||||
final String readGroupId = read.getReadGroup().getReadGroupId();
|
final String readGroupId = read.getReadGroup().getReadGroupId();
|
||||||
short shortId;
|
BitSet rg = bitSetForReadGroup(readGroupId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset
|
||||||
if (readGroupLookupTable.containsKey(readGroupId))
|
BitSet[] readGroups = new BitSet[l];
|
||||||
shortId = readGroupLookupTable.get(readGroupId);
|
Arrays.fill(readGroups, rg);
|
||||||
else {
|
|
||||||
shortId = nextId;
|
|
||||||
readGroupLookupTable.put(readGroupId, nextId);
|
|
||||||
readGroupReverseLookupTable.put(nextId, readGroupId);
|
|
||||||
nextId++;
|
|
||||||
}
|
|
||||||
Short [] readGroups = new Short[l];
|
|
||||||
Arrays.fill(readGroups, shortId);
|
|
||||||
return new CovariateValues(readGroups, readGroups, readGroups);
|
return new CovariateValues(readGroups, readGroups, readGroups);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -72,10 +66,38 @@ public class ReadGroupCovariate implements RequiredCovariate {
|
||||||
public final Object getValue(final String str) {
|
public final Object getValue(final String str) {
|
||||||
return str;
|
return str;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String keyFromBitSet(BitSet key) {
|
||||||
|
return decodeReadGroup((short) BitSetUtils.longFrom(key));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BitSet bitSetFromKey(Object key) {
|
||||||
|
return bitSetForReadGroup((String) key);
|
||||||
|
}
|
||||||
|
|
||||||
public final String decodeReadGroup(final short id) {
|
public final String decodeReadGroup(final short id) {
|
||||||
return readGroupReverseLookupTable.get(id);
|
return readGroupReverseLookupTable.get(id);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int numberOfBits() {
|
||||||
|
return BitSetUtils.numberOfBitsToRepresent(Short.MAX_VALUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
private BitSet bitSetForReadGroup(String readGroupId) {
|
||||||
|
short shortId;
|
||||||
|
if (readGroupLookupTable.containsKey(readGroupId))
|
||||||
|
shortId = readGroupLookupTable.get(readGroupId);
|
||||||
|
else {
|
||||||
|
shortId = nextId;
|
||||||
|
readGroupLookupTable.put(readGroupId, nextId);
|
||||||
|
readGroupReverseLookupTable.put(nextId, readGroupId);
|
||||||
|
nextId++;
|
||||||
|
}
|
||||||
|
return BitSetUtils.bitSetFrom(shortId);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -53,24 +53,18 @@ import java.util.Map;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class RecalDataManager {
|
public class RecalDataManager {
|
||||||
public final NestedHashMap nestedHashMap; // The full dataset
|
public final NestedHashMap nestedHashMap; // The full dataset
|
||||||
private final HashMap<BaseRecalibrationType, NestedHashMap> dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
private final HashMap<EventType, NestedHashMap> dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||||
private final HashMap<BaseRecalibrationType, NestedHashMap> dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
|
private final HashMap<EventType, NestedHashMap> dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
|
||||||
private final HashMap<BaseRecalibrationType, ArrayList<NestedHashMap>> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
|
private final HashMap<EventType, ArrayList<NestedHashMap>> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
|
||||||
|
|
||||||
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
|
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
|
||||||
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
|
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
|
||||||
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
|
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
|
||||||
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
|
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
|
||||||
private static boolean warnUserNullPlatform = false;
|
private static boolean warnUserNullPlatform = false;
|
||||||
|
|
||||||
private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\
|
private static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\
|
||||||
|
|
||||||
public enum BaseRecalibrationType {
|
|
||||||
BASE_SUBSTITUTION,
|
|
||||||
BASE_INSERTION,
|
|
||||||
BASE_DELETION
|
|
||||||
}
|
|
||||||
|
|
||||||
public enum SOLID_RECAL_MODE {
|
public enum SOLID_RECAL_MODE {
|
||||||
/**
|
/**
|
||||||
|
|
@ -116,10 +110,10 @@ public class RecalDataManager {
|
||||||
public RecalDataManager(final boolean createCollapsedTables, final int numCovariates) {
|
public RecalDataManager(final boolean createCollapsedTables, final int numCovariates) {
|
||||||
if (createCollapsedTables) { // Initialize all the collapsed tables, only used by on-the-fly recalibration
|
if (createCollapsedTables) { // Initialize all the collapsed tables, only used by on-the-fly recalibration
|
||||||
nestedHashMap = null;
|
nestedHashMap = null;
|
||||||
dataCollapsedReadGroup = new HashMap<BaseRecalibrationType, NestedHashMap>();
|
dataCollapsedReadGroup = new HashMap<EventType, NestedHashMap>();
|
||||||
dataCollapsedQualityScore = new HashMap<BaseRecalibrationType, NestedHashMap>();
|
dataCollapsedQualityScore = new HashMap<EventType, NestedHashMap>();
|
||||||
dataCollapsedByCovariate = new HashMap<BaseRecalibrationType, ArrayList<NestedHashMap>>();
|
dataCollapsedByCovariate = new HashMap<EventType, ArrayList<NestedHashMap>>();
|
||||||
for ( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) {
|
for (final EventType errorModel : EventType.values()) {
|
||||||
dataCollapsedReadGroup.put(errorModel, new NestedHashMap());
|
dataCollapsedReadGroup.put(errorModel, new NestedHashMap());
|
||||||
dataCollapsedQualityScore.put(errorModel, new NestedHashMap());
|
dataCollapsedQualityScore.put(errorModel, new NestedHashMap());
|
||||||
dataCollapsedByCovariate.put(errorModel, new ArrayList<NestedHashMap>());
|
dataCollapsedByCovariate.put(errorModel, new ArrayList<NestedHashMap>());
|
||||||
|
|
@ -136,100 +130,10 @@ public class RecalDataManager {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static CovariateKeySet getAllCovariateValuesFor(GATKSAMRecord read) {
|
public static ReadCovariates covariateKeySetFrom(GATKSAMRecord read) {
|
||||||
return (CovariateKeySet) read.getTemporaryAttribute(COVARS_ATTRIBUTE);
|
return (ReadCovariates) read.getTemporaryAttribute(COVARS_ATTRIBUTE);
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Add the given mapping to all of the collapsed hash tables
|
|
||||||
*
|
|
||||||
* @param key The list of comparables that is the key for this mapping
|
|
||||||
* @param fullDatum The RecalDatum which is the data for this mapping
|
|
||||||
* @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table
|
|
||||||
*/
|
|
||||||
public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN, final BaseRecalibrationType errorModel ) {
|
|
||||||
|
|
||||||
// The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
|
|
||||||
//data.put(key, thisDatum); // add the mapping to the main table
|
|
||||||
|
|
||||||
final int qualityScore = Integer.parseInt(key[1].toString());
|
|
||||||
final Object[] readGroupCollapsedKey = new Object[1];
|
|
||||||
final Object[] qualityScoreCollapsedKey = new Object[2];
|
|
||||||
final Object[] covariateCollapsedKey = new Object[3];
|
|
||||||
RecalDatum collapsedDatum;
|
|
||||||
|
|
||||||
// Create dataCollapsedReadGroup, the table where everything except read group has been collapsed
|
|
||||||
if (qualityScore >= PRESERVE_QSCORES_LESS_THAN) {
|
|
||||||
readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group
|
|
||||||
collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(errorModel).get(readGroupCollapsedKey);
|
|
||||||
if (collapsedDatum == null) {
|
|
||||||
dataCollapsedReadGroup.get(errorModel).put(new RecalDatum(fullDatum), readGroupCollapsedKey);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
collapsedDatum.combine(fullDatum); // using combine instead of increment in order to calculate overall aggregateQReported
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
|
|
||||||
qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ...
|
|
||||||
qualityScoreCollapsedKey[1] = key[1]; // and quality score
|
|
||||||
collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(errorModel).get(qualityScoreCollapsedKey);
|
|
||||||
if (collapsedDatum == null) {
|
|
||||||
dataCollapsedQualityScore.get(errorModel).put(new RecalDatum(fullDatum), qualityScoreCollapsedKey);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
collapsedDatum.increment(fullDatum);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
|
|
||||||
for (int iii = 0; iii < dataCollapsedByCovariate.get(errorModel).size(); iii++) {
|
|
||||||
covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ...
|
|
||||||
covariateCollapsedKey[1] = key[1]; // and quality score ...
|
|
||||||
final Object theCovariateElement = key[iii + 2]; // and the given covariate
|
|
||||||
if (theCovariateElement != null) {
|
|
||||||
covariateCollapsedKey[2] = theCovariateElement;
|
|
||||||
collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(errorModel).get(iii).get(covariateCollapsedKey);
|
|
||||||
if (collapsedDatum == null) {
|
|
||||||
dataCollapsedByCovariate.get(errorModel).get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
collapsedDatum.increment(fullDatum);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score
|
|
||||||
* that will be used in the sequential calculation in TableRecalibrationWalker
|
|
||||||
*
|
|
||||||
* @param smoothing The smoothing parameter that goes into empirical quality score calculation
|
|
||||||
* @param maxQual At which value to cap the quality scores
|
|
||||||
*/
|
|
||||||
public final void generateEmpiricalQualities(final int smoothing, final int maxQual) {
|
|
||||||
|
|
||||||
for( final BaseRecalibrationType errorModel : BaseRecalibrationType.values() ) {
|
|
||||||
recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.get(errorModel).data, smoothing, maxQual);
|
|
||||||
recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.get(errorModel).data, smoothing, maxQual);
|
|
||||||
for (NestedHashMap map : dataCollapsedByCovariate.get(errorModel)) {
|
|
||||||
recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual);
|
|
||||||
checkForSingletons(map.data);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void recursivelyGenerateEmpiricalQualities(final Map data, final int smoothing, final int maxQual) {
|
|
||||||
|
|
||||||
for (Object comp : data.keySet()) {
|
|
||||||
final Object val = data.get(comp);
|
|
||||||
if (val instanceof RecalDatum) { // We are at the end of the nested hash maps
|
|
||||||
((RecalDatum) val).calcCombinedEmpiricalQuality(smoothing, maxQual);
|
|
||||||
}
|
|
||||||
else { // Another layer in the nested hash map
|
|
||||||
recursivelyGenerateEmpiricalQualities((Map) val, smoothing, maxQual);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void checkForSingletons(final Map data) {
|
private void checkForSingletons(final Map data) {
|
||||||
// todo -- this looks like it's better just as a data.valueSet() call?
|
// todo -- this looks like it's better just as a data.valueSet() call?
|
||||||
|
|
@ -253,7 +157,7 @@ public class RecalDataManager {
|
||||||
* @param covariate Which covariate indexes the desired collapsed HashMap
|
* @param covariate Which covariate indexes the desired collapsed HashMap
|
||||||
* @return The desired collapsed HashMap
|
* @return The desired collapsed HashMap
|
||||||
*/
|
*/
|
||||||
public final NestedHashMap getCollapsedTable(final int covariate, final BaseRecalibrationType errorModel) {
|
public final NestedHashMap getCollapsedTable(final int covariate, final EventType errorModel) {
|
||||||
if (covariate == 0) {
|
if (covariate == 0) {
|
||||||
return dataCollapsedReadGroup.get(errorModel); // Table where everything except read group has been collapsed
|
return dataCollapsedReadGroup.get(errorModel); // Table where everything except read group has been collapsed
|
||||||
}
|
}
|
||||||
|
|
@ -551,6 +455,7 @@ public class RecalDataManager {
|
||||||
/**
|
/**
|
||||||
* Given the base and the color calculate the next base in the sequence
|
* Given the base and the color calculate the next base in the sequence
|
||||||
*
|
*
|
||||||
|
* @param read the read
|
||||||
* @param prevBase The base
|
* @param prevBase The base
|
||||||
* @param color The color
|
* @param color The color
|
||||||
* @return The next base in the sequence
|
* @return The next base in the sequence
|
||||||
|
|
@ -615,22 +520,23 @@ public class RecalDataManager {
|
||||||
* Computes all requested covariates for every offset in the given read
|
* Computes all requested covariates for every offset in the given read
|
||||||
* by calling covariate.getValues(..).
|
* by calling covariate.getValues(..).
|
||||||
*
|
*
|
||||||
|
* It populates an array of covariate values where result[i][j] is the covariate
|
||||||
|
* value for the ith position in the read and the jth covariate in
|
||||||
|
* requestedCovariates list.
|
||||||
|
*
|
||||||
* @param read The read for which to compute covariate values.
|
* @param read The read for which to compute covariate values.
|
||||||
* @param requestedCovariates The list of requested covariates.
|
* @param requestedCovariates The list of requested covariates.
|
||||||
* @return An array of covariate values where result[i][j] is the covariate
|
|
||||||
* value for the ith position in the read and the jth covariate in
|
|
||||||
* requestedCovariates list.
|
|
||||||
*/
|
*/
|
||||||
public static void computeCovariates(final GATKSAMRecord read, final List<Covariate> requestedCovariates) {
|
public static void computeCovariates(final GATKSAMRecord read, final List<Covariate> requestedCovariates) {
|
||||||
final int numRequestedCovariates = requestedCovariates.size();
|
final int numRequestedCovariates = requestedCovariates.size();
|
||||||
final int readLength = read.getReadLength();
|
final int readLength = read.getReadLength();
|
||||||
final CovariateKeySet covariateKeySet = new CovariateKeySet(readLength, numRequestedCovariates);
|
final ReadCovariates readCovariates = new ReadCovariates(readLength, numRequestedCovariates);
|
||||||
|
|
||||||
// Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
|
// Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
|
||||||
for (Covariate covariate : requestedCovariates)
|
for (Covariate covariate : requestedCovariates)
|
||||||
covariateKeySet.addCovariate(covariate.getValues(read));
|
readCovariates.addCovariate(covariate.getValues(read));
|
||||||
|
|
||||||
read.setTemporaryAttribute(COVARS_ATTRIBUTE, covariateKeySet);
|
read.setTemporaryAttribute(COVARS_ATTRIBUTE, readCovariates);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -94,7 +94,7 @@ public class RecalDatumOptimized {
|
||||||
public final double empiricalQualDouble(final int smoothing, final double maxQual) {
|
public final double empiricalQualDouble(final int smoothing, final double maxQual) {
|
||||||
final double doubleMismatches = (double) (numMismatches + smoothing);
|
final double doubleMismatches = (double) (numMismatches + smoothing);
|
||||||
final double doubleObservations = (double) (numObservations + smoothing);
|
final double doubleObservations = (double) (numObservations + smoothing);
|
||||||
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
|
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
|
||||||
return Math.min(empiricalQual, maxQual);
|
return Math.min(empiricalQual, maxQual);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -106,9 +106,10 @@ public class RecalDatumOptimized {
|
||||||
|
|
||||||
public final byte empiricalQualByte() {
|
public final byte empiricalQualByte() {
|
||||||
return empiricalQualByte(0); // 'default' behavior is to use smoothing value of zero
|
return empiricalQualByte(0); // 'default' behavior is to use smoothing value of zero
|
||||||
}
|
}
|
||||||
|
|
||||||
public final String outputToCSV() {
|
@Override
|
||||||
|
public final String toString() {
|
||||||
return String.format("%d,%d,%d", numObservations, numMismatches, (int) empiricalQualByte());
|
return String.format("%d,%d,%d", numObservations, numMismatches, (int) empiricalQualByte());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -27,10 +27,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
import org.broad.tribble.Feature;
|
import org.broad.tribble.Feature;
|
||||||
import org.broadinstitute.sting.commandline.*;
|
import org.broadinstitute.sting.commandline.*;
|
||||||
import org.broadinstitute.sting.gatk.walkers.recalibration.CountCovariatesGatherer;
|
|
||||||
|
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
|
@ -60,7 +58,7 @@ public class RecalibrationArgumentCollection {
|
||||||
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
||||||
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||||
*/
|
*/
|
||||||
@Gather(CountCovariatesGatherer.class)
|
@Gather(BQSRGatherer.class)
|
||||||
@Output
|
@Output
|
||||||
protected PrintStream RECAL_FILE;
|
protected PrintStream RECAL_FILE;
|
||||||
|
|
||||||
|
|
@ -92,16 +90,6 @@ public class RecalibrationArgumentCollection {
|
||||||
@Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
|
@Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
|
||||||
protected boolean RUN_WITHOUT_DBSNP = false;
|
protected boolean RUN_WITHOUT_DBSNP = false;
|
||||||
|
|
||||||
/////////////////////////////
|
|
||||||
// protected Member Variables
|
|
||||||
/////////////////////////////
|
|
||||||
protected final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap used to create collapsed data hashmaps (delta delta tables)
|
|
||||||
protected final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>();// A list to hold the covariate objects that were requested
|
|
||||||
|
|
||||||
protected final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped.
|
|
||||||
protected final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed.
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
|
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
|
||||||
* reads which have had the reference inserted because of color space inconsistencies.
|
* reads which have had the reference inserted because of color space inconsistencies.
|
||||||
|
|
@ -153,6 +141,10 @@ public class RecalibrationArgumentCollection {
|
||||||
@Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
|
@Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
|
||||||
public byte DELETIONS_DEFAULT_QUALITY = 45;
|
public byte DELETIONS_DEFAULT_QUALITY = 45;
|
||||||
|
|
||||||
|
@Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false)
|
||||||
|
public byte LOW_QUAL_TAIL = 2;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Hidden
|
@Hidden
|
||||||
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
|
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2011, The Broad Institute
|
* Copyright (c) 2012, The Broad Institute
|
||||||
*
|
*
|
||||||
* Permission is hereby granted, free of charge, to any person
|
* Permission is hereby granted, free of charge, to any person
|
||||||
* obtaining a copy of this software and associated documentation
|
* obtaining a copy of this software and associated documentation
|
||||||
|
|
@ -244,7 +244,8 @@ public class DiffEngine {
|
||||||
table.set(diff.getPath(), "NumberOfOccurrences", diff.getCount());
|
table.set(diff.getPath(), "NumberOfOccurrences", diff.getCount());
|
||||||
table.set(diff.getPath(), "ExampleDifference", diff.valueDiffString());
|
table.set(diff.getPath(), "ExampleDifference", diff.valueDiffString());
|
||||||
}
|
}
|
||||||
table.write(params.out);
|
GATKReport output = new GATKReport(table);
|
||||||
|
output.print(params.out);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static int longestCommonPostfix(String[] diffPath1, String[] diffPath2) {
|
protected static int longestCommonPostfix(String[] diffPath1, String[] diffPath2) {
|
||||||
|
|
|
||||||
|
|
@ -68,8 +68,8 @@ public class VCFDiffableReader implements DiffableReader {
|
||||||
VCFHeader header = (VCFHeader)vcfCodec.readHeader(lineReader);
|
VCFHeader header = (VCFHeader)vcfCodec.readHeader(lineReader);
|
||||||
for ( VCFHeaderLine headerLine : header.getMetaData() ) {
|
for ( VCFHeaderLine headerLine : header.getMetaData() ) {
|
||||||
String key = headerLine.getKey();
|
String key = headerLine.getKey();
|
||||||
if ( headerLine instanceof VCFNamedHeaderLine )
|
if ( headerLine instanceof VCFIDHeaderLine)
|
||||||
key += "_" + ((VCFNamedHeaderLine) headerLine).getName();
|
key += "_" + ((VCFIDHeaderLine) headerLine).getID();
|
||||||
if ( root.hasElement(key) )
|
if ( root.hasElement(key) )
|
||||||
logger.warn("Skipping duplicate header line: file=" + file + " line=" + headerLine.toString());
|
logger.warn("Skipping duplicate header line: file=" + file + " line=" + headerLine.toString());
|
||||||
else
|
else
|
||||||
|
|
|
||||||
|
|
@ -64,7 +64,6 @@ public enum DiploidGenotype {
|
||||||
return r != base2;
|
return r != base2;
|
||||||
else
|
else
|
||||||
return base2 == r;
|
return base2 == r;
|
||||||
//return MathUtils.countOccurrences(r, this.toString()) == 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isHom() {
|
public boolean isHom() {
|
||||||
|
|
|
||||||
|
|
@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||||
|
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
|
|
@ -61,7 +60,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
}
|
}
|
||||||
|
|
||||||
//linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
|
//linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
|
||||||
linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result, false);
|
linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result);
|
||||||
|
|
||||||
return alleles;
|
return alleles;
|
||||||
}
|
}
|
||||||
|
|
@ -85,21 +84,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
for ( int i = 0; i < numOriginalAltAlleles; i++ )
|
for ( int i = 0; i < numOriginalAltAlleles; i++ )
|
||||||
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
|
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
|
||||||
|
|
||||||
// make sure that we've cached enough data
|
|
||||||
if ( numOriginalAltAlleles > UnifiedGenotyperEngine.PLIndexToAlleleIndex.length - 1 )
|
|
||||||
UnifiedGenotyperEngine.calculatePLcache(numOriginalAltAlleles);
|
|
||||||
|
|
||||||
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
|
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
|
||||||
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
|
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
|
||||||
for ( final double[] likelihoods : GLs ) {
|
for ( final double[] likelihoods : GLs ) {
|
||||||
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
|
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
|
||||||
if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) {
|
if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) {
|
||||||
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numOriginalAltAlleles][PLindexOfBestGL];
|
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL);
|
||||||
if ( alleles[0] != 0 )
|
if ( alleles.alleleIndex1 != 0 )
|
||||||
likelihoodSums[alleles[0]-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
||||||
// don't double-count it
|
// don't double-count it
|
||||||
if ( alleles[1] != 0 && alleles[1] != alleles[0] )
|
if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 )
|
||||||
likelihoodSums[alleles[1]-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -189,24 +184,21 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
// the column of the matrix
|
// the column of the matrix
|
||||||
final double[] log10Likelihoods;
|
final double[] log10Likelihoods;
|
||||||
|
|
||||||
// mapping of column index for those columns upon which this one depends to the index into the PLs which is used as the transition to this column;
|
int sum = -1;
|
||||||
// for example, in the biallelic case, the transition from k=0 to k=1 would be AB while the transition to k=2 would be BB.
|
|
||||||
final HashMap<ExactACcounts, Integer> ACsetIndexToPLIndex = new HashMap<ExactACcounts, Integer>();
|
|
||||||
|
|
||||||
// to minimize memory consumption, we know we can delete any sets in this list because no further sets will depend on them
|
|
||||||
final ArrayList<ExactACcounts> dependentACsetsToDelete = new ArrayList<ExactACcounts>();
|
|
||||||
|
|
||||||
|
|
||||||
public ExactACset(final int size, final ExactACcounts ACcounts) {
|
public ExactACset(final int size, final ExactACcounts ACcounts) {
|
||||||
this.ACcounts = ACcounts;
|
this.ACcounts = ACcounts;
|
||||||
log10Likelihoods = new double[size];
|
log10Likelihoods = new double[size];
|
||||||
|
Arrays.fill(log10Likelihoods, Double.NEGATIVE_INFINITY);
|
||||||
}
|
}
|
||||||
|
|
||||||
// sum of all the non-reference alleles
|
// sum of all the non-reference alleles
|
||||||
public int getACsum() {
|
public int getACsum() {
|
||||||
int sum = 0;
|
if ( sum == -1 ) {
|
||||||
for ( int count : ACcounts.getCounts() )
|
sum = 0;
|
||||||
sum += count;
|
for ( int count : ACcounts.getCounts() )
|
||||||
|
sum += count;
|
||||||
|
}
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -215,15 +207,21 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO -- remove me
|
||||||
public static void linearExactMultiAllelic(final GenotypesContext GLs,
|
public static void linearExactMultiAllelic(final GenotypesContext GLs,
|
||||||
final int numAlternateAlleles,
|
final int numAlternateAlleles,
|
||||||
final double[][] log10AlleleFrequencyPriors,
|
final double[][] log10AlleleFrequencyPriors,
|
||||||
final AlleleFrequencyCalculationResult result,
|
final AlleleFrequencyCalculationResult result,
|
||||||
final boolean preserveData) {
|
final boolean foo) {
|
||||||
|
linearExactMultiAllelic(GLs, numAlternateAlleles, log10AlleleFrequencyPriors, result);
|
||||||
|
}
|
||||||
|
|
||||||
// make sure the PL cache has been initialized
|
|
||||||
if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null )
|
|
||||||
UnifiedGenotyperEngine.calculatePLcache(5);
|
public static void linearExactMultiAllelic(final GenotypesContext GLs,
|
||||||
|
final int numAlternateAlleles,
|
||||||
|
final double[][] log10AlleleFrequencyPriors,
|
||||||
|
final AlleleFrequencyCalculationResult result) {
|
||||||
|
|
||||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
||||||
final int numSamples = genotypeLikelihoods.size()-1;
|
final int numSamples = genotypeLikelihoods.size()-1;
|
||||||
|
|
@ -241,21 +239,20 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
ACqueue.add(zeroSet);
|
ACqueue.add(zeroSet);
|
||||||
indexesToACset.put(zeroSet.ACcounts, zeroSet);
|
indexesToACset.put(zeroSet.ACcounts, zeroSet);
|
||||||
|
|
||||||
// optimization: create the temporary storage for computing L(j,k) just once
|
|
||||||
final int maxPossibleDependencies = numAlternateAlleles + (numAlternateAlleles * (numAlternateAlleles + 1) / 2) + 1;
|
|
||||||
final double[][] tempLog10ConformationLikelihoods = new double[numSamples+1][maxPossibleDependencies];
|
|
||||||
for ( int i = 0; i < maxPossibleDependencies; i++ )
|
|
||||||
tempLog10ConformationLikelihoods[0][i] = Double.NEGATIVE_INFINITY;
|
|
||||||
|
|
||||||
// keep processing while we have AC conformations that need to be calculated
|
// keep processing while we have AC conformations that need to be calculated
|
||||||
double maxLog10L = Double.NEGATIVE_INFINITY;
|
double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||||
while ( !ACqueue.isEmpty() ) {
|
while ( !ACqueue.isEmpty() ) {
|
||||||
// compute log10Likelihoods
|
// compute log10Likelihoods
|
||||||
final ExactACset set = ACqueue.remove();
|
final ExactACset set = ACqueue.remove();
|
||||||
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result, tempLog10ConformationLikelihoods);
|
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result);
|
||||||
|
|
||||||
// adjust max likelihood seen if needed
|
// adjust max likelihood seen if needed
|
||||||
maxLog10L = Math.max(maxLog10L, log10LofKs);
|
maxLog10L = Math.max(maxLog10L, log10LofKs);
|
||||||
|
|
||||||
|
// clean up memory
|
||||||
|
indexesToACset.remove(set.ACcounts);
|
||||||
|
//if ( DEBUG )
|
||||||
|
// System.out.printf(" *** removing used set=%s%n", set.ACcounts);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -273,27 +270,16 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
final ArrayList<double[]> genotypeLikelihoods,
|
final ArrayList<double[]> genotypeLikelihoods,
|
||||||
final double maxLog10L,
|
final double maxLog10L,
|
||||||
final int numChr,
|
final int numChr,
|
||||||
final boolean preserveData,
|
|
||||||
final LinkedList<ExactACset> ACqueue,
|
final LinkedList<ExactACset> ACqueue,
|
||||||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||||
final double[][] log10AlleleFrequencyPriors,
|
final double[][] log10AlleleFrequencyPriors,
|
||||||
final AlleleFrequencyCalculationResult result,
|
final AlleleFrequencyCalculationResult result) {
|
||||||
final double[][] tempLog10ConformationLikelihoods) {
|
|
||||||
|
|
||||||
//if ( DEBUG )
|
//if ( DEBUG )
|
||||||
// System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
|
// System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
|
||||||
|
|
||||||
// compute the log10Likelihoods
|
// compute the log10Likelihoods
|
||||||
computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, result, tempLog10ConformationLikelihoods);
|
computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, result);
|
||||||
|
|
||||||
// clean up memory
|
|
||||||
if ( !preserveData ) {
|
|
||||||
for ( ExactACcounts index : set.dependentACsetsToDelete ) {
|
|
||||||
indexesToACset.remove(index);
|
|
||||||
//if ( DEBUG )
|
|
||||||
// System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
|
final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
|
||||||
|
|
||||||
|
|
@ -301,11 +287,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||||
//if ( DEBUG )
|
//if ( DEBUG )
|
||||||
// System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
|
// System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
|
||||||
|
|
||||||
// no reason to keep this data around because nothing depends on it
|
|
||||||
if ( !preserveData )
|
|
||||||
indexesToACset.remove(set.ACcounts);
|
|
||||||
|
|
||||||
return log10LofK;
|
return log10LofK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -316,15 +297,13 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
|
|
||||||
final int numAltAlleles = set.ACcounts.getCounts().length;
|
final int numAltAlleles = set.ACcounts.getCounts().length;
|
||||||
|
|
||||||
// genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods.
|
|
||||||
// so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD.
|
|
||||||
|
|
||||||
// add conformations for the k+1 case
|
// add conformations for the k+1 case
|
||||||
int PLindex = 0;
|
|
||||||
for ( int allele = 0; allele < numAltAlleles; allele++ ) {
|
for ( int allele = 0; allele < numAltAlleles; allele++ ) {
|
||||||
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
||||||
ACcountsClone[allele]++;
|
ACcountsClone[allele]++;
|
||||||
updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset);
|
// to get to this conformation, a sample would need to be AB (remember that ref=0)
|
||||||
|
final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1);
|
||||||
|
updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||||
}
|
}
|
||||||
|
|
||||||
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
|
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
|
||||||
|
|
@ -338,71 +317,51 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
ACcountsClone[allele_i]++;
|
ACcountsClone[allele_i]++;
|
||||||
ACcountsClone[allele_j]++;
|
ACcountsClone[allele_j]++;
|
||||||
|
|
||||||
|
// to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index)
|
||||||
|
final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1);
|
||||||
if ( allele_i == allele_j )
|
if ( allele_i == allele_j )
|
||||||
sameAlleles.add(new DependentSet(ACcountsClone, ++PLindex));
|
sameAlleles.add(new DependentSet(ACcountsClone, PLindex));
|
||||||
else
|
else
|
||||||
differentAlleles.add(new DependentSet(ACcountsClone, ++PLindex));
|
differentAlleles.add(new DependentSet(ACcountsClone, PLindex));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
|
// IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
|
||||||
for ( DependentSet dependent : differentAlleles )
|
for ( DependentSet dependent : differentAlleles )
|
||||||
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset);
|
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||||
for ( DependentSet dependent : sameAlleles )
|
for ( DependentSet dependent : sameAlleles )
|
||||||
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset);
|
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||||
}
|
|
||||||
|
|
||||||
// determine which is the last dependent set in the queue (not necessarily the last one added above) so we can know when it is safe to clean up this column
|
|
||||||
if ( !preserveData ) {
|
|
||||||
final ExactACset lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue);
|
|
||||||
if ( lastSet != null )
|
|
||||||
lastSet.dependentACsetsToDelete.add(set.ACcounts);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return log10LofK;
|
return log10LofK;
|
||||||
}
|
}
|
||||||
|
|
||||||
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
|
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
|
||||||
// also adds it as a dependency to the given callingSetIndex.
|
// also pushes its value to the given callingSetIndex.
|
||||||
// returns the ExactACset if that set was not already in the queue and null otherwise.
|
private static void updateACset(final int[] newSetCounts,
|
||||||
private static void updateACset(final int[] ACcounts,
|
|
||||||
final int numChr,
|
final int numChr,
|
||||||
final ExactACset callingSet,
|
final ExactACset dependentSet,
|
||||||
final int PLsetIndex,
|
final int PLsetIndex,
|
||||||
final Queue<ExactACset> ACqueue,
|
final Queue<ExactACset> ACqueue,
|
||||||
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
|
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||||
final ExactACcounts index = new ExactACcounts(ACcounts);
|
final ArrayList<double[]> genotypeLikelihoods) {
|
||||||
|
final ExactACcounts index = new ExactACcounts(newSetCounts);
|
||||||
if ( !indexesToACset.containsKey(index) ) {
|
if ( !indexesToACset.containsKey(index) ) {
|
||||||
ExactACset set = new ExactACset(numChr/2 +1, index);
|
ExactACset set = new ExactACset(numChr/2 +1, index);
|
||||||
indexesToACset.put(index, set);
|
indexesToACset.put(index, set);
|
||||||
ACqueue.add(set);
|
ACqueue.add(set);
|
||||||
}
|
}
|
||||||
|
|
||||||
// add the given dependency to the set
|
// push data from the dependency to the new set
|
||||||
//if ( DEBUG )
|
//if ( DEBUG )
|
||||||
// System.out.println(" *** adding dependency from " + index + " to " + callingSet.ACcounts);
|
// System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts);
|
||||||
final ExactACset set = indexesToACset.get(index);
|
pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods);
|
||||||
set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final LinkedList<ExactACset> ACqueue) {
|
|
||||||
Iterator<ExactACset> reverseIterator = ACqueue.descendingIterator();
|
|
||||||
while ( reverseIterator.hasNext() ) {
|
|
||||||
final ExactACset queued = reverseIterator.next();
|
|
||||||
if ( queued.ACsetIndexToPLIndex.containsKey(callingSetIndex) )
|
|
||||||
return queued;
|
|
||||||
}
|
|
||||||
|
|
||||||
// shouldn't get here
|
|
||||||
throw new ReviewedStingException("Error: no sets in the queue currently hold " + callingSetIndex + " as a dependent!");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void computeLofK(final ExactACset set,
|
private static void computeLofK(final ExactACset set,
|
||||||
final ArrayList<double[]> genotypeLikelihoods,
|
final ArrayList<double[]> genotypeLikelihoods,
|
||||||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
|
||||||
final double[][] log10AlleleFrequencyPriors,
|
final double[][] log10AlleleFrequencyPriors,
|
||||||
final AlleleFrequencyCalculationResult result,
|
final AlleleFrequencyCalculationResult result) {
|
||||||
final double[][] tempLog10ConformationLikelihoods) {
|
|
||||||
|
|
||||||
set.log10Likelihoods[0] = 0.0; // the zero case
|
set.log10Likelihoods[0] = 0.0; // the zero case
|
||||||
final int totalK = set.getACsum();
|
final int totalK = set.getACsum();
|
||||||
|
|
@ -414,42 +373,18 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
}
|
}
|
||||||
// k > 0 for at least one k
|
// k > 0 for at least one k
|
||||||
else {
|
else {
|
||||||
// deal with the non-AA possible conformations
|
// the non-AA possible conformations were dealt with by pushes from dependent sets;
|
||||||
int conformationIndex = 1;
|
// now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
|
||||||
for ( Map.Entry<ExactACcounts, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() ) {
|
|
||||||
//if ( DEBUG )
|
|
||||||
// System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey());
|
|
||||||
|
|
||||||
ExactACset dependent = indexesToACset.get(mapping.getKey());
|
|
||||||
|
|
||||||
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
|
||||||
|
|
||||||
if ( totalK <= 2*j ) { // skip impossible conformations
|
|
||||||
final double[] gl = genotypeLikelihoods.get(j);
|
|
||||||
tempLog10ConformationLikelihoods[j][conformationIndex] =
|
|
||||||
determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + dependent.log10Likelihoods[j-1] + gl[mapping.getValue()];
|
|
||||||
} else {
|
|
||||||
tempLog10ConformationLikelihoods[j][conformationIndex] = Double.NEGATIVE_INFINITY;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
conformationIndex++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// finally, deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
|
|
||||||
final int numPaths = set.ACsetIndexToPLIndex.size() + 1;
|
|
||||||
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
||||||
|
|
||||||
if ( totalK < 2*j-1 ) {
|
if ( totalK < 2*j-1 ) {
|
||||||
final double[] gl = genotypeLikelihoods.get(j);
|
final double[] gl = genotypeLikelihoods.get(j);
|
||||||
tempLog10ConformationLikelihoods[j][0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
|
final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
|
||||||
} else {
|
set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue);
|
||||||
tempLog10ConformationLikelihoods[j][0] = Double.NEGATIVE_INFINITY;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||||
final double log10Max = MathUtils.approximateLog10SumLog10(tempLog10ConformationLikelihoods[j], numPaths);
|
set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator;
|
||||||
set.log10Likelihoods[j] = log10Max - logDenominator;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -478,6 +413,23 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void pushData(final ExactACset targetSet,
|
||||||
|
final ExactACset dependentSet,
|
||||||
|
final int PLsetIndex,
|
||||||
|
final ArrayList<double[]> genotypeLikelihoods) {
|
||||||
|
final int totalK = targetSet.getACsum();
|
||||||
|
|
||||||
|
for ( int j = 1; j < targetSet.log10Likelihoods.length; j++ ) {
|
||||||
|
|
||||||
|
if ( totalK <= 2*j ) { // skip impossible conformations
|
||||||
|
final double[] gl = genotypeLikelihoods.get(j);
|
||||||
|
final double conformationValue =
|
||||||
|
determineCoefficient(PLsetIndex, j, targetSet.ACcounts.getCounts(), totalK) + dependentSet.log10Likelihoods[j-1] + gl[PLsetIndex];
|
||||||
|
targetSet.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetSet.log10Likelihoods[j], conformationValue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
|
private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
|
||||||
|
|
||||||
// the closed form representation generalized for multiple alleles is as follows:
|
// the closed form representation generalized for multiple alleles is as follows:
|
||||||
|
|
@ -488,25 +440,26 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
// BC: 2 * k_b * k_c
|
// BC: 2 * k_b * k_c
|
||||||
// CC: k_c * (k_c - 1)
|
// CC: k_c * (k_c - 1)
|
||||||
|
|
||||||
final int numAltAlleles = ACcounts.length;
|
// find the 2 alleles that are represented by this PL index
|
||||||
|
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
|
||||||
|
|
||||||
|
// *** note that throughout this method we subtract one from the alleleIndex because ACcounts ***
|
||||||
|
// *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. ***
|
||||||
|
|
||||||
// the AX het case
|
// the AX het case
|
||||||
if ( PLindex <= numAltAlleles )
|
if ( alleles.alleleIndex1 == 0 )
|
||||||
return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK];
|
return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK];
|
||||||
|
|
||||||
// find the 2 alternate alleles that are represented by this PL index
|
final int k_i = ACcounts[alleles.alleleIndex1-1];
|
||||||
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numAltAlleles][PLindex];
|
|
||||||
|
|
||||||
final int k_i = ACcounts[alleles[0]-1]; // subtract one because ACcounts doesn't consider the reference allele
|
|
||||||
|
|
||||||
// the hom var case (e.g. BB, CC, DD)
|
// the hom var case (e.g. BB, CC, DD)
|
||||||
final double coeff;
|
final double coeff;
|
||||||
if ( alleles[0] == alleles[1] ) {
|
if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) {
|
||||||
coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
|
coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
|
||||||
}
|
}
|
||||||
// the het non-ref case (e.g. BC, BD, CD)
|
// the het non-ref case (e.g. BC, BD, CD)
|
||||||
else {
|
else {
|
||||||
final int k_j = ACcounts[alleles[1]-1];
|
final int k_j = ACcounts[alleles.alleleIndex2-1];
|
||||||
coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
|
coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -240,6 +240,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
||||||
refAllele = Allele.create(refBases, true);
|
refAllele = Allele.create(refBases, true);
|
||||||
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
|
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
|
||||||
}
|
}
|
||||||
|
else continue; // don't go on with this allele if refBases are non-standard
|
||||||
} else {
|
} else {
|
||||||
// insertion case
|
// insertion case
|
||||||
if (Allele.acceptableAlleleBases(s)) {
|
if (Allele.acceptableAlleleBases(s)) {
|
||||||
|
|
@ -247,6 +248,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
||||||
altAllele = Allele.create(s, false);
|
altAllele = Allele.create(s, false);
|
||||||
stop = loc.getStart();
|
stop = loc.getStart();
|
||||||
}
|
}
|
||||||
|
else continue; // go on to next allele if consensus insertion has any non-standard base.
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -30,7 +30,10 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.utils.*;
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
|
@ -53,10 +56,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
||||||
protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
||||||
super(UAC, logger);
|
super(UAC, logger);
|
||||||
useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
|
useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
|
||||||
|
|
||||||
// make sure the PL cache has been initialized with enough alleles
|
|
||||||
if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null || UnifiedGenotyperEngine.PLIndexToAlleleIndex.length < 4 ) // +1 for 0 alt alleles
|
|
||||||
UnifiedGenotyperEngine.calculatePLcache(3);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
|
public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
|
||||||
|
|
@ -133,6 +132,16 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
||||||
}
|
}
|
||||||
builder.alleles(alleles);
|
builder.alleles(alleles);
|
||||||
|
|
||||||
|
// create the PL ordering to use based on the allele ordering.
|
||||||
|
final int[] PLordering = new int[numLikelihoods];
|
||||||
|
for ( int i = 0; i <= numAltAlleles; i++ ) {
|
||||||
|
for ( int j = i; j <= numAltAlleles; j++ ) {
|
||||||
|
// As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j.
|
||||||
|
// In other words, for biallelic sites the ordering is: AA,AB,BB; for triallelic sites the ordering is: AA,AB,BB,AC,BC,CC, etc."
|
||||||
|
PLordering[(j * (j+1) / 2) + i] = DiploidGenotype.createDiploidGenotype(alleleOrdering[i], alleleOrdering[j]).ordinal();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// create the genotypes; no-call everyone for now
|
// create the genotypes; no-call everyone for now
|
||||||
final GenotypesContext genotypes = GenotypesContext.create();
|
final GenotypesContext genotypes = GenotypesContext.create();
|
||||||
final List<Allele> noCall = new ArrayList<Allele>();
|
final List<Allele> noCall = new ArrayList<Allele>();
|
||||||
|
|
@ -142,12 +151,8 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
||||||
final double[] allLikelihoods = sampleData.GL.getLikelihoods();
|
final double[] allLikelihoods = sampleData.GL.getLikelihoods();
|
||||||
final double[] myLikelihoods = new double[numLikelihoods];
|
final double[] myLikelihoods = new double[numLikelihoods];
|
||||||
|
|
||||||
int myLikelihoodsIndex = 0;
|
for ( int i = 0; i < numLikelihoods; i++ )
|
||||||
for ( int i = 0; i <= numAltAlleles; i++ ) {
|
myLikelihoods[i] = allLikelihoods[PLordering[i]];
|
||||||
for ( int j = i; j <= numAltAlleles; j++ ) {
|
|
||||||
myLikelihoods[myLikelihoodsIndex++] = allLikelihoods[DiploidGenotype.createDiploidGenotype(alleleOrdering[i], alleleOrdering[j]).ordinal()];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// normalize in log space so that max element is zero.
|
// normalize in log space so that max element is zero.
|
||||||
final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true));
|
final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true));
|
||||||
|
|
@ -174,12 +179,12 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
||||||
final double[] likelihoods = sampleData.GL.getLikelihoods();
|
final double[] likelihoods = sampleData.GL.getLikelihoods();
|
||||||
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
|
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
|
||||||
if ( PLindexOfBestGL != PLindexOfRef ) {
|
if ( PLindexOfBestGL != PLindexOfRef ) {
|
||||||
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[3][PLindexOfBestGL];
|
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePairUsingDeprecatedOrdering(PLindexOfBestGL);
|
||||||
if ( alleles[0] != baseIndexOfRef )
|
if ( alleles.alleleIndex1 != baseIndexOfRef )
|
||||||
likelihoodSums[alleles[0]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
|
likelihoodSums[alleles.alleleIndex1] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
|
||||||
// don't double-count it
|
// don't double-count it
|
||||||
if ( alleles[1] != baseIndexOfRef && alleles[1] != alleles[0] )
|
if ( alleles.alleleIndex2 != baseIndexOfRef && alleles.alleleIndex2 != alleles.alleleIndex1 )
|
||||||
likelihoodSums[alleles[1]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
|
likelihoodSums[alleles.alleleIndex2] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -203,7 +208,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
||||||
|
|
||||||
public class BAQedPileupElement extends PileupElement {
|
public class BAQedPileupElement extends PileupElement {
|
||||||
public BAQedPileupElement( final PileupElement PE ) {
|
public BAQedPileupElement( final PileupElement PE ) {
|
||||||
super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletion(), PE.isBeforeInsertion(), PE.isNextToSoftClip());
|
super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletion(), PE.isAfterDeletion(), PE.isBeforeInsertion(), PE.isAfterInsertion(), PE.isNextToSoftClip());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
||||||
|
|
@ -104,10 +104,6 @@ public class UnifiedGenotyperEngine {
|
||||||
private final GenomeLocParser genomeLocParser;
|
private final GenomeLocParser genomeLocParser;
|
||||||
private final boolean BAQEnabledOnCMDLine;
|
private final boolean BAQEnabledOnCMDLine;
|
||||||
|
|
||||||
// a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles
|
|
||||||
// the representation is int[number of alternate alleles][PL index][pair of allele indexes (where reference = 0)]
|
|
||||||
protected static int[][][] PLIndexToAlleleIndex;
|
|
||||||
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
|
|
@ -140,27 +136,6 @@ public class UnifiedGenotyperEngine {
|
||||||
genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL);
|
genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL);
|
||||||
|
|
||||||
filter.add(LOW_QUAL_FILTER_NAME);
|
filter.add(LOW_QUAL_FILTER_NAME);
|
||||||
calculatePLcache(UAC.MAX_ALTERNATE_ALLELES);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected static void calculatePLcache(int maxAltAlleles) {
|
|
||||||
PLIndexToAlleleIndex = new int[maxAltAlleles+1][][];
|
|
||||||
PLIndexToAlleleIndex[0] = new int[][]{ new int[]{0, 0} };
|
|
||||||
int numLikelihoods = 1;
|
|
||||||
|
|
||||||
// for each count of alternate alleles
|
|
||||||
for ( int altAlleles = 1; altAlleles <= maxAltAlleles; altAlleles++ ) {
|
|
||||||
numLikelihoods += altAlleles + 1;
|
|
||||||
PLIndexToAlleleIndex[altAlleles] = new int[numLikelihoods][];
|
|
||||||
int PLindex = 0;
|
|
||||||
|
|
||||||
// for all possible combinations of the 2 alt alleles
|
|
||||||
for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) {
|
|
||||||
for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) {
|
|
||||||
PLIndexToAlleleIndex[altAlleles][PLindex++] = new int[]{ allele1, allele2 };
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -794,21 +769,17 @@ public class UnifiedGenotyperEngine {
|
||||||
if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) {
|
if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) {
|
||||||
likelihoodIndexesToUse = new ArrayList<Integer>(30);
|
likelihoodIndexesToUse = new ArrayList<Integer>(30);
|
||||||
|
|
||||||
// make sure that we've cached enough data
|
|
||||||
if ( numOriginalAltAlleles > PLIndexToAlleleIndex.length - 1 )
|
|
||||||
calculatePLcache(numOriginalAltAlleles);
|
|
||||||
final int[][] PLcache = PLIndexToAlleleIndex[numOriginalAltAlleles];
|
|
||||||
|
|
||||||
final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles];
|
final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles];
|
||||||
for ( int i = 0; i < numOriginalAltAlleles; i++ ) {
|
for ( int i = 0; i < numOriginalAltAlleles; i++ ) {
|
||||||
if ( allelesToUse.contains(vc.getAlternateAllele(i)) )
|
if ( allelesToUse.contains(vc.getAlternateAllele(i)) )
|
||||||
altAlleleIndexToUse[i] = true;
|
altAlleleIndexToUse[i] = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
for ( int PLindex = 0; PLindex < PLcache.length; PLindex++ ) {
|
final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(numOriginalAltAlleles);
|
||||||
final int[] alleles = PLcache[PLindex];
|
for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) {
|
||||||
|
final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
|
||||||
// consider this entry only if both of the alleles are good
|
// consider this entry only if both of the alleles are good
|
||||||
if ( (alleles[0] == 0 || altAlleleIndexToUse[alleles[0] - 1]) && (alleles[1] == 0 || altAlleleIndexToUse[alleles[1] - 1]) )
|
if ( (alleles.alleleIndex1 == 0 || altAlleleIndexToUse[alleles.alleleIndex1 - 1]) && (alleles.alleleIndex2 == 0 || altAlleleIndexToUse[alleles.alleleIndex2 - 1]) )
|
||||||
likelihoodIndexesToUse.add(PLindex);
|
likelihoodIndexesToUse.add(PLindex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -861,11 +832,11 @@ public class UnifiedGenotyperEngine {
|
||||||
protected static Genotype assignGenotype(final Genotype originalGT, final double[] newLikelihoods, final List<Allele> allelesToUse, final int numNewAltAlleles, final Map<String, Object> attrs) {
|
protected static Genotype assignGenotype(final Genotype originalGT, final double[] newLikelihoods, final List<Allele> allelesToUse, final int numNewAltAlleles, final Map<String, Object> attrs) {
|
||||||
// find the genotype with maximum likelihoods
|
// find the genotype with maximum likelihoods
|
||||||
int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);
|
int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);
|
||||||
int[] alleles = PLIndexToAlleleIndex[numNewAltAlleles][PLindex];
|
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
|
||||||
|
|
||||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||||
myAlleles.add(allelesToUse.get(alleles[0]));
|
myAlleles.add(allelesToUse.get(alleles.alleleIndex1));
|
||||||
myAlleles.add(allelesToUse.get(alleles[1]));
|
myAlleles.add(allelesToUse.get(alleles.alleleIndex2));
|
||||||
|
|
||||||
final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods);
|
final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods);
|
||||||
return new Genotype(originalGT.getSampleName(), myAlleles, qual, null, attrs, false);
|
return new Genotype(originalGT.getSampleName(), myAlleles, qual, null, attrs, false);
|
||||||
|
|
|
||||||
|
|
@ -243,6 +243,19 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.BY_TYPE) {
|
if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.BY_TYPE) {
|
||||||
Map<VariantContext.Type, List<VariantContext>> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs);
|
Map<VariantContext.Type, List<VariantContext>> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs);
|
||||||
|
|
||||||
|
// TODO -- clean this up in a refactoring
|
||||||
|
// merge NO_VARIATION into another type of variant (based on the ordering in VariantContext.Type)
|
||||||
|
if ( VCsByType.containsKey(VariantContext.Type.NO_VARIATION) && VCsByType.size() > 1 ) {
|
||||||
|
final List<VariantContext> refs = VCsByType.remove(VariantContext.Type.NO_VARIATION);
|
||||||
|
for ( VariantContext.Type type : VariantContext.Type.values() ) {
|
||||||
|
if ( VCsByType.containsKey(type) ) {
|
||||||
|
VCsByType.get(type).addAll(refs);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// iterate over the types so that it's deterministic
|
// iterate over the types so that it's deterministic
|
||||||
for (VariantContext.Type type : VariantContext.Type.values()) {
|
for (VariantContext.Type type : VariantContext.Type.values()) {
|
||||||
if (VCsByType.containsKey(type))
|
if (VCsByType.containsKey(type))
|
||||||
|
|
|
||||||
|
|
@ -216,12 +216,12 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
||||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variants.getName())));
|
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variants.getName())));
|
||||||
//hInfo.add(new VCFHeaderLine("source", "VariantsToVCF"));
|
//hInfo.add(new VCFHeaderLine("source", "VariantsToVCF"));
|
||||||
//hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
|
//hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getID()));
|
||||||
|
|
||||||
allowedGenotypeFormatStrings.add(VCFConstants.GENOTYPE_KEY);
|
allowedGenotypeFormatStrings.add(VCFConstants.GENOTYPE_KEY);
|
||||||
for ( VCFHeaderLine field : hInfo ) {
|
for ( VCFHeaderLine field : hInfo ) {
|
||||||
if ( field instanceof VCFFormatHeaderLine) {
|
if ( field instanceof VCFFormatHeaderLine) {
|
||||||
allowedGenotypeFormatStrings.add(((VCFFormatHeaderLine)field).getName());
|
allowedGenotypeFormatStrings.add(((VCFFormatHeaderLine)field).getID());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,284 @@
|
||||||
|
package org.broadinstitute.sting.utils;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.ObjectOutputStream;
|
||||||
|
import java.util.BitSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utilities for bitset conversion
|
||||||
|
*
|
||||||
|
* @author Mauricio Carneiro
|
||||||
|
* @since 3/5/12
|
||||||
|
*/
|
||||||
|
public class BitSetUtils {
|
||||||
|
|
||||||
|
static final private int MAX_DNA_CONTEXT = 31; // the maximum context size (number of bases) permitted in the "long bitset" implementation of the DNA <=> BitSet conversion.
|
||||||
|
static final private byte NBITS_LONG_REPRESENTATION = 64; // the number of bits used in the long version to represent the bit set (necessary for the two's complement representation of negative numbers)
|
||||||
|
static final private byte NBITS_SHORT_REPRESENTATION = 16; // the number of bits used in the short version to represent the bit set (necessary for the two's complement representation of negative numbers)
|
||||||
|
static final long[] combinationsPerLength = new long[MAX_DNA_CONTEXT + 1]; // keeps the memoized table with the number of combinations for each given DNA context length
|
||||||
|
|
||||||
|
/**
|
||||||
|
 * Creates a long out of a bitset
|
||||||
|
*
|
||||||
|
* @param bitSet the bitset
|
||||||
|
* @return a long from the bitset representation
|
||||||
|
*/
|
||||||
|
public static long longFrom(final BitSet bitSet) {
|
||||||
|
return longFrom(bitSet, NBITS_LONG_REPRESENTATION);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a short integer from a bitset
|
||||||
|
*
|
||||||
|
* @param bitSet the bitset
|
||||||
|
* @return a short from the bitset representation
|
||||||
|
*/
|
||||||
|
public static short shortFrom(final BitSet bitSet) {
|
||||||
|
return (short) longFrom(bitSet, NBITS_SHORT_REPRESENTATION);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
 * Creates an integer with any number of bits (up to 64 -- long precision) from a bitset
|
||||||
|
*
|
||||||
|
* @param bitSet the bitset
|
||||||
|
* @param nBits the number of bits to be used for this representation
|
||||||
|
* @return an integer with nBits from the bitset representation
|
||||||
|
*/
|
||||||
|
public static long longFrom(final BitSet bitSet, final int nBits) {
|
||||||
|
long number = 0;
|
||||||
|
for (int bitIndex = bitSet.nextSetBit(0); bitIndex >= 0 && bitIndex <= nBits; bitIndex = bitSet.nextSetBit(bitIndex + 1))
|
||||||
|
number |= 1L << bitIndex;
|
||||||
|
|
||||||
|
return number;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a BitSet representation of a given long
|
||||||
|
*
|
||||||
|
* @param number the number to turn into a bitset
|
||||||
|
* @return a bitset representation of the long
|
||||||
|
*/
|
||||||
|
public static BitSet bitSetFrom(long number) {
|
||||||
|
return bitSetFrom(number, NBITS_LONG_REPRESENTATION);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a BitSet representation of a given short
|
||||||
|
*
|
||||||
|
* @param number the number to turn into a bitset
|
||||||
|
* @return a bitset representation of the short
|
||||||
|
*/
|
||||||
|
public static BitSet bitSetFrom(short number) {
|
||||||
|
return bitSetFrom(number, NBITS_SHORT_REPRESENTATION);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a BitSet representation of an arbitrary integer (number of bits capped at 64 -- long precision)
|
||||||
|
*
|
||||||
|
* @param number the number to turn into a bitset
|
||||||
|
* @param nBits the number of bits to use as precision for this conversion
|
||||||
|
* @return a bitset representation of the integer
|
||||||
|
*/
|
||||||
|
public static BitSet bitSetFrom(long number, int nBits) {
|
||||||
|
BitSet bitSet = new BitSet();
|
||||||
|
boolean isNegative = number < 0;
|
||||||
|
int bitIndex = 0;
|
||||||
|
while (number != 0) {
|
||||||
|
if (number % 2 != 0)
|
||||||
|
bitSet.set(bitIndex);
|
||||||
|
bitIndex++;
|
||||||
|
number /= 2;
|
||||||
|
}
|
||||||
|
if (isNegative) {
|
||||||
|
boolean foundFirstSetBit = false;
|
||||||
|
for (int i = bitSet.nextSetBit(0); i < nBits && i >= 0; i++) {
|
||||||
|
boolean bit = bitSet.get(i);
|
||||||
|
if (!foundFirstSetBit && bit)
|
||||||
|
foundFirstSetBit = true; // maintain all bits until the first 1 is found (inclusive)
|
||||||
|
else if (foundFirstSetBit)
|
||||||
|
bitSet.flip(i); // flip all remaining bits after the first set bit, up to nBits
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return bitSet;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts a BitSet into the dna string representation.
|
||||||
|
*
|
||||||
|
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
|
||||||
|
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
|
||||||
|
* a bitSetFrom(BigNumber) method.
|
||||||
|
*
|
||||||
|
* We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the
|
||||||
|
* base_10 representation of the sequence. This is important for us to know how to bring the number
|
||||||
|
* to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented
|
||||||
|
* as 0's and leading 0's are omitted).
|
||||||
|
*
|
||||||
|
* quasi-canonical because A is represented by a 0, therefore,
|
||||||
|
* instead of : 0, 1, 2, 3, 10, 11, 12, ...
|
||||||
|
* we have : 0, 1, 2, 3, 00, 01, 02, ...
|
||||||
|
*
|
||||||
|
* but we can correctly decode it because we know the final length.
|
||||||
|
*
|
||||||
|
* @param bitSet the bitset representation of the dna sequence
|
||||||
|
* @return the dna sequence represented by the bitset
|
||||||
|
*/
|
||||||
|
public static String dnaFrom(final BitSet bitSet) {
|
||||||
|
long number = longFrom(bitSet); // the base_10 representation of the bit set
|
||||||
|
if (number < 0)
|
||||||
|
throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?");
|
||||||
|
|
||||||
|
int length = contextLengthFor(number); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls)
|
||||||
|
number -= combinationsFor(length - 1); // subtract the number of combinations of the preceding context from the number to get to the quasi-canonical representation
|
||||||
|
|
||||||
|
String dna = "";
|
||||||
|
while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical)
|
||||||
|
byte base = (byte) (number % 4);
|
||||||
|
switch (base) {
|
||||||
|
case 0:
|
||||||
|
dna = "A" + dna;
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
dna = "C" + dna;
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
dna = "G" + dna;
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
dna = "T" + dna;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
number /= 4;
|
||||||
|
}
|
||||||
|
for (int j = dna.length(); j < length; j++)
|
||||||
|
dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above)
|
||||||
|
|
||||||
|
return dna;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a BitSet representation of a given dna string.
|
||||||
|
*
|
||||||
|
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
|
||||||
|
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
|
||||||
|
* a bitSetFrom(BigNumber) method.
|
||||||
|
*
|
||||||
|
* The bit representation of a dna string is simply:
|
||||||
|
* 0 A 4 AA 8 CA
|
||||||
|
* 1 C 5 AC ...
|
||||||
|
* 2 G 6 AG 1343 TTGGT
|
||||||
|
* 3 T 7 AT 1364 TTTTT
|
||||||
|
*
|
||||||
|
* To convert from dna to number, we convert the dna string to base10 and add all combinations that
|
||||||
|
* preceded the string (with smaller lengths).
|
||||||
|
*
|
||||||
|
* @param dna the dna sequence
|
||||||
|
* @return the bitset representing the dna sequence
|
||||||
|
*/
|
||||||
|
public static BitSet bitSetFrom(String dna) {
|
||||||
|
if (dna.length() > MAX_DNA_CONTEXT)
|
||||||
|
throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length()));
|
||||||
|
|
||||||
|
long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
|
||||||
|
long preContext = combinationsFor(dna.length() - 1); // the sum of all combinations that preceded the length of the dna string
|
||||||
|
for (int i = 0; i < dna.length(); i++) {
|
||||||
|
baseTen *= 4;
|
||||||
|
switch (dna.charAt(i)) {
|
||||||
|
case 'A':
|
||||||
|
baseTen += 0;
|
||||||
|
break;
|
||||||
|
case 'C':
|
||||||
|
baseTen += 1;
|
||||||
|
break;
|
||||||
|
case 'G':
|
||||||
|
baseTen += 2;
|
||||||
|
break;
|
||||||
|
case 'T':
|
||||||
|
baseTen += 3;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return bitSetFrom(baseTen + preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the number of bits necessary to represent a given number of elements
|
||||||
|
*
|
||||||
|
* @param numberOfElements the number of elements to represent (must be positive)
|
||||||
|
* @return the number of bits necessary to represent this many elements
|
||||||
|
*/
|
||||||
|
public static int numberOfBitsToRepresent(long numberOfElements) {
|
||||||
|
if (numberOfElements < 0)
|
||||||
|
throw new ReviewedStingException("Number of elements must be positive: " + numberOfElements);
|
||||||
|
|
||||||
|
if (numberOfElements == 1L)
|
||||||
|
return 1; // special case
|
||||||
|
|
||||||
|
int n = 0;
|
||||||
|
numberOfElements--;
|
||||||
|
while (numberOfElements > 0) {
|
||||||
|
numberOfElements = numberOfElements >> 1;
|
||||||
|
n++;
|
||||||
|
}
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the length of the DNA context for a given base 10 number
|
||||||
|
*
|
||||||
|
* It is important to know the length given the base 10 number to calculate the number of combinations
|
||||||
|
* and to disambiguate the "quasi-canonical" state.
|
||||||
|
*
|
||||||
|
* This method also calculates the number of combinations as a by-product, but since it memoizes the
|
||||||
|
* results, a subsequent call to combinationsFor(length) is O(1).
|
||||||
|
*
|
||||||
|
* @param number the base 10 representation of the bitset
|
||||||
|
* @return the length of the DNA context represented by this number
|
||||||
|
*/
|
||||||
|
private static int contextLengthFor(long number) {
|
||||||
|
int length = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet.
|
||||||
|
long combinations = combinationsFor(length); // the next context (we advance it so we know which one was preceding it).
|
||||||
|
while (combinations <= number) { // find the length of the dna string (length)
|
||||||
|
length++;
|
||||||
|
combinations = combinationsFor(length); // calculate the next context
|
||||||
|
}
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The sum of all combinations of contexts of every length from 1 up to the given length.
|
||||||
|
*
|
||||||
|
* Memoized implementation of sum(4^i), where i=[1,length]
|
||||||
|
*
|
||||||
|
* @param length the length of the DNA context
|
||||||
|
* @return the sum of all combinations leading up to this context length.
|
||||||
|
*/
|
||||||
|
private static long combinationsFor(int length) {
|
||||||
|
if (length > MAX_DNA_CONTEXT)
|
||||||
|
throw new ReviewedStingException(String.format("Context cannot be longer than %d bases but requested %d.", MAX_DNA_CONTEXT, length));
|
||||||
|
|
||||||
|
// only calculate the number of combinations if the table hasn't already cached the value
|
||||||
|
if (length > 0 && combinationsPerLength[length] == 0) {
|
||||||
|
long combinations = 0L;
|
||||||
|
for (int i = 1; i <= length; i++)
|
||||||
|
combinations += (1L << 2 * i); // add all combinations with 4^i ( 4^i is the same as 2^(2*i) )
|
||||||
|
combinationsPerLength[length] = combinations;
|
||||||
|
}
|
||||||
|
return combinationsPerLength[length];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static byte[] sizeOf(Object obj) throws java.io.IOException
|
||||||
|
{
|
||||||
|
ByteArrayOutputStream byteObject = new ByteArrayOutputStream();
|
||||||
|
ObjectOutputStream objectOutputStream = new ObjectOutputStream(byteObject);
|
||||||
|
objectOutputStream.writeObject(obj);
|
||||||
|
objectOutputStream.flush();
|
||||||
|
objectOutputStream.close();
|
||||||
|
byteObject.close();
|
||||||
|
|
||||||
|
return byteObject.toByteArray();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -24,6 +24,7 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.utils;
|
package org.broadinstitute.sting.utils;
|
||||||
|
|
||||||
|
import com.google.java.contract.Ensures;
|
||||||
import com.google.java.contract.Requires;
|
import com.google.java.contract.Requires;
|
||||||
import net.sf.samtools.Cigar;
|
import net.sf.samtools.Cigar;
|
||||||
import net.sf.samtools.CigarElement;
|
import net.sf.samtools.CigarElement;
|
||||||
|
|
@ -32,16 +33,14 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.*;
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.LinkedHashMap;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
public class Haplotype {
|
public class Haplotype {
|
||||||
protected final byte[] bases;
|
protected final byte[] bases;
|
||||||
protected final double[] quals;
|
protected final double[] quals;
|
||||||
private GenomeLoc genomeLocation = null;
|
private GenomeLoc genomeLocation = null;
|
||||||
private boolean isReference = false;
|
private HashMap<String, double[]> readLikelihoodsPerSample = null;
|
||||||
|
private boolean isRef = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual
|
* Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual
|
||||||
|
|
@ -69,16 +68,35 @@ public class Haplotype {
|
||||||
this.genomeLocation = loc;
|
this.genomeLocation = loc;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Haplotype(byte[] bases, GenomeLoc loc, boolean isRef) {
|
|
||||||
this(bases, loc);
|
|
||||||
this.isReference = isRef;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean equals( Object h ) {
|
public boolean equals( Object h ) {
|
||||||
return h instanceof Haplotype && Arrays.equals(bases, ((Haplotype) h).bases);
|
return h instanceof Haplotype && Arrays.equals(bases, ((Haplotype) h).bases);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void addReadLikelihoods( final String sample, final double[] readLikelihoods ) {
|
||||||
|
if( readLikelihoodsPerSample == null ) {
|
||||||
|
readLikelihoodsPerSample = new HashMap<String, double[]>();
|
||||||
|
}
|
||||||
|
readLikelihoodsPerSample.put(sample, readLikelihoods);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Ensures({"result != null"})
|
||||||
|
public double[] getReadLikelihoods( final String sample ) {
|
||||||
|
return readLikelihoodsPerSample.get(sample);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Set<String> getSampleKeySet() {
|
||||||
|
return readLikelihoodsPerSample.keySet();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isReference() {
|
||||||
|
return isRef;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setIsReference( boolean isRef ) {
|
||||||
|
this.isRef = isRef;
|
||||||
|
}
|
||||||
|
|
||||||
public double getQualitySum() {
|
public double getQualitySum() {
|
||||||
double s = 0;
|
double s = 0;
|
||||||
for (int k=0; k < bases.length; k++) {
|
for (int k=0; k < bases.length; k++) {
|
||||||
|
|
@ -87,6 +105,7 @@ public class Haplotype {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
String returnString = "";
|
String returnString = "";
|
||||||
for(int iii = 0; iii < bases.length; iii++) {
|
for(int iii = 0; iii < bases.length; iii++) {
|
||||||
|
|
@ -110,10 +129,6 @@ public class Haplotype {
|
||||||
return genomeLocation.getStop();
|
return genomeLocation.getStop();
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isReference() {
|
|
||||||
return isReference;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Requires({"refInsertLocation >= 0", "hapStartInRefCoords >= 0"})
|
@Requires({"refInsertLocation >= 0", "hapStartInRefCoords >= 0"})
|
||||||
public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation, final int hapStartInRefCoords, final Cigar haplotypeCigar ) {
|
public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation, final int hapStartInRefCoords, final Cigar haplotypeCigar ) {
|
||||||
|
|
||||||
|
|
@ -208,13 +223,14 @@ public class Haplotype {
|
||||||
String haplotypeString = new String(basesBeforeVariant) + new String(alleleBases) + new String(basesAfterVariant);
|
String haplotypeString = new String(basesBeforeVariant) + new String(alleleBases) + new String(basesAfterVariant);
|
||||||
haplotypeString = haplotypeString.substring(0,haplotypeSize);
|
haplotypeString = haplotypeString.substring(0,haplotypeSize);
|
||||||
|
|
||||||
haplotypeMap.put(a,new Haplotype(haplotypeString.getBytes(), locus, a.isReference()));
|
haplotypeMap.put(a,new Haplotype(haplotypeString.getBytes(), locus));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return haplotypeMap;
|
return haplotypeMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// BUGBUG: copied from ReadClipper and slightly modified since we don't have the data in a GATKSAMRecord
|
||||||
private static Integer getHaplotypeCoordinateForReferenceCoordinate( final int haplotypeStart, final Cigar haplotypeCigar, final int refCoord ) {
|
private static Integer getHaplotypeCoordinateForReferenceCoordinate( final int haplotypeStart, final Cigar haplotypeCigar, final int refCoord ) {
|
||||||
int readBases = 0;
|
int readBases = 0;
|
||||||
int refBases = 0;
|
int refBases = 0;
|
||||||
|
|
|
||||||
|
|
@ -29,7 +29,6 @@ import com.google.java.contract.Ensures;
|
||||||
import com.google.java.contract.Requires;
|
import com.google.java.contract.Requires;
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
|
||||||
import java.math.BigDecimal;
|
import java.math.BigDecimal;
|
||||||
|
|
@ -1527,124 +1526,4 @@ public class MathUtils {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates an integer out of a bitset
|
|
||||||
*
|
|
||||||
* @param bitSet the bitset
|
|
||||||
* @return an integer with the bitset representation
|
|
||||||
*/
|
|
||||||
public static long intFrom(final BitSet bitSet) {
|
|
||||||
long number = 0;
|
|
||||||
for (int bitIndex = bitSet.nextSetBit(0); bitIndex >= 0; bitIndex = bitSet.nextSetBit(bitIndex+1))
|
|
||||||
number |= 1L << bitIndex;
|
|
||||||
|
|
||||||
return number;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates a BitSet representation of a given integer
|
|
||||||
*
|
|
||||||
* @param number the number to turn into a bitset
|
|
||||||
* @return a bitset representation of the integer
|
|
||||||
*/
|
|
||||||
public static BitSet bitSetFrom(long number) {
|
|
||||||
BitSet bitSet = new BitSet();
|
|
||||||
int bitIndex = 0;
|
|
||||||
while (number > 0) {
|
|
||||||
if (number%2 > 0)
|
|
||||||
bitSet.set(bitIndex);
|
|
||||||
bitIndex++;
|
|
||||||
number /= 2;
|
|
||||||
}
|
|
||||||
return bitSet;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Converts a BitSet into the dna string representation.
|
|
||||||
*
|
|
||||||
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
|
|
||||||
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
|
|
||||||
* a bitSetFrom(BigNumber) method.
|
|
||||||
*
|
|
||||||
* We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the
|
|
||||||
* base_10 representation of the sequence. This is important for us to know how to bring the number
|
|
||||||
* to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented
|
|
||||||
* as 0's and leading 0's are omitted).
|
|
||||||
*
|
|
||||||
* quasi-canonical because A is represented by a 0, therefore,
|
|
||||||
* instead of : 0, 1, 2, 3, 10, 11, 12, ...
|
|
||||||
* we have : 0, 1, 2, 3, 00, 01, 02, ...
|
|
||||||
*
|
|
||||||
* but we can correctly decode it because we know the final length.
|
|
||||||
*
|
|
||||||
* @param bitSet the bitset representation of the dna sequence
|
|
||||||
* @return the dna sequence represented by the bitset
|
|
||||||
*/
|
|
||||||
public static String dnaFrom(final BitSet bitSet) {
|
|
||||||
long number = intFrom(bitSet); // the base_10 representation of the bit set
|
|
||||||
long preContext = 0; // the number of combinations skipped to get to the quasi-canonical representation (we keep it to subtract later)
|
|
||||||
long nextContext = 4; // the next context (we advance it so we know which one was preceding it).
|
|
||||||
int i = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet.
|
|
||||||
while (nextContext <= number) { // find the length of the dna string (i)
|
|
||||||
preContext = nextContext; // keep track of the number of combinations in the preceding context
|
|
||||||
nextContext += Math.pow(4, ++i);// calculate the next context
|
|
||||||
}
|
|
||||||
number -= preContext; // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation
|
|
||||||
|
|
||||||
String dna = "";
|
|
||||||
while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical)
|
|
||||||
byte base = (byte) (number % 4);
|
|
||||||
switch (base) {
|
|
||||||
case 0 : dna = "A" + dna; break;
|
|
||||||
case 1 : dna = "C" + dna; break;
|
|
||||||
case 2 : dna = "G" + dna; break;
|
|
||||||
case 3 : dna = "T" + dna; break;
|
|
||||||
}
|
|
||||||
number /= 4;
|
|
||||||
}
|
|
||||||
for (int j = dna.length(); j < i; j++)
|
|
||||||
dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above)
|
|
||||||
|
|
||||||
return dna;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates a BitSet representation of a given dna string.
|
|
||||||
*
|
|
||||||
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
|
|
||||||
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
|
|
||||||
* a bitSetFrom(BigNumber) method.
|
|
||||||
*
|
|
||||||
* The bit representation of a dna string is the simple:
|
|
||||||
* 0 A 4 AA 8 CA
|
|
||||||
* 1 C 5 AC ...
|
|
||||||
* 2 G 6 AG 1343 TTGGT
|
|
||||||
* 3 T 7 AT 1364 TTTTT
|
|
||||||
*
|
|
||||||
* To convert from dna to number, we convert the dna string to base10 and add all combinations that
|
|
||||||
* preceded the string (with smaller lengths).
|
|
||||||
*
|
|
||||||
* @param dna the dna sequence
|
|
||||||
* @return the bitset representing the dna sequence
|
|
||||||
*/
|
|
||||||
public static BitSet bitSetFrom(String dna) {
|
|
||||||
if (dna.length() > 31)
|
|
||||||
throw new ReviewedStingException(String.format("DNA Length cannot be bigger than 31. dna: %s (%d)", dna, dna.length()));
|
|
||||||
|
|
||||||
long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
|
|
||||||
long preContext = 0; // the sum of all combinations that preceded the length of the dna string
|
|
||||||
for (int i=0; i<dna.length(); i++) {
|
|
||||||
baseTen *= 4;
|
|
||||||
switch(dna.charAt(i)) {
|
|
||||||
case 'A': baseTen += 0; break;
|
|
||||||
case 'C': baseTen += 1; break;
|
|
||||||
case 'G': baseTen += 2; break;
|
|
||||||
case 'T': baseTen += 3; break;
|
|
||||||
}
|
|
||||||
if (i>0)
|
|
||||||
preContext += Math.pow(4, i); // each length will have 4^i combinations (e.g 1 = 4, 2 = 16, 3 = 64, ...)
|
|
||||||
}
|
|
||||||
|
|
||||||
return bitSetFrom(baseTen+preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -22,6 +22,16 @@ public class QualityUtils {
|
||||||
for (int i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw(i);
|
for (int i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw(i);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static double qualToErrorProbLog10Cache[] = new double[256];
|
||||||
|
static {
|
||||||
|
for (int i = 0; i < 256; i++) qualToErrorProbLog10Cache[i] = qualToErrorProbLog10Raw(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static double qualToProbLog10Cache[] = new double[256];
|
||||||
|
static {
|
||||||
|
for (int i = 0; i < 256; i++) qualToProbLog10Cache[i] = qualToProbLog10Raw(i);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Private constructor. No instantiating this class!
|
* Private constructor. No instantiating this class!
|
||||||
*/
|
*/
|
||||||
|
|
@ -31,7 +41,7 @@ public class QualityUtils {
|
||||||
* Convert a quality score to a probability. This is the Phred-style
|
* Convert a quality score to a probability. This is the Phred-style
|
||||||
* conversion, *not* the Illumina-style conversion (though asymptotically, they're the same).
|
* conversion, *not* the Illumina-style conversion (though asymptotically, they're the same).
|
||||||
*
|
*
|
||||||
* @param qual a quality score (0-40)
|
* @param qual a quality score (0-255)
|
||||||
* @return a probability (0.0-1.0)
|
* @return a probability (0.0-1.0)
|
||||||
*/
|
*/
|
||||||
static public double qualToProb(byte qual) {
|
static public double qualToProb(byte qual) {
|
||||||
|
|
@ -42,6 +52,14 @@ public class QualityUtils {
|
||||||
return 1.0 - Math.pow(10.0, qual/(-10.0));
|
return 1.0 - Math.pow(10.0, qual/(-10.0));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static private double qualToProbLog10Raw(int qual) {
|
||||||
|
return Math.log10(1.0 - qualToErrorProbRaw(qual));
|
||||||
|
}
|
||||||
|
|
||||||
|
static public double qualToProbLog10(byte qual) {
|
||||||
|
return qualToProbLog10Cache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc.
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert a quality score to a probability of error. This is the Phred-style
|
* Convert a quality score to a probability of error. This is the Phred-style
|
||||||
* conversion, *not* the Illumina-style conversion (though asymptotically, they're the same).
|
* conversion, *not* the Illumina-style conversion (though asymptotically, they're the same).
|
||||||
|
|
@ -57,14 +75,14 @@ public class QualityUtils {
|
||||||
return qualToErrorProbCache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc.
|
return qualToErrorProbCache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc.
|
||||||
}
|
}
|
||||||
|
|
||||||
static public double[] qualArrayToLog10ErrorProb(byte[] quals) {
|
static private double qualToErrorProbLog10Raw(int qual) {
|
||||||
double[] returnArray = new double[quals.length];
|
return ((double) qual)/-10.0;
|
||||||
for( int iii = 0; iii < quals.length; iii++ ) {
|
|
||||||
returnArray[iii] = ((double) quals[iii])/-10.0;
|
|
||||||
}
|
|
||||||
return returnArray;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static public double qualToErrorProbLog10(byte qual) {
|
||||||
|
return qualToErrorProbLog10Cache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc.
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert a probability to a quality score. Note, this is capped at Q40.
|
* Convert a probability to a quality score. Note, this is capped at Q40.
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -25,9 +25,14 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.utils;
|
package org.broadinstitute.sting.utils;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import net.sf.samtools.SAMProgramRecord;
|
||||||
import net.sf.samtools.util.StringUtil;
|
import net.sf.samtools.util.StringUtil;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||||
import org.broadinstitute.sting.utils.collections.Pair;
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
|
||||||
|
|
||||||
import java.net.InetAddress;
|
import java.net.InetAddress;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -668,4 +673,34 @@ public class Utils {
|
||||||
array[i] = value;
|
array[i] = value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, boolean preSorted, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) {
|
||||||
|
final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME);
|
||||||
|
|
||||||
|
SAMFileHeader header = toolkit.getSAMFileHeader();
|
||||||
|
List<SAMProgramRecord> oldRecords = header.getProgramRecords();
|
||||||
|
List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size()+1);
|
||||||
|
for ( SAMProgramRecord record : oldRecords )
|
||||||
|
if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) || KEEP_ALL_PG_RECORDS )
|
||||||
|
newRecords.add(record);
|
||||||
|
|
||||||
|
newRecords.add(programRecord);
|
||||||
|
header.setProgramRecords(newRecords);
|
||||||
|
|
||||||
|
writer.writeHeader(header);
|
||||||
|
writer.setPresorted(preSorted);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static SAMProgramRecord createProgramRecord(GenomeAnalysisEngine toolkit, Object walker, String PROGRAM_RECORD_NAME) {
|
||||||
|
final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME);
|
||||||
|
final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText");
|
||||||
|
try {
|
||||||
|
final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version");
|
||||||
|
programRecord.setProgramVersion(version);
|
||||||
|
} catch (MissingResourceException e) {
|
||||||
|
// couldn't care less if the resource is missing...
|
||||||
|
}
|
||||||
|
programRecord.setCommandLine(toolkit.createApproximateCommandLineArgumentString(toolkit, walker));
|
||||||
|
return programRecord;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -34,6 +34,11 @@ public class ActiveRegion implements HasGenomeLocation {
|
||||||
fullExtentReferenceLoc = extendedLoc;
|
fullExtentReferenceLoc = extendedLoc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "ActiveRegion " + activeRegionLoc.toString();
|
||||||
|
}
|
||||||
|
|
||||||
// add each read to the bin and extend the reference genome activeRegionLoc if needed
|
// add each read to the bin and extend the reference genome activeRegionLoc if needed
|
||||||
public void add( final GATKSAMRecord read ) {
|
public void add( final GATKSAMRecord read ) {
|
||||||
fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) );
|
fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) );
|
||||||
|
|
@ -78,4 +83,13 @@ public class ActiveRegion implements HasGenomeLocation {
|
||||||
public void clearReads() { reads.clear(); }
|
public void clearReads() { reads.clear(); }
|
||||||
public void remove( final GATKSAMRecord read ) { reads.remove( read ); }
|
public void remove( final GATKSAMRecord read ) { reads.remove( read ); }
|
||||||
public void removeAll( final ArrayList<GATKSAMRecord> readsToRemove ) { reads.removeAll( readsToRemove ); }
|
public void removeAll( final ArrayList<GATKSAMRecord> readsToRemove ) { reads.removeAll( readsToRemove ); }
|
||||||
|
|
||||||
|
public boolean equalExceptReads(final ActiveRegion other) {
|
||||||
|
if ( ! activeRegionLoc.equals(other.activeRegionLoc)) return false;
|
||||||
|
if ( isActive != other.isActive ) return false;
|
||||||
|
if ( genomeLocParser != other.genomeLocParser ) return false;
|
||||||
|
if ( extension != other.extension ) return false;
|
||||||
|
if ( ! extendedLoc.equals(other.extendedLoc) ) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -0,0 +1,148 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.utils.activeregion;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class holding information about per-base activity scores for the
|
||||||
|
* active region traversal
|
||||||
|
*
|
||||||
|
* @author Mark DePristo
|
||||||
|
* @since Date created
|
||||||
|
*/
|
||||||
|
public class ActivityProfile {
|
||||||
|
final GenomeLocParser parser;
|
||||||
|
final boolean presetRegions;
|
||||||
|
GenomeLoc regionStartLoc = null;
|
||||||
|
final List<Double> isActiveList;
|
||||||
|
|
||||||
|
private GenomeLoc lastLoc = null;
|
||||||
|
|
||||||
|
// todo -- add upfront the start and stop of the intervals
|
||||||
|
// todo -- check that no regions are unexpectedly missing
|
||||||
|
// todo -- add unit tests
|
||||||
|
// TODO -- own preset regions
|
||||||
|
public ActivityProfile(final GenomeLocParser parser, final boolean presetRegions) {
|
||||||
|
this(parser, presetRegions, new ArrayList<Double>(), null);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected ActivityProfile(final GenomeLocParser parser, final boolean presetRegions, final List<Double> isActiveList, final GenomeLoc regionStartLoc) {
|
||||||
|
this.parser = parser;
|
||||||
|
this.presetRegions = presetRegions;
|
||||||
|
this.isActiveList = isActiveList;
|
||||||
|
this.regionStartLoc = regionStartLoc;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void add(final GenomeLoc loc, final double score) {
|
||||||
|
if ( loc.size() != 1 )
|
||||||
|
throw new ReviewedStingException("Bad add call to ActivityProfile: loc " + loc + " size != 1" );
|
||||||
|
if ( lastLoc != null && loc.getStart() != lastLoc.getStop() + 1 )
|
||||||
|
throw new ReviewedStingException("Bad add call to ActivityProfile: lastLoc added " + lastLoc + " and next is " + loc);
|
||||||
|
isActiveList.add(score);
|
||||||
|
if( regionStartLoc == null ) {
|
||||||
|
regionStartLoc = loc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public int size() {
|
||||||
|
return isActiveList.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Band pass this ActivityProfile, producing a new profile that's band pass filtered
|
||||||
|
* @return a new ActivityProfile that's the band-pass filtered version of this profile
|
||||||
|
*/
|
||||||
|
public ActivityProfile bandPassFilter() {
|
||||||
|
final Double[] activeProbArray = isActiveList.toArray(new Double[isActiveList.size()]);
|
||||||
|
final Double[] filteredProbArray = new Double[activeProbArray.length];
|
||||||
|
final int FILTER_SIZE = ( presetRegions ? 0 : 50 ); // TODO: needs to be set-able by the walker author
|
||||||
|
for( int iii = 0; iii < activeProbArray.length; iii++ ) {
|
||||||
|
double maxVal = 0;
|
||||||
|
for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(isActiveList.size(), iii+FILTER_SIZE+1); jjj++ ) {
|
||||||
|
if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; }
|
||||||
|
}
|
||||||
|
filteredProbArray[iii] = maxVal;
|
||||||
|
}
|
||||||
|
|
||||||
|
return new ActivityProfile(parser, presetRegions, Arrays.asList(filteredProbArray), regionStartLoc);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Partition this profile into active regions
|
||||||
|
* @param activeRegionExtension
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public List<ActiveRegion> createActiveRegions( final int activeRegionExtension ) {
|
||||||
|
final int MAX_ACTIVE_REGION = ( presetRegions ? 16001 : 425 ); // TODO: needs to be set-able by the walker author
|
||||||
|
final double ACTIVE_PROB_THRESHOLD = 0.2; // TODO: needs to be set-able by the walker author
|
||||||
|
|
||||||
|
if( isActiveList.size() == 0 ) {
|
||||||
|
// no elements in the active list, just return an empty one
|
||||||
|
return Collections.emptyList();
|
||||||
|
} else if( isActiveList.size() == 1 ) {
|
||||||
|
// there's a single element, it's either active or inactive
|
||||||
|
boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD;
|
||||||
|
final ActiveRegion region = createActiveRegion(isActive, 0, 0, activeRegionExtension );
|
||||||
|
return Collections.singletonList(region);
|
||||||
|
} else {
|
||||||
|
// there are 2+ elements, divide these up into regions
|
||||||
|
final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
|
||||||
|
boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD;
|
||||||
|
int curStart = 0;
|
||||||
|
for(int iii = 1; iii < isActiveList.size(); iii++ ) {
|
||||||
|
final boolean thisStatus = isActiveList.get(iii) > ACTIVE_PROB_THRESHOLD;
|
||||||
|
if( isActive != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) {
|
||||||
|
returnList.add( createActiveRegion(isActive, curStart, iii-1, activeRegionExtension) );
|
||||||
|
isActive = thisStatus;
|
||||||
|
curStart = iii;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
returnList.add( createActiveRegion(isActive, curStart, isActiveList.size()-1, activeRegionExtension) ); // close out the current active region
|
||||||
|
|
||||||
|
return returnList;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper routine to create an active region based on our current start and end offsets
|
||||||
|
* @param isActive should the region be active?
|
||||||
|
* @param curStart offset (0-based) from the start of this region
|
||||||
|
* @param curEnd offset (0-based) from the start of this region
|
||||||
|
* @param activeRegionExtension
|
||||||
|
* @return a fully initialized ActiveRegion with the above properties
|
||||||
|
*/
|
||||||
|
private final ActiveRegion createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension) {
|
||||||
|
final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd);
|
||||||
|
return new ActiveRegion( loc, isActive, parser, activeRegionExtension );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -4,7 +4,7 @@ import com.google.java.contract.Requires;
|
||||||
import net.sf.samtools.Cigar;
|
import net.sf.samtools.Cigar;
|
||||||
import net.sf.samtools.CigarElement;
|
import net.sf.samtools.CigarElement;
|
||||||
import net.sf.samtools.CigarOperator;
|
import net.sf.samtools.CigarOperator;
|
||||||
import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager;
|
import org.broadinstitute.sting.gatk.walkers.bqsr.EventType;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
|
@ -320,8 +320,8 @@ public class ClippingOp {
|
||||||
byte[] newBaseDeletionQuals = new byte[newLength];
|
byte[] newBaseDeletionQuals = new byte[newLength];
|
||||||
System.arraycopy(read.getBaseInsertionQualities(), copyStart, newBaseInsertionQuals, 0, newLength);
|
System.arraycopy(read.getBaseInsertionQualities(), copyStart, newBaseInsertionQuals, 0, newLength);
|
||||||
System.arraycopy(read.getBaseDeletionQualities(), copyStart, newBaseDeletionQuals, 0, newLength);
|
System.arraycopy(read.getBaseDeletionQualities(), copyStart, newBaseDeletionQuals, 0, newLength);
|
||||||
hardClippedRead.setBaseQualities(newBaseInsertionQuals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION);
|
hardClippedRead.setBaseQualities(newBaseInsertionQuals, EventType.BASE_INSERTION);
|
||||||
hardClippedRead.setBaseQualities(newBaseDeletionQuals, RecalDataManager.BaseRecalibrationType.BASE_DELETION);
|
hardClippedRead.setBaseQualities(newBaseDeletionQuals, EventType.BASE_DELETION);
|
||||||
}
|
}
|
||||||
|
|
||||||
return hardClippedRead;
|
return hardClippedRead;
|
||||||
|
|
|
||||||
|
|
@ -231,15 +231,16 @@ public class ReadClipper {
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Hard clips any contiguous tail (left, right or both) with base quality lower than lowQual.
|
* Clips any contiguous tail (left, right or both) with base quality lower than lowQual using the desired algorithm.
|
||||||
*
|
*
|
||||||
* This function will look for low quality tails and hard clip them away. A low quality tail
|
* This function will look for low quality tails and hard clip them away. A low quality tail
|
||||||
* ends when a base has base quality greater than lowQual.
|
* ends when a base has base quality greater than lowQual.
|
||||||
*
|
*
|
||||||
|
* @param algorithm the algorithm to use (HardClip, SoftClip, Write N's,...)
|
||||||
* @param lowQual every base quality lower than or equal to this in the tail of the read will be hard clipped
|
* @param lowQual every base quality lower than or equal to this in the tail of the read will be hard clipped
|
||||||
* @return a new read without low quality tails
|
* @return a new read without low quality tails
|
||||||
*/
|
*/
|
||||||
private GATKSAMRecord hardClipLowQualEnds(byte lowQual) {
|
private GATKSAMRecord clipLowQualEnds(ClippingRepresentation algorithm, byte lowQual) {
|
||||||
if (read.isEmpty())
|
if (read.isEmpty())
|
||||||
return read;
|
return read;
|
||||||
|
|
||||||
|
|
@ -254,7 +255,6 @@ public class ReadClipper {
|
||||||
// if the entire read should be clipped, then return an empty read.
|
// if the entire read should be clipped, then return an empty read.
|
||||||
if (leftClipIndex > rightClipIndex)
|
if (leftClipIndex > rightClipIndex)
|
||||||
return GATKSAMRecord.emptyRead(read);
|
return GATKSAMRecord.emptyRead(read);
|
||||||
// return (new GATKSAMRecord(read.getHeader()));
|
|
||||||
|
|
||||||
if (rightClipIndex < read.getReadLength() - 1) {
|
if (rightClipIndex < read.getReadLength() - 1) {
|
||||||
this.addOp(new ClippingOp(rightClipIndex + 1, read.getReadLength() - 1));
|
this.addOp(new ClippingOp(rightClipIndex + 1, read.getReadLength() - 1));
|
||||||
|
|
@ -262,11 +262,18 @@ public class ReadClipper {
|
||||||
if (leftClipIndex > 0 ) {
|
if (leftClipIndex > 0 ) {
|
||||||
this.addOp(new ClippingOp(0, leftClipIndex - 1));
|
this.addOp(new ClippingOp(0, leftClipIndex - 1));
|
||||||
}
|
}
|
||||||
return this.clipRead(ClippingRepresentation.HARDCLIP_BASES);
|
return this.clipRead(algorithm);
|
||||||
|
}
|
||||||
|
|
||||||
|
private GATKSAMRecord hardClipLowQualEnds(byte lowQual) {
|
||||||
|
return this.clipLowQualEnds(ClippingRepresentation.HARDCLIP_BASES, lowQual);
|
||||||
}
|
}
|
||||||
public static GATKSAMRecord hardClipLowQualEnds(GATKSAMRecord read, byte lowQual) {
|
public static GATKSAMRecord hardClipLowQualEnds(GATKSAMRecord read, byte lowQual) {
|
||||||
return (new ReadClipper(read)).hardClipLowQualEnds(lowQual);
|
return (new ReadClipper(read)).hardClipLowQualEnds(lowQual);
|
||||||
}
|
}
|
||||||
|
public static GATKSAMRecord clipLowQualEnds(GATKSAMRecord read, byte lowQual, ClippingRepresentation algorithm) {
|
||||||
|
return (new ReadClipper(read)).clipLowQualEnds(algorithm, lowQual);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -154,18 +154,24 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
||||||
throw new UserException.MalformedVCFHeader("The FORMAT field was provided but there is no genotype/sample data");
|
throw new UserException.MalformedVCFHeader("The FORMAT field was provided but there is no genotype/sample data");
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
if ( str.startsWith("##INFO=") ) {
|
if ( str.startsWith(VCFConstants.INFO_HEADER_START) ) {
|
||||||
VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version);
|
final VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version);
|
||||||
metaData.add(info);
|
metaData.add(info);
|
||||||
infoFields.put(info.getName(), info.getType());
|
infoFields.put(info.getID(), info.getType());
|
||||||
} else if ( str.startsWith("##FILTER=") ) {
|
} else if ( str.startsWith(VCFConstants.FILTER_HEADER_START) ) {
|
||||||
VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9),version);
|
final VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9), version);
|
||||||
metaData.add(filter);
|
metaData.add(filter);
|
||||||
filterFields.add(filter.getName());
|
filterFields.add(filter.getID());
|
||||||
} else if ( str.startsWith("##FORMAT=") ) {
|
} else if ( str.startsWith(VCFConstants.FORMAT_HEADER_START) ) {
|
||||||
VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9),version);
|
final VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9), version);
|
||||||
metaData.add(format);
|
metaData.add(format);
|
||||||
formatFields.put(format.getName(), format.getType());
|
formatFields.put(format.getID(), format.getType());
|
||||||
|
} else if ( str.startsWith(VCFConstants.CONTIG_HEADER_START) ) {
|
||||||
|
final VCFSimpleHeaderLine contig = new VCFSimpleHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), null);
|
||||||
|
metaData.add(contig);
|
||||||
|
} else if ( str.startsWith(VCFConstants.ALT_HEADER_START) ) {
|
||||||
|
final VCFSimpleHeaderLine alt = new VCFSimpleHeaderLine(str.substring(6), version, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description"));
|
||||||
|
metaData.add(alt);
|
||||||
} else {
|
} else {
|
||||||
int equals = str.indexOf("=");
|
int equals = str.indexOf("=");
|
||||||
if ( equals != -1 )
|
if ( equals != -1 )
|
||||||
|
|
|
||||||
|
|
@ -1,28 +0,0 @@
|
||||||
package org.broadinstitute.sting.utils.codecs.vcf;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author ebanks
|
|
||||||
* A class representing a key=value entry for ALT fields in the VCF header
|
|
||||||
*/
|
|
||||||
public class VCFAltHeaderLine extends VCFSimpleHeaderLine {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* create a VCF filter header line
|
|
||||||
*
|
|
||||||
* @param name the name for this header line
|
|
||||||
* @param description the description for this header line
|
|
||||||
*/
|
|
||||||
public VCFAltHeaderLine(String name, String description) {
|
|
||||||
super(name, description, SupportedHeaderLineType.ALT);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* create a VCF info header line
|
|
||||||
*
|
|
||||||
* @param line the header line
|
|
||||||
* @param version the vcf header version
|
|
||||||
*/
|
|
||||||
protected VCFAltHeaderLine(String line, VCFHeaderVersion version) {
|
|
||||||
super(line, version, SupportedHeaderLineType.ALT);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -34,7 +34,7 @@ import java.util.Map;
|
||||||
/**
|
/**
|
||||||
* a base class for compound header lines, which include info lines and format lines (so far)
|
* a base class for compound header lines, which include info lines and format lines (so far)
|
||||||
*/
|
*/
|
||||||
public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine {
|
public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine {
|
||||||
public enum SupportedHeaderLineType {
|
public enum SupportedHeaderLineType {
|
||||||
INFO(true), FORMAT(false);
|
INFO(true), FORMAT(false);
|
||||||
|
|
||||||
|
|
@ -52,7 +52,7 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
|
||||||
private VCFHeaderLineType type;
|
private VCFHeaderLineType type;
|
||||||
|
|
||||||
// access methods
|
// access methods
|
||||||
public String getName() { return name; }
|
public String getID() { return name; }
|
||||||
public String getDescription() { return description; }
|
public String getDescription() { return description; }
|
||||||
public VCFHeaderLineType getType() { return type; }
|
public VCFHeaderLineType getType() { return type; }
|
||||||
public VCFHeaderLineCount getCountType() { return countType; }
|
public VCFHeaderLineCount getCountType() { return countType; }
|
||||||
|
|
|
||||||
|
|
@ -80,6 +80,13 @@ public final class VCFConstants {
|
||||||
public static final String PHASED_SWITCH_PROB_v3 = "\\";
|
public static final String PHASED_SWITCH_PROB_v3 = "\\";
|
||||||
public static final String PHASING_TOKENS = "/|\\";
|
public static final String PHASING_TOKENS = "/|\\";
|
||||||
|
|
||||||
|
// header lines
|
||||||
|
public static final String FILTER_HEADER_START = "##FILTER";
|
||||||
|
public static final String FORMAT_HEADER_START = "##FORMAT";
|
||||||
|
public static final String INFO_HEADER_START = "##INFO";
|
||||||
|
public static final String ALT_HEADER_START = "##ALT";
|
||||||
|
public static final String CONTIG_HEADER_START = "##contig";
|
||||||
|
|
||||||
// old indel alleles
|
// old indel alleles
|
||||||
public static final char DELETION_ALLELE_v3 = 'D';
|
public static final char DELETION_ALLELE_v3 = 'D';
|
||||||
public static final char INSERTION_ALLELE_v3 = 'I';
|
public static final char INSERTION_ALLELE_v3 = 'I';
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,7 @@
|
||||||
package org.broadinstitute.sting.utils.codecs.vcf;
|
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author ebanks
|
* @author ebanks
|
||||||
* A class representing a key=value entry for FILTER fields in the VCF header
|
* A class representing a key=value entry for FILTER fields in the VCF header
|
||||||
|
|
@ -13,7 +15,7 @@ public class VCFFilterHeaderLine extends VCFSimpleHeaderLine {
|
||||||
* @param description the description for this header line
|
* @param description the description for this header line
|
||||||
*/
|
*/
|
||||||
public VCFFilterHeaderLine(String name, String description) {
|
public VCFFilterHeaderLine(String name, String description) {
|
||||||
super(name, description, SupportedHeaderLineType.FILTER);
|
super("FILTER", name, description);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -23,6 +25,6 @@ public class VCFFilterHeaderLine extends VCFSimpleHeaderLine {
|
||||||
* @param version the vcf header version
|
* @param version the vcf header version
|
||||||
*/
|
*/
|
||||||
protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) {
|
protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) {
|
||||||
super(line, version, SupportedHeaderLineType.FILTER);
|
super(line, version, "FILTER", Arrays.asList("ID", "Description"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils.codecs.vcf;
|
||||||
|
|
||||||
|
|
||||||
import org.broad.tribble.util.ParsingUtils;
|
import org.broad.tribble.util.ParsingUtils;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
|
@ -126,11 +125,11 @@ public class VCFHeader {
|
||||||
for ( VCFHeaderLine line : mMetaData ) {
|
for ( VCFHeaderLine line : mMetaData ) {
|
||||||
if ( line instanceof VCFInfoHeaderLine ) {
|
if ( line instanceof VCFInfoHeaderLine ) {
|
||||||
VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line;
|
VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line;
|
||||||
mInfoMetaData.put(infoLine.getName(), infoLine);
|
mInfoMetaData.put(infoLine.getID(), infoLine);
|
||||||
}
|
}
|
||||||
else if ( line instanceof VCFFormatHeaderLine ) {
|
else if ( line instanceof VCFFormatHeaderLine ) {
|
||||||
VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
|
VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
|
||||||
mFormatMetaData.put(formatLine.getName(), formatLine);
|
mFormatMetaData.put(formatLine.getID(), formatLine);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
mOtherMetaData.put(line.getKey(), line);
|
mOtherMetaData.put(line.getKey(), line);
|
||||||
|
|
|
||||||
|
|
@ -73,10 +73,14 @@ class VCF4Parser implements VCFLineParser {
|
||||||
|
|
||||||
// validate the tags against the expected list
|
// validate the tags against the expected list
|
||||||
index = 0;
|
index = 0;
|
||||||
if (ret.size() > expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + ret.size() + " in string " + expectedTagOrder.size());
|
if ( expectedTagOrder != null ) {
|
||||||
for (String str : ret.keySet()) {
|
if ( ret.size() > expectedTagOrder.size() )
|
||||||
if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine);
|
throw new IllegalArgumentException("Unexpected tag count " + ret.size() + " in string " + expectedTagOrder.size());
|
||||||
index++;
|
for ( String str : ret.keySet() ) {
|
||||||
|
if ( !expectedTagOrder.get(index).equals(str) )
|
||||||
|
throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine);
|
||||||
|
index++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -24,7 +24,7 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.utils.codecs.vcf;
|
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||||
|
|
||||||
/** an interface for named header lines **/
|
/** an interface for ID-based header lines **/
|
||||||
public interface VCFNamedHeaderLine {
|
public interface VCFIDHeaderLine {
|
||||||
String getName();
|
String getID();
|
||||||
}
|
}
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
package org.broadinstitute.sting.utils.codecs.vcf;
|
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -9,34 +9,35 @@ import java.util.Map;
|
||||||
* @author ebanks
|
* @author ebanks
|
||||||
* A class representing a key=value entry for simple VCF header types
|
* A class representing a key=value entry for simple VCF header types
|
||||||
*/
|
*/
|
||||||
public abstract class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine {
|
public class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine {
|
||||||
|
|
||||||
public enum SupportedHeaderLineType {
|
|
||||||
FILTER, ALT;
|
|
||||||
}
|
|
||||||
|
|
||||||
private String name;
|
private String name;
|
||||||
private String description;
|
private Map<String, String> genericFields = new LinkedHashMap<String, String>();
|
||||||
|
|
||||||
// our type of line, i.e. filter, alt, etc
|
|
||||||
private final SupportedHeaderLineType lineType;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* create a VCF filter header line
|
* create a VCF filter header line
|
||||||
*
|
*
|
||||||
* @param name the name for this header line
|
* @param key the key for this header line
|
||||||
* @param description the description for this header line
|
* @param name the name for this header line
|
||||||
* @param lineType the header line type
|
* @param genericFields other fields for this header line
|
||||||
*/
|
*/
|
||||||
public VCFSimpleHeaderLine(String name, String description, SupportedHeaderLineType lineType) {
|
public VCFSimpleHeaderLine(String key, String name, Map<String, String> genericFields) {
|
||||||
super(lineType.toString(), "");
|
super(key, "");
|
||||||
this.lineType = lineType;
|
initialize(name, genericFields);
|
||||||
this.name = name;
|
}
|
||||||
this.description = description;
|
|
||||||
|
|
||||||
if ( name == null || description == null )
|
/**
|
||||||
throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s desc=%s", super.getKey(), name, description ));
|
* create a VCF filter header line
|
||||||
|
*
|
||||||
|
* @param key the key for this header line
|
||||||
|
* @param name the name for this header line
|
||||||
|
* @param description description for this header line
|
||||||
|
*/
|
||||||
|
public VCFSimpleHeaderLine(String key, String name, String description) {
|
||||||
|
super(key, "");
|
||||||
|
Map<String, String> map = new LinkedHashMap<String, String>(1);
|
||||||
|
map.put("Description", description);
|
||||||
|
initialize(name, map);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -44,38 +45,50 @@ public abstract class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFNa
|
||||||
*
|
*
|
||||||
* @param line the header line
|
* @param line the header line
|
||||||
* @param version the vcf header version
|
* @param version the vcf header version
|
||||||
* @param lineType the header line type
|
* @param key the key for this header line
|
||||||
|
* @param expectedTagOrdering the tag ordering expected for this header line
|
||||||
*/
|
*/
|
||||||
protected VCFSimpleHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) {
|
protected VCFSimpleHeaderLine(String line, VCFHeaderVersion version, String key, List<String> expectedTagOrdering) {
|
||||||
super(lineType.toString(), "");
|
super(key, "");
|
||||||
this.lineType = lineType;
|
Map<String, String> mapping = VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrdering);
|
||||||
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description"));
|
|
||||||
name = mapping.get("ID");
|
name = mapping.get("ID");
|
||||||
description = mapping.get("Description");
|
initialize(name, mapping);
|
||||||
if ( description == null && ALLOW_UNBOUND_DESCRIPTIONS ) // handle the case where there's no description provided
|
}
|
||||||
description = UNBOUND_DESCRIPTION;
|
|
||||||
|
protected void initialize(String name, Map<String, String> genericFields) {
|
||||||
|
if ( name == null || genericFields == null || genericFields.isEmpty() )
|
||||||
|
throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s", super.getKey(), name));
|
||||||
|
|
||||||
|
this.name = name;
|
||||||
|
this.genericFields.putAll(genericFields);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String toStringEncoding() {
|
protected String toStringEncoding() {
|
||||||
Map<String,Object> map = new LinkedHashMap<String,Object>();
|
Map<String, Object> map = new LinkedHashMap<String, Object>();
|
||||||
map.put("ID", name);
|
map.put("ID", name);
|
||||||
map.put("Description", description);
|
map.putAll(genericFields);
|
||||||
return lineType.toString() + "=" + VCFHeaderLine.toStringEncoding(map);
|
return getKey() + "=" + VCFHeaderLine.toStringEncoding(map);
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean equals(Object o) {
|
public boolean equals(Object o) {
|
||||||
if ( !(o instanceof VCFSimpleHeaderLine) )
|
if ( !(o instanceof VCFSimpleHeaderLine) )
|
||||||
return false;
|
return false;
|
||||||
VCFSimpleHeaderLine other = (VCFSimpleHeaderLine)o;
|
VCFSimpleHeaderLine other = (VCFSimpleHeaderLine)o;
|
||||||
return name.equals(other.name) &&
|
if ( !name.equals(other.name) || genericFields.size() != other.genericFields.size() )
|
||||||
description.equals(other.description);
|
return false;
|
||||||
|
for ( Map.Entry<String, String> entry : genericFields.entrySet() ) {
|
||||||
|
if ( !entry.getValue().equals(other.genericFields.get(entry.getKey())) )
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getName() {
|
public String getID() {
|
||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getDescription() {
|
public Map<String, String> getGenericFields() {
|
||||||
return description;
|
return genericFields;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -155,10 +155,10 @@ public class VCFUtils {
|
||||||
for ( VCFHeader source : headers ) {
|
for ( VCFHeader source : headers ) {
|
||||||
//System.out.printf("Merging in header %s%n", source);
|
//System.out.printf("Merging in header %s%n", source);
|
||||||
for ( VCFHeaderLine line : source.getMetaData()) {
|
for ( VCFHeaderLine line : source.getMetaData()) {
|
||||||
String key = line.getKey();
|
|
||||||
|
|
||||||
if ( line instanceof VCFNamedHeaderLine)
|
String key = line.getKey();
|
||||||
key = key + "" + ((VCFNamedHeaderLine) line).getName();
|
if ( line instanceof VCFIDHeaderLine )
|
||||||
|
key = key + "-" + ((VCFIDHeaderLine)line).getID();
|
||||||
|
|
||||||
if ( map.containsKey(key) ) {
|
if ( map.containsKey(key) ) {
|
||||||
VCFHeaderLine other = map.get(key);
|
VCFHeaderLine other = map.get(key);
|
||||||
|
|
@ -166,8 +166,8 @@ public class VCFUtils {
|
||||||
continue;
|
continue;
|
||||||
else if ( ! line.getClass().equals(other.getClass()) )
|
else if ( ! line.getClass().equals(other.getClass()) )
|
||||||
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||||
else if ( line instanceof VCFFilterHeaderLine) {
|
else if ( line instanceof VCFFilterHeaderLine ) {
|
||||||
String lineName = ((VCFFilterHeaderLine) line).getName(); String otherName = ((VCFFilterHeaderLine) other).getName();
|
String lineName = ((VCFFilterHeaderLine) line).getID(); String otherName = ((VCFFilterHeaderLine) other).getID();
|
||||||
if ( ! lineName.equals(otherName) )
|
if ( ! lineName.equals(otherName) )
|
||||||
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||||
} else if ( line instanceof VCFCompoundHeaderLine ) {
|
} else if ( line instanceof VCFCompoundHeaderLine ) {
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,7 @@ import net.sf.samtools.Cigar;
|
||||||
import net.sf.samtools.CigarElement;
|
import net.sf.samtools.CigarElement;
|
||||||
import net.sf.samtools.CigarOperator;
|
import net.sf.samtools.CigarOperator;
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager;
|
import org.broadinstitute.sting.gatk.walkers.bqsr.EventType;
|
||||||
import org.broadinstitute.sting.utils.collections.Pair;
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||||
|
|
@ -203,8 +203,8 @@ public class FragmentUtils {
|
||||||
insertionQuals[iii] = secondReadInsertionQuals[iii-firstReadStop];
|
insertionQuals[iii] = secondReadInsertionQuals[iii-firstReadStop];
|
||||||
deletionQuals[iii] = secondReadDeletionQuals[iii-firstReadStop];
|
deletionQuals[iii] = secondReadDeletionQuals[iii-firstReadStop];
|
||||||
}
|
}
|
||||||
returnRead.setBaseQualities( insertionQuals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION );
|
returnRead.setBaseQualities( insertionQuals, EventType.BASE_INSERTION );
|
||||||
returnRead.setBaseQualities( deletionQuals, RecalDataManager.BaseRecalibrationType.BASE_DELETION );
|
returnRead.setBaseQualities( deletionQuals, EventType.BASE_DELETION );
|
||||||
}
|
}
|
||||||
|
|
||||||
final ArrayList<GATKSAMRecord> returnList = new ArrayList<GATKSAMRecord>();
|
final ArrayList<GATKSAMRecord> returnList = new ArrayList<GATKSAMRecord>();
|
||||||
|
|
|
||||||
|
|
@ -177,7 +177,7 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
|
||||||
for (int i = 0; i < reads.size(); i++) {
|
for (int i = 0; i < reads.size(); i++) {
|
||||||
GATKSAMRecord read = reads.get(i);
|
GATKSAMRecord read = reads.get(i);
|
||||||
int offset = offsets.get(i);
|
int offset = offsets.get(i);
|
||||||
pileup.add(createNewPileupElement(read, offset, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important
|
pileup.add(createNewPileupElement(read, offset, false, false, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important
|
||||||
}
|
}
|
||||||
|
|
||||||
return pileup;
|
return pileup;
|
||||||
|
|
@ -196,7 +196,7 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
|
||||||
|
|
||||||
UnifiedPileupElementTracker<PE> pileup = new UnifiedPileupElementTracker<PE>();
|
UnifiedPileupElementTracker<PE> pileup = new UnifiedPileupElementTracker<PE>();
|
||||||
for (GATKSAMRecord read : reads) {
|
for (GATKSAMRecord read : reads) {
|
||||||
pileup.add(createNewPileupElement(read, offset, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important
|
pileup.add(createNewPileupElement(read, offset, false, false, false, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important
|
||||||
}
|
}
|
||||||
|
|
||||||
return pileup;
|
return pileup;
|
||||||
|
|
@ -204,8 +204,8 @@ public abstract class AbstractReadBackedPileup<RBP extends AbstractReadBackedPil
|
||||||
|
|
||||||
protected abstract AbstractReadBackedPileup<RBP, PE> createNewPileup(GenomeLoc loc, PileupElementTracker<PE> pileupElementTracker);
|
protected abstract AbstractReadBackedPileup<RBP, PE> createNewPileup(GenomeLoc loc, PileupElementTracker<PE> pileupElementTracker);
|
||||||
|
|
||||||
protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip);
|
protected abstract PE createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip);
|
||||||
protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip, String nextEventBases, int nextEventLength );
|
protected abstract PE createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip, final String nextEventBases, final int nextEventLength );
|
||||||
|
|
||||||
// --------------------------------------------------------
|
// --------------------------------------------------------
|
||||||
//
|
//
|
||||||
|
|
|
||||||
|
|
@ -48,7 +48,7 @@ public class ExtendedEventPileupElement extends PileupElement {
|
||||||
|
|
||||||
|
|
||||||
public ExtendedEventPileupElement(GATKSAMRecord read, int offset, int eventLength, String eventBases, Type type) {
|
public ExtendedEventPileupElement(GATKSAMRecord read, int offset, int eventLength, String eventBases, Type type) {
|
||||||
super(read, offset, type == Type.DELETION, false, false, false,null,-1); // extended events are slated for removal
|
super(read, offset, type == Type.DELETION, false, false, false, false, false, null, -1); // extended events are slated for removal
|
||||||
this.read = read;
|
this.read = read;
|
||||||
this.offset = offset;
|
this.offset = offset;
|
||||||
this.eventLength = eventLength;
|
this.eventLength = eventLength;
|
||||||
|
|
|
||||||
|
|
@ -21,15 +21,17 @@ public class PileupElement implements Comparable<PileupElement> {
|
||||||
public static final byte T_FOLLOWED_BY_INSERTION_BASE = (byte) 89;
|
public static final byte T_FOLLOWED_BY_INSERTION_BASE = (byte) 89;
|
||||||
public static final byte G_FOLLOWED_BY_INSERTION_BASE = (byte) 90;
|
public static final byte G_FOLLOWED_BY_INSERTION_BASE = (byte) 90;
|
||||||
|
|
||||||
protected final GATKSAMRecord read;
|
protected final GATKSAMRecord read; // the read this base belongs to
|
||||||
protected final int offset;
|
protected final int offset; // the offset in the bases array for this base
|
||||||
protected final boolean isDeletion;
|
protected final boolean isDeletion; // is this base a deletion
|
||||||
protected final boolean isBeforeDeletion;
|
protected final boolean isBeforeDeletion; // is the base to the right of this base a deletion
|
||||||
protected final boolean isBeforeInsertion;
|
protected final boolean isAfterDeletion; // is the base to the left of this base a deletion
|
||||||
protected final boolean isNextToSoftClip;
|
protected final boolean isBeforeInsertion; // is the base to the right of this base an insertion
|
||||||
protected final int eventLength;
|
protected final boolean isAfterInsertion; // is the base to the left of this base an insertion
|
||||||
protected final String eventBases; // if it is a deletion, we do not have information about the actual deleted bases
|
protected final boolean isNextToSoftClip; // is this base either before or after a soft clipped base
|
||||||
// in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases
|
protected final int eventLength; // what is the length of the event (insertion or deletion) *after* this base
|
||||||
|
protected final String eventBases; // if it is a deletion, we do not have information about the actual deleted bases in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -39,7 +41,9 @@ public class PileupElement implements Comparable<PileupElement> {
|
||||||
* @param offset the position in the read for this base. All deletions must be left aligned! (-1 is only allowed for reads starting with insertions)
|
* @param offset the position in the read for this base. All deletions must be left aligned! (-1 is only allowed for reads starting with insertions)
|
||||||
* @param isDeletion whether or not this base is a deletion
|
* @param isDeletion whether or not this base is a deletion
|
||||||
* @param isBeforeDeletion whether or not this base is before a deletion
|
* @param isBeforeDeletion whether or not this base is before a deletion
|
||||||
|
* @param isAfterDeletion whether or not this base is after a deletion
|
||||||
* @param isBeforeInsertion whether or not this base is before an insertion
|
* @param isBeforeInsertion whether or not this base is before an insertion
|
||||||
|
* @param isAfterInsertion whether or not this base is after an insertion
|
||||||
* @param isNextToSoftClip whether or not this base is next to a soft clipped base
|
* @param isNextToSoftClip whether or not this base is next to a soft clipped base
|
||||||
* @param nextEventBases bases in event in case element comes before insertion or deletion
|
* @param nextEventBases bases in event in case element comes before insertion or deletion
|
||||||
* @param nextEventLength length of next event in case it's insertion or deletion
|
* @param nextEventLength length of next event in case it's insertion or deletion
|
||||||
|
|
@ -48,8 +52,7 @@ public class PileupElement implements Comparable<PileupElement> {
|
||||||
"read != null",
|
"read != null",
|
||||||
"offset >= -1",
|
"offset >= -1",
|
||||||
"offset <= read.getReadLength()"})
|
"offset <= read.getReadLength()"})
|
||||||
public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isBeforeInsertion, final boolean isNextToSoftClip,
|
public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip, final String nextEventBases, final int nextEventLength) {
|
||||||
final String nextEventBases, final int nextEventLength) {
|
|
||||||
if (offset < 0 && isDeletion)
|
if (offset < 0 && isDeletion)
|
||||||
throw new ReviewedStingException("Pileup Element cannot create a deletion with a negative offset");
|
throw new ReviewedStingException("Pileup Element cannot create a deletion with a negative offset");
|
||||||
|
|
||||||
|
|
@ -57,20 +60,22 @@ public class PileupElement implements Comparable<PileupElement> {
|
||||||
this.offset = offset;
|
this.offset = offset;
|
||||||
this.isDeletion = isDeletion;
|
this.isDeletion = isDeletion;
|
||||||
this.isBeforeDeletion = isBeforeDeletion;
|
this.isBeforeDeletion = isBeforeDeletion;
|
||||||
|
this.isAfterDeletion = isAfterDeletion;
|
||||||
this.isBeforeInsertion = isBeforeInsertion;
|
this.isBeforeInsertion = isBeforeInsertion;
|
||||||
|
this.isAfterInsertion = isAfterInsertion;
|
||||||
this.isNextToSoftClip = isNextToSoftClip;
|
this.isNextToSoftClip = isNextToSoftClip;
|
||||||
if (isBeforeInsertion)
|
if (isBeforeInsertion)
|
||||||
eventBases = nextEventBases;
|
eventBases = nextEventBases;
|
||||||
else
|
else
|
||||||
eventBases = null; // ignore argument in any other case
|
eventBases = null; // ignore argument in any other case
|
||||||
if (isBeforeDeletion || isBeforeInsertion)
|
if (isBeforeDeletion || isBeforeInsertion)
|
||||||
eventLength = nextEventLength;
|
eventLength = nextEventLength;
|
||||||
else
|
else
|
||||||
eventLength = -1;
|
eventLength = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isBeforeInsertion, final boolean isNextToSoftClip) {
|
public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip) {
|
||||||
this(read,offset, isDeletion, isBeforeDeletion, isBeforeInsertion, isNextToSoftClip, null, -1);
|
this(read,offset, isDeletion, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, -1);
|
||||||
}
|
}
|
||||||
public boolean isDeletion() {
|
public boolean isDeletion() {
|
||||||
return isDeletion;
|
return isDeletion;
|
||||||
|
|
@ -80,10 +85,18 @@ public class PileupElement implements Comparable<PileupElement> {
|
||||||
return isBeforeDeletion;
|
return isBeforeDeletion;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean isAfterDeletion() {
|
||||||
|
return isAfterDeletion;
|
||||||
|
}
|
||||||
|
|
||||||
public boolean isBeforeInsertion() {
|
public boolean isBeforeInsertion() {
|
||||||
return isBeforeInsertion;
|
return isBeforeInsertion;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean isAfterInsertion() {
|
||||||
|
return isAfterInsertion;
|
||||||
|
}
|
||||||
|
|
||||||
public boolean isNextToSoftClip() {
|
public boolean isNextToSoftClip() {
|
||||||
return isNextToSoftClip;
|
return isNextToSoftClip;
|
||||||
}
|
}
|
||||||
|
|
@ -123,14 +136,14 @@ public class PileupElement implements Comparable<PileupElement> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns length of the event (number of inserted or deleted bases
|
* @return length of the event (number of inserted or deleted bases)
|
||||||
*/
|
*/
|
||||||
public int getEventLength() {
|
public int getEventLength() {
|
||||||
return eventLength;
|
return eventLength;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read.
|
* @return actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read.
|
||||||
*/
|
*/
|
||||||
public String getEventBases() {
|
public String getEventBases() {
|
||||||
return eventBases;
|
return eventBases;
|
||||||
|
|
@ -185,13 +198,9 @@ public class PileupElement implements Comparable<PileupElement> {
|
||||||
//
|
//
|
||||||
// --------------------------------------------------------------------------
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
// public boolean isReducedRead() {
|
|
||||||
// return read.isReducedRead();
|
|
||||||
// }
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the number of elements in the pileup element.
|
* Returns the number of elements in the pileup element.
|
||||||
* <p/>
|
*
|
||||||
* Unless this is a reduced read, the number of elements in a pileup element is one. In the event of
|
* Unless this is a reduced read, the number of elements in a pileup element is one. In the event of
|
||||||
* this being a reduced read and a deletion, we return the average number of elements between the left
|
* this being a reduced read and a deletion, we return the average number of elements between the left
|
||||||
* and right elements to the deletion. We assume the deletion to be left aligned.
|
* and right elements to the deletion. We assume the deletion to be left aligned.
|
||||||
|
|
|
||||||
|
|
@ -96,12 +96,11 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup<
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion, boolean isNextToSoftClip) {
|
protected ExtendedEventPileupElement createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip) {
|
||||||
throw new UnsupportedOperationException("Not enough information provided to create a new pileup element");
|
throw new UnsupportedOperationException("Not enough information provided to create a new pileup element");
|
||||||
}
|
}
|
||||||
@Override
|
@Override
|
||||||
protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion,
|
protected ExtendedEventPileupElement createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip, final String nextEventBases, final int nextEventLength ) {
|
||||||
boolean isNextToSoftClip,String nextEventBases, int nextEventLength) {
|
|
||||||
throw new UnsupportedOperationException("Not enough information provided to create a new pileup element");
|
throw new UnsupportedOperationException("Not enough information provided to create a new pileup element");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -56,6 +56,9 @@ public class ReadBackedPileupImpl extends AbstractReadBackedPileup<ReadBackedPil
|
||||||
*
|
*
|
||||||
* @param loc
|
* @param loc
|
||||||
* @param pileup
|
* @param pileup
|
||||||
|
* @param size
|
||||||
|
* @param nDeletions
|
||||||
|
* @param nMQ0Reads
|
||||||
*/
|
*/
|
||||||
public ReadBackedPileupImpl(GenomeLoc loc, List<PileupElement> pileup, int size, int nDeletions, int nMQ0Reads) {
|
public ReadBackedPileupImpl(GenomeLoc loc, List<PileupElement> pileup, int size, int nDeletions, int nMQ0Reads) {
|
||||||
super(loc, pileup, size, nDeletions, nMQ0Reads);
|
super(loc, pileup, size, nDeletions, nMQ0Reads);
|
||||||
|
|
@ -71,13 +74,14 @@ public class ReadBackedPileupImpl extends AbstractReadBackedPileup<ReadBackedPil
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected PileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion,
|
protected PileupElement createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip) {
|
||||||
boolean isNextToSoftClip) {
|
return new PileupElement(read, offset, isDeletion, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, 0);
|
||||||
return new PileupElement(read, offset, isDeletion, isBeforeDeletion, isBeforeInsertion, isNextToSoftClip, null,0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected PileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeDeletion, boolean isBeforeInsertion,
|
@Override
|
||||||
boolean isNextToSoftClip,String nextEventBases, final int nextEventLength) {
|
protected PileupElement createNewPileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeDeletion, final boolean isAfterDeletion, final boolean isBeforeInsertion, final boolean isAfterInsertion, final boolean isNextToSoftClip, final String nextEventBases, final int nextEventLength ) {
|
||||||
return new PileupElement(read, offset, isDeletion, isBeforeDeletion, isBeforeInsertion, isNextToSoftClip, nextEventBases,nextEventLength);
|
return new PileupElement(read, offset, isDeletion, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, nextEventBases, nextEventLength);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -26,10 +26,11 @@
|
||||||
package org.broadinstitute.sting.utils.recalibration;
|
package org.broadinstitute.sting.utils.recalibration;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.bqsr.*;
|
import org.broadinstitute.sting.gatk.walkers.bqsr.*;
|
||||||
|
import org.broadinstitute.sting.utils.BitSetUtils;
|
||||||
import org.broadinstitute.sting.utils.QualityUtils;
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||||
|
|
@ -37,244 +38,334 @@ import org.broadinstitute.sting.utils.text.XReadLines;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.BitSet;
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Utility methods to facilitate on-the-fly base quality score recalibration.
|
* Utility methods to facilitate on-the-fly base quality score recalibration.
|
||||||
*
|
*
|
||||||
* User: rpoplin
|
* User: rpoplin
|
||||||
* Date: 2/4/12
|
* Date: 2/4/12
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class BaseRecalibration {
|
public class BaseRecalibration {
|
||||||
|
|
||||||
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
private ArrayList<HashMap<BitSet, RecalDatum>> collapsedHashes = new ArrayList<HashMap<BitSet, RecalDatum>> (); // All the collapsed data tables
|
||||||
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // List of covariates to be used in this calculation
|
|
||||||
public static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
|
||||||
public static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
|
||||||
public static final String EOF_MARKER = "EOF";
|
|
||||||
private static final int MAX_QUALITY_SCORE = 65; //BUGBUG: what value to use here?
|
|
||||||
private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(...) for all sets of covariate values.
|
|
||||||
|
|
||||||
public BaseRecalibration( final File RECAL_FILE ) {
|
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // List of all covariates to be used in this calculation
|
||||||
|
private final ArrayList<Covariate> requiredCovariates = new ArrayList<Covariate>(); // List of required covariates to be used in this calculation
|
||||||
|
private final ArrayList<Covariate> optionalCovariates = new ArrayList<Covariate>(); // List of optional covariates to be used in this calculation
|
||||||
|
|
||||||
|
public static final Pattern REQUIRED_COVARIATE_PATTERN = Pattern.compile("^# Required Covariates.*");
|
||||||
|
public static final Pattern OPTIONAL_COVARIATE_PATTERN = Pattern.compile("^# Optional Covariates.*");
|
||||||
|
public static final String EOF_MARKER = "EOF";
|
||||||
|
|
||||||
|
private static final byte SMOOTHING_CONSTANT = 1;
|
||||||
|
|
||||||
|
ArrayList<BQSRKeyManager> keyManagers = new ArrayList<BQSRKeyManager>();
|
||||||
|
|
||||||
|
public BaseRecalibration(final File RECAL_FILE) {
|
||||||
// Get a list of all available covariates
|
// Get a list of all available covariates
|
||||||
final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
||||||
|
RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // todo -- initialize with the parameters from the csv file!
|
||||||
|
|
||||||
int lineNumber = 0;
|
int lineNumber = 0;
|
||||||
boolean foundAllCovariates = false;
|
|
||||||
|
boolean foundRequiredCovariates = false;
|
||||||
|
boolean foundOptionalCovariates = false;
|
||||||
|
boolean initializedKeyManagers = false;
|
||||||
|
|
||||||
// Read in the data from the csv file and populate the data map and covariates list
|
// Read in the data from the csv file and populate the data map and covariates list
|
||||||
boolean sawEOF = false;
|
boolean sawEOF = false;
|
||||||
try {
|
try {
|
||||||
for ( String line : new XReadLines(RECAL_FILE) ) {
|
for (String line : new XReadLines(RECAL_FILE)) {
|
||||||
lineNumber++;
|
lineNumber++;
|
||||||
if ( EOF_MARKER.equals(line) ) {
|
|
||||||
sawEOF = true;
|
|
||||||
} else if( COMMENT_PATTERN.matcher(line).matches() ) {
|
|
||||||
; // Skip over the comment lines, (which start with '#')
|
|
||||||
}
|
|
||||||
// Read in the covariates that were used from the input file
|
|
||||||
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
|
|
||||||
if( foundAllCovariates ) {
|
|
||||||
throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
|
|
||||||
} else { // Found the covariate list in input file, loop through all of them and instantiate them
|
|
||||||
String[] vals = line.split(",");
|
|
||||||
for( int iii = 0; iii < vals.length - 4; iii++ ) { // There are n-4 covariates. The last four items are ErrorModel, nObservations, nMismatch, and Qempirical
|
|
||||||
boolean foundClass = false;
|
|
||||||
for( Class<?> covClass : classes ) {
|
|
||||||
if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
|
|
||||||
foundClass = true;
|
|
||||||
try {
|
|
||||||
Covariate covariate = (Covariate)covClass.newInstance();
|
|
||||||
requestedCovariates.add( covariate );
|
|
||||||
} catch (Exception e) {
|
|
||||||
throw new DynamicClassResolutionException(covClass, e);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
sawEOF = EOF_MARKER.equals(line);
|
||||||
|
if (sawEOF)
|
||||||
|
break;
|
||||||
|
|
||||||
|
boolean requiredCovariatesLine = REQUIRED_COVARIATE_PATTERN.matcher(line).matches();
|
||||||
|
boolean optionalCovariatesLine = OPTIONAL_COVARIATE_PATTERN.matcher(line).matches();
|
||||||
|
|
||||||
|
if (requiredCovariatesLine && foundRequiredCovariates)
|
||||||
|
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Duplicate required covariates line");
|
||||||
|
|
||||||
|
if (optionalCovariatesLine && foundOptionalCovariates)
|
||||||
|
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Duplicate optional covariates line");
|
||||||
|
|
||||||
|
if (optionalCovariatesLine && !foundRequiredCovariates)
|
||||||
|
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Optional covariates reported before Required covariates");
|
||||||
|
|
||||||
|
if (requiredCovariatesLine || optionalCovariatesLine) {
|
||||||
|
String [] covariateNames = line.split(": ")[1].split(","); // take the second half of the string (past the ": ") and split it by "," to get the list of covariate names (required or optional, depending on the line)
|
||||||
|
|
||||||
|
List<Covariate> covariateList = requiredCovariatesLine ? requiredCovariates : optionalCovariates; // set the appropriate covariate list to update
|
||||||
|
|
||||||
|
for (String covariateName : covariateNames) {
|
||||||
|
boolean foundClass = false;
|
||||||
|
for (Class<?> covClass : classes) {
|
||||||
|
if ((covariateName + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) {
|
||||||
|
foundClass = true;
|
||||||
|
try {
|
||||||
|
Covariate covariate = (Covariate) covClass.newInstance();
|
||||||
|
covariate.initialize(RAC);
|
||||||
|
requestedCovariates.add(covariate);
|
||||||
|
covariateList.add(covariate);
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new DynamicClassResolutionException(covClass, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if( !foundClass ) {
|
|
||||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
if (!foundClass)
|
||||||
|
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (covariateName + "Covariate") + ") isn't a valid covariate option.");
|
||||||
}
|
}
|
||||||
|
foundRequiredCovariates = foundRequiredCovariates || requiredCovariatesLine;
|
||||||
|
foundOptionalCovariates = foundOptionalCovariates || optionalCovariatesLine;
|
||||||
|
}
|
||||||
|
|
||||||
} else { // Found a line of data
|
else if (!line.startsWith("#")) { // if this is not a comment line that we don't care about, it is DATA!
|
||||||
if( !foundAllCovariates ) {
|
if (!foundRequiredCovariates || !foundOptionalCovariates) // At this point all the covariates should have been found and initialized
|
||||||
foundAllCovariates = true;
|
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE);
|
||||||
|
|
||||||
// At this point all the covariates should have been found and initialized
|
if (!initializedKeyManagers) {
|
||||||
if( requestedCovariates.size() < 2 ) {
|
ArrayList<Covariate> emptyList = new ArrayList<Covariate>(0);
|
||||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE );
|
ArrayList<Covariate> requiredCovariatesUpToThis = new ArrayList<Covariate>(); // Initialize one key manager for each table of required covariates
|
||||||
|
for (Covariate covariate : requiredCovariates) { // Every required covariate table includes all preceding required covariates (e.g. RG ; RG,Q )
|
||||||
|
requiredCovariatesUpToThis.add(covariate);
|
||||||
|
keyManagers.add(new BQSRKeyManager(requiredCovariatesUpToThis, emptyList));
|
||||||
}
|
}
|
||||||
|
keyManagers.add(new BQSRKeyManager(requiredCovariates, optionalCovariates)); // One master key manager for the collapsed tables
|
||||||
final boolean createCollapsedTables = true;
|
|
||||||
|
initializedKeyManagers = true;
|
||||||
// Initialize any covariate member variables using the shared argument collection
|
|
||||||
RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
|
||||||
for( Covariate cov : requestedCovariates ) {
|
|
||||||
cov.initialize( RAC );
|
|
||||||
}
|
|
||||||
// Initialize the data hashMaps
|
|
||||||
dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() );
|
|
||||||
|
|
||||||
}
|
}
|
||||||
addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap
|
addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} catch ( FileNotFoundException e ) {
|
} catch (FileNotFoundException e) {
|
||||||
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
|
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
|
||||||
} catch ( NumberFormatException e ) {
|
} catch (NumberFormatException e) {
|
||||||
throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
|
throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( !sawEOF ) {
|
if (!sawEOF) {
|
||||||
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool.";
|
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool.";
|
||||||
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
|
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
|
||||||
}
|
}
|
||||||
|
|
||||||
if( dataManager == null ) {
|
generateEmpiricalQualities(SMOOTHING_CONSTANT);
|
||||||
throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?");
|
|
||||||
}
|
|
||||||
|
|
||||||
dataManager.generateEmpiricalQualities( 1, MAX_QUALITY_SCORE );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
|
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
|
||||||
|
*
|
||||||
|
* @param file The CSV file we read the line from (for exception throwing purposes)
|
||||||
* @param line A line of CSV data read from the recalibration table data file
|
* @param line A line of CSV data read from the recalibration table data file
|
||||||
*/
|
*/
|
||||||
private void addCSVData(final File file, final String line) {
|
private void addCSVData(final File file, final String line) {
|
||||||
final String[] vals = line.split(",");
|
final String[] vals = line.split(",");
|
||||||
|
boolean hasOptionalCovariates = optionalCovariates.size() > 0; // Do we have optional covariates in this key?
|
||||||
|
int addOptionalCovariates = hasOptionalCovariates ? 2 : 0; // If we have optional covariates at all, add two to the size of the array (to accommodate the covariate and the id)
|
||||||
|
final Object[] key = new Object[requiredCovariates.size() + addOptionalCovariates + 1]; // Reserve enough space for the required covariates, optional covariate, id and eventType
|
||||||
|
|
||||||
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
|
int indexCovariateValue = key.length - 3; // In the order of keys, the optional covariate comes right after the required covariates
|
||||||
if( vals.length != requestedCovariates.size() + 4 ) { // +4 because of ErrorModel, nObservations, nMismatch, and Qempirical
|
int indexCovariateID = key.length - 2; // followed by the covariate ID
|
||||||
throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
|
int indexEventType = key.length - 1; // and the event type
|
||||||
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
|
|
||||||
|
addKeysToArray(key, vals, requiredCovariates, 0); // Add the required covariates keys
|
||||||
|
|
||||||
|
if (hasOptionalCovariates) {
|
||||||
|
key[indexCovariateID] = Short.parseShort(vals[indexCovariateID]); // Add the optional covariate ID
|
||||||
|
Covariate covariate = optionalCovariates.get((Short) key[indexCovariateID]); // Get the covariate object for this ID
|
||||||
|
key[indexCovariateValue] = covariate.getValue(vals[indexCovariateValue]); // Add the optional covariate value, given the ID
|
||||||
}
|
}
|
||||||
|
key[indexEventType] = EventType.eventFrom(vals[indexEventType]); // Add the event type
|
||||||
|
|
||||||
final Object[] key = new Object[requestedCovariates.size()];
|
int datumIndex = key.length; // The recal datum starts at the end of the key (after the event type)
|
||||||
Covariate cov;
|
long count = Long.parseLong(vals[datumIndex]); // Number of observations
|
||||||
int iii;
|
long errors = Long.parseLong(vals[datumIndex + 1]); // Number of errors observed
|
||||||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
double reportedQual = Double.parseDouble(vals[1]); // The reported Q score --> todo -- I don't like having the Q score hard coded in vals[1]. Generalize it!
|
||||||
cov = requestedCovariates.get( iii );
|
final RecalDatum datum = new RecalDatum(count, errors, reportedQual, 0.0); // Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||||
key[iii] = cov.getValue( vals[iii] );
|
|
||||||
}
|
|
||||||
final String modelString = vals[iii++];
|
|
||||||
final RecalDataManager.BaseRecalibrationType errorModel = CovariateKeySet.getErrorModelFromString(modelString);
|
|
||||||
|
|
||||||
// Create a new datum using the number of observations, number of mismatches, and reported quality score
|
addToAllTables(key, datum); // Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||||
final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
|
|
||||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
|
||||||
|
|
||||||
dataManager.addToAllTables( key, datum, QualityUtils.MIN_USABLE_Q_SCORE, errorModel ); //BUGBUG: used to be Q5 now is Q6, probably doesn't matter
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void recalibrateRead( final GATKSAMRecord read ) {
|
/**
|
||||||
|
* Add the given mapping to all of the collapsed hash tables
|
||||||
|
*
|
||||||
|
* @param key The list of comparables that is the key for this mapping
|
||||||
|
* @param fullDatum The RecalDatum which is the data for this mapping
|
||||||
|
*/
|
||||||
|
private void addToAllTables(final Object[] key, final RecalDatum fullDatum) {
|
||||||
|
int nHashes = requiredCovariates.size(); // We will always need one hash per required covariate
|
||||||
|
if (optionalCovariates.size() > 0) // If we do have optional covariates
|
||||||
|
nHashes += 1; // we will need one extra hash table with the optional covariate encoded in the key set on top of the required covariates
|
||||||
|
|
||||||
|
|
||||||
|
for (int hashIndex = 0; hashIndex < nHashes; hashIndex++) {
|
||||||
|
HashMap<BitSet, RecalDatum> table; // object to hold the hash table we are going to manipulate
|
||||||
|
if (hashIndex >= collapsedHashes.size()) { // if we haven't yet created the collapsed hash table for this index, create it now!
|
||||||
|
table = new HashMap<BitSet, RecalDatum>();
|
||||||
|
collapsedHashes.add(table); // Because this is the only place where we add tables to the ArrayList, they will always be in the order we want.
|
||||||
|
}
|
||||||
|
else
|
||||||
|
table = collapsedHashes.get(hashIndex); // if the table has been previously created, just assign it to the "table" object for manipulation
|
||||||
|
|
||||||
|
int copyTo = hashIndex + 1; // this will copy the covariates up to the index of the one we are including now (1 for RG, 2 for QS,...)
|
||||||
|
if (copyTo > requiredCovariates.size()) // only in the case where we have optional covariates we need to increase the size of the array
|
||||||
|
copyTo = requiredCovariates.size() + 2; // if we have optional covariates, add the optional covariate and its id to the size of the key
|
||||||
|
Object[] tableKey = new Object[copyTo + 1]; // create a new array that will hold as many keys as hashIndex (1 for RG hash, 2 for QualityScore hash, 3 for covariate hash), plus the event type
|
||||||
|
System.arraycopy(key, 0, tableKey, 0, copyTo); // copy the keys for the corresponding covariates into the tableKey.
|
||||||
|
tableKey[tableKey.length-1] = key[key.length - 1]; // add the event type. The event type is always the last key, on both key sets.
|
||||||
|
|
||||||
|
BitSet hashKey = keyManagers.get(hashIndex).bitSetFromKey(tableKey); // Add bitset key with fullDatum to the appropriate hash
|
||||||
|
RecalDatum datum = table.get(hashKey);
|
||||||
|
if (datum == null)
|
||||||
|
datum = fullDatum;
|
||||||
|
else if (hashIndex == 0) // Special case for the ReadGroup covariate
|
||||||
|
datum.combine(fullDatum);
|
||||||
|
else
|
||||||
|
datum.increment(fullDatum);
|
||||||
|
table.put(hashKey, datum);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score
|
||||||
|
* that will be used in the sequential calculation in TableRecalibrationWalker
|
||||||
|
*
|
||||||
|
* @param smoothing The smoothing parameter that goes into empirical quality score calculation
|
||||||
|
*/
|
||||||
|
private void generateEmpiricalQualities(final int smoothing) {
|
||||||
|
for (final HashMap<BitSet, RecalDatum> table : collapsedHashes)
|
||||||
|
for (final RecalDatum datum : table.values())
|
||||||
|
datum.calcCombinedEmpiricalQuality(smoothing, QualityUtils.MAX_QUAL_SCORE);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
public void recalibrateRead(final GATKSAMRecord read) {
|
||||||
//compute all covariate values for this read
|
//compute all covariate values for this read
|
||||||
RecalDataManager.computeCovariates(read, requestedCovariates);
|
RecalDataManager.computeCovariates(read, requestedCovariates);
|
||||||
final CovariateKeySet covariateKeySet = RecalDataManager.getAllCovariateValuesFor( read );
|
final ReadCovariates readCovariates = RecalDataManager.covariateKeySetFrom(read);
|
||||||
|
|
||||||
for( final RecalDataManager.BaseRecalibrationType errorModel : RecalDataManager.BaseRecalibrationType.values() ) {
|
for (final EventType errorModel : EventType.values()) {
|
||||||
final byte[] originalQuals = read.getBaseQualities( errorModel );
|
final byte[] originalQuals = read.getBaseQualities(errorModel);
|
||||||
final byte[] recalQuals = originalQuals.clone();
|
final byte[] recalQuals = originalQuals.clone();
|
||||||
|
|
||||||
// For each base in the read
|
// For each base in the read
|
||||||
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
|
for (int offset = 0; offset < read.getReadLength(); offset++) {
|
||||||
|
final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel);
|
||||||
final Object[] fullCovariateKeyWithErrorMode = covariateKeySet.getKeySet(offset, errorModel);
|
final byte qualityScore = performSequentialQualityCalculation(keySet, errorModel);
|
||||||
final Object[] fullCovariateKey = Arrays.copyOfRange(fullCovariateKeyWithErrorMode, 0, fullCovariateKeyWithErrorMode.length-1); // need to strip off the error mode which was appended to the list of covariates
|
|
||||||
|
|
||||||
// BUGBUG: This caching seems to put the entire key set into memory which negates the benefits of storing the delta delta tables?
|
|
||||||
//Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKeyWithErrorMode);
|
|
||||||
//if( qualityScore == null ) {
|
|
||||||
final byte qualityScore = performSequentialQualityCalculation( errorModel, fullCovariateKey );
|
|
||||||
// qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKeyWithErrorMode);
|
|
||||||
//}
|
|
||||||
|
|
||||||
recalQuals[offset] = qualityScore;
|
recalQuals[offset] = qualityScore;
|
||||||
}
|
}
|
||||||
|
|
||||||
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
|
preserveQScores(originalQuals, recalQuals); // Overwrite the work done if original quality score is too low
|
||||||
read.setBaseQualities( recalQuals, errorModel );
|
read.setBaseQualities(recalQuals, errorModel);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Implements a serial recalibration of the reads using the combinational table.
|
* Implements a serial recalibration of the reads using the combinational table.
|
||||||
* First, we perform a positional recalibration, and then a subsequent dinuc correction.
|
* First, we perform a positional recalibration, and then a subsequent dinuc correction.
|
||||||
*
|
*
|
||||||
* Given the full recalibration table, we perform the following preprocessing steps:
|
* Given the full recalibration table, we perform the following preprocessing steps:
|
||||||
*
|
*
|
||||||
* - calculate the global quality score shift across all data [DeltaQ]
|
* - calculate the global quality score shift across all data [DeltaQ]
|
||||||
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
|
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
|
||||||
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
|
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
|
||||||
* - The final shift equation is:
|
* - The final shift equation is:
|
||||||
|
*
|
||||||
|
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
|
||||||
|
*
|
||||||
|
* todo -- I extremely dislike the way all this math is hardcoded... should rethink the data structures for this method in particular.
|
||||||
*
|
*
|
||||||
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
|
|
||||||
* @param key The list of Comparables that were calculated from the covariates
|
* @param key The array of BitSet keys that were calculated from the covariates
|
||||||
|
* @param errorModel the event type
|
||||||
* @return A recalibrated quality score as a byte
|
* @return A recalibrated quality score as a byte
|
||||||
*/
|
*/
|
||||||
private byte performSequentialQualityCalculation( final RecalDataManager.BaseRecalibrationType errorModel, final Object... key ) {
|
private byte performSequentialQualityCalculation(BitSet[] key, EventType errorModel) {
|
||||||
|
final byte qualFromRead = (byte) BitSetUtils.shortFrom(key[1]);
|
||||||
final byte qualFromRead = (byte)Integer.parseInt(key[1].toString());
|
|
||||||
final Object[] readGroupCollapsedKey = new Object[1];
|
final int readGroupKeyIndex = 0;
|
||||||
final Object[] qualityScoreCollapsedKey = new Object[2];
|
final int qualKeyIndex = 1;
|
||||||
final Object[] covariateCollapsedKey = new Object[3];
|
final int covariatesKeyIndex = 2;
|
||||||
|
|
||||||
// The global quality shift (over the read group only)
|
// The global quality shift (over the read group only)
|
||||||
readGroupCollapsedKey[0] = key[0];
|
List<BitSet> bitKeys = keyManagers.get(readGroupKeyIndex).bitSetsFromAllKeys(key, errorModel);
|
||||||
final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0, errorModel).get( readGroupCollapsedKey ));
|
if (bitKeys.size() > 1)
|
||||||
|
throw new ReviewedStingException("There should only be one key for the RG collapsed table, something went wrong here");
|
||||||
|
|
||||||
|
final RecalDatum globalRecalDatum = collapsedHashes.get(readGroupKeyIndex).get(bitKeys.get(0));
|
||||||
double globalDeltaQ = 0.0;
|
double globalDeltaQ = 0.0;
|
||||||
if( globalRecalDatum != null ) {
|
if (globalRecalDatum != null) {
|
||||||
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
|
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
|
||||||
final double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
|
final double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
|
||||||
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
|
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
|
||||||
}
|
}
|
||||||
|
|
||||||
// The shift in quality between reported and empirical
|
// The shift in quality between reported and empirical
|
||||||
qualityScoreCollapsedKey[0] = key[0];
|
bitKeys = keyManagers.get(qualKeyIndex).bitSetsFromAllKeys(key, errorModel);
|
||||||
qualityScoreCollapsedKey[1] = key[1];
|
if (bitKeys.size() > 1)
|
||||||
final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1, errorModel).get( qualityScoreCollapsedKey ));
|
throw new ReviewedStingException("There should only be one key for the Qual collapsed table, something went wrong here");
|
||||||
|
|
||||||
|
final RecalDatum qReportedRecalDatum = collapsedHashes.get(qualKeyIndex).get(bitKeys.get(0));
|
||||||
double deltaQReported = 0.0;
|
double deltaQReported = 0.0;
|
||||||
if( qReportedRecalDatum != null ) {
|
if (qReportedRecalDatum != null) {
|
||||||
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
|
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
|
||||||
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
||||||
}
|
}
|
||||||
|
|
||||||
// The shift in quality due to each covariate by itself in turn
|
// The shift in quality due to each covariate by itself in turn
|
||||||
|
bitKeys = keyManagers.get(covariatesKeyIndex).bitSetsFromAllKeys(key, errorModel);
|
||||||
double deltaQCovariates = 0.0;
|
double deltaQCovariates = 0.0;
|
||||||
double deltaQCovariateEmpirical;
|
double deltaQCovariateEmpirical;
|
||||||
covariateCollapsedKey[0] = key[0];
|
for (BitSet k : bitKeys) {
|
||||||
covariateCollapsedKey[1] = key[1];
|
final RecalDatum covariateRecalDatum = collapsedHashes.get(covariatesKeyIndex).get(k);
|
||||||
for( int iii = 2; iii < key.length; iii++ ) {
|
if (covariateRecalDatum != null) {
|
||||||
covariateCollapsedKey[2] = key[iii]; // The given covariate
|
|
||||||
final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii, errorModel).get( covariateCollapsedKey ));
|
|
||||||
if( covariateRecalDatum != null ) {
|
|
||||||
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
|
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
|
||||||
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
|
deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
|
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
|
||||||
return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE );
|
return QualityUtils.boundQual((int) Math.round(newQuality), QualityUtils.MAX_QUAL_SCORE);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
|
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
|
||||||
|
*
|
||||||
* @param originalQuals The list of original base quality scores
|
* @param originalQuals The list of original base quality scores
|
||||||
* @param recalQuals A list of the new recalibrated quality scores
|
* @param recalQuals A list of the new recalibrated quality scores
|
||||||
*/
|
*/
|
||||||
private void preserveQScores( final byte[] originalQuals, final byte[] recalQuals ) {
|
private void preserveQScores(final byte[] originalQuals, final byte[] recalQuals) {
|
||||||
for( int iii = 0; iii < recalQuals.length; iii++ ) {
|
for (int iii = 0; iii < recalQuals.length; iii++) {
|
||||||
if( originalQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ) { //BUGBUG: used to be Q5 now is Q6, probably doesn't matter
|
if (originalQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE) { //BUGBUG: used to be Q5 now is Q6, probably doesn't matter
|
||||||
recalQuals[iii] = originalQuals[iii];
|
recalQuals[iii] = originalQuals[iii];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Shared functionality to add keys
|
||||||
|
*
|
||||||
|
* @param array the target array we are creating the keys in
|
||||||
|
* @param keys the actual keys we're using as a source
|
||||||
|
* @param covariateList the covariate list to loop through
|
||||||
|
* @param keyIndex the index in the keys and the arrays objects to run from
|
||||||
|
*/
|
||||||
|
private void addKeysToArray(final Object[] array, final String[] keys, List<Covariate> covariateList, int keyIndex) {
|
||||||
|
for (Covariate covariate : covariateList) {
|
||||||
|
array[keyIndex] = covariate.getValue(keys[keyIndex]);
|
||||||
|
keyIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.sam;
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||||
|
|
@ -233,7 +234,17 @@ public class ArtificialSAMUtils {
|
||||||
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar);
|
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static GATKSAMRecord createArtificialRead(Cigar cigar) {
|
||||||
|
int length = cigar.getReadLength();
|
||||||
|
byte [] base = {'A'};
|
||||||
|
byte [] qual = {30};
|
||||||
|
byte [] bases = Utils.arrayFromArrayWithLength(base, length);
|
||||||
|
byte [] quals = Utils.arrayFromArrayWithLength(qual, length);
|
||||||
|
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||||
|
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, quals, cigar.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public final static List<GATKSAMRecord> createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
|
public final static List<GATKSAMRecord> createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
|
||||||
GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen);
|
GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen);
|
||||||
GATKSAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen);
|
GATKSAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen);
|
||||||
|
|
@ -361,10 +372,10 @@ public class ArtificialSAMUtils {
|
||||||
final GATKSAMRecord left = pair.get(0);
|
final GATKSAMRecord left = pair.get(0);
|
||||||
final GATKSAMRecord right = pair.get(1);
|
final GATKSAMRecord right = pair.get(1);
|
||||||
|
|
||||||
pileupElements.add(new PileupElement(left, pos - leftStart, false, false, false, false));
|
pileupElements.add(new PileupElement(left, pos - leftStart, false, false, false, false, false, false));
|
||||||
|
|
||||||
if (pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd()) {
|
if (pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd()) {
|
||||||
pileupElements.add(new PileupElement(right, pos - rightStart, false, false, false, false));
|
pileupElements.add(new PileupElement(right, pos - rightStart, false, false, false, false, false, false));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,7 @@
|
||||||
package org.broadinstitute.sting.utils.sam;
|
package org.broadinstitute.sting.utils.sam;
|
||||||
|
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager;
|
import org.broadinstitute.sting.gatk.walkers.bqsr.EventType;
|
||||||
import org.broadinstitute.sting.utils.NGSPlatform;
|
import org.broadinstitute.sting.utils.NGSPlatform;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
|
@ -165,7 +165,7 @@ public class GATKSAMRecord extends BAMRecord {
|
||||||
/**
|
/**
|
||||||
* Setters and Accessors for base insertion and base deletion quality scores
|
* Setters and Accessors for base insertion and base deletion quality scores
|
||||||
*/
|
*/
|
||||||
public void setBaseQualities( final byte[] quals, final RecalDataManager.BaseRecalibrationType errorModel ) {
|
public void setBaseQualities( final byte[] quals, final EventType errorModel ) {
|
||||||
switch( errorModel ) {
|
switch( errorModel ) {
|
||||||
case BASE_SUBSTITUTION:
|
case BASE_SUBSTITUTION:
|
||||||
setBaseQualities(quals);
|
setBaseQualities(quals);
|
||||||
|
|
@ -181,7 +181,7 @@ public class GATKSAMRecord extends BAMRecord {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public byte[] getBaseQualities( final RecalDataManager.BaseRecalibrationType errorModel ) {
|
public byte[] getBaseQualities( final EventType errorModel ) {
|
||||||
switch( errorModel ) {
|
switch( errorModel ) {
|
||||||
case BASE_SUBSTITUTION:
|
case BASE_SUBSTITUTION:
|
||||||
return getBaseQualities();
|
return getBaseQualities();
|
||||||
|
|
@ -204,7 +204,7 @@ public class GATKSAMRecord extends BAMRecord {
|
||||||
quals = new byte[getBaseQualities().length];
|
quals = new byte[getBaseQualities().length];
|
||||||
Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will
|
Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will
|
||||||
// be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
|
// be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
|
||||||
setBaseQualities(quals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION);
|
setBaseQualities(quals, EventType.BASE_INSERTION);
|
||||||
}
|
}
|
||||||
return quals;
|
return quals;
|
||||||
}
|
}
|
||||||
|
|
@ -213,9 +213,9 @@ public class GATKSAMRecord extends BAMRecord {
|
||||||
byte[] quals = SAMUtils.fastqToPhred( getStringAttribute( BQSR_BASE_DELETION_QUALITIES ) );
|
byte[] quals = SAMUtils.fastqToPhred( getStringAttribute( BQSR_BASE_DELETION_QUALITIES ) );
|
||||||
if( quals == null ) {
|
if( quals == null ) {
|
||||||
quals = new byte[getBaseQualities().length];
|
quals = new byte[getBaseQualities().length];
|
||||||
Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will
|
Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will
|
||||||
// be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
|
// be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
|
||||||
setBaseQualities(quals, RecalDataManager.BaseRecalibrationType.BASE_DELETION);
|
setBaseQualities(quals, EventType.BASE_DELETION);
|
||||||
}
|
}
|
||||||
return quals;
|
return quals;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.sam;
|
||||||
import com.google.java.contract.Ensures;
|
import com.google.java.contract.Ensures;
|
||||||
import com.google.java.contract.Requires;
|
import com.google.java.contract.Requires;
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.collections.Pair;
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
|
|
@ -495,7 +496,7 @@ public class ReadUtils {
|
||||||
/**
|
/**
|
||||||
* Is a base inside a read?
|
* Is a base inside a read?
|
||||||
*
|
*
|
||||||
* @param read the read to evaluate
|
* @param read the read to evaluate
|
||||||
* @param referenceCoordinate the reference coordinate of the base to test
|
* @param referenceCoordinate the reference coordinate of the base to test
|
||||||
* @return true if it is inside the read, false otherwise.
|
* @return true if it is inside the read, false otherwise.
|
||||||
*/
|
*/
|
||||||
|
|
@ -541,9 +542,9 @@ public class ReadUtils {
|
||||||
*
|
*
|
||||||
* See getCoverageDistributionOfRead for information on how the coverage is calculated.
|
* See getCoverageDistributionOfRead for information on how the coverage is calculated.
|
||||||
*
|
*
|
||||||
* @param list the list of reads covering the region
|
* @param list the list of reads covering the region
|
||||||
* @param startLocation the first reference coordinate of the region (inclusive)
|
* @param startLocation the first reference coordinate of the region (inclusive)
|
||||||
* @param stopLocation the last reference coordinate of the region (inclusive)
|
* @param stopLocation the last reference coordinate of the region (inclusive)
|
||||||
* @return an array with the coverage of each position from startLocation to stopLocation
|
* @return an array with the coverage of each position from startLocation to stopLocation
|
||||||
*/
|
*/
|
||||||
public static int [] getCoverageDistributionOfReads(List<GATKSAMRecord> list, int startLocation, int stopLocation) {
|
public static int [] getCoverageDistributionOfReads(List<GATKSAMRecord> list, int startLocation, int stopLocation) {
|
||||||
|
|
@ -563,9 +564,9 @@ public class ReadUtils {
|
||||||
* Note: This function counts DELETIONS as coverage (since the main purpose is to downsample
|
* Note: This function counts DELETIONS as coverage (since the main purpose is to downsample
|
||||||
* reads for variant regions, and deletions count as variants)
|
* reads for variant regions, and deletions count as variants)
|
||||||
*
|
*
|
||||||
* @param read the read to get the coverage distribution of
|
* @param read the read to get the coverage distribution of
|
||||||
* @param startLocation the first reference coordinate of the region (inclusive)
|
* @param startLocation the first reference coordinate of the region (inclusive)
|
||||||
* @param stopLocation the last reference coordinate of the region (inclusive)
|
* @param stopLocation the last reference coordinate of the region (inclusive)
|
||||||
* @return an array with the coverage of each position from startLocation to stopLocation
|
* @return an array with the coverage of each position from startLocation to stopLocation
|
||||||
*/
|
*/
|
||||||
public static int [] getCoverageDistributionOfRead(GATKSAMRecord read, int startLocation, int stopLocation) {
|
public static int [] getCoverageDistributionOfRead(GATKSAMRecord read, int startLocation, int stopLocation) {
|
||||||
|
|
@ -611,9 +612,9 @@ public class ReadUtils {
|
||||||
* Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with value==true meaning it contributes to the coverage.
|
* Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with value==true meaning it contributes to the coverage.
|
||||||
* Example: Read => {true, true, false, ... false}
|
* Example: Read => {true, true, false, ... false}
|
||||||
*
|
*
|
||||||
* @param readList the list of reads to generate the association mappings
|
* @param readList the list of reads to generate the association mappings
|
||||||
* @param startLocation the first reference coordinate of the region (inclusive)
|
* @param startLocation the first reference coordinate of the region (inclusive)
|
||||||
* @param stopLocation the last reference coordinate of the region (inclusive)
|
* @param stopLocation the last reference coordinate of the region (inclusive)
|
||||||
* @return the two hashmaps described above
|
* @return the two hashmaps described above
|
||||||
*/
|
*/
|
||||||
public static Pair<HashMap<Integer, HashSet<GATKSAMRecord>> , HashMap<GATKSAMRecord, Boolean[]>> getBothReadToLociMappings (List<GATKSAMRecord> readList, int startLocation, int stopLocation) {
|
public static Pair<HashMap<Integer, HashSet<GATKSAMRecord>> , HashMap<GATKSAMRecord, Boolean[]>> getBothReadToLociMappings (List<GATKSAMRecord> readList, int startLocation, int stopLocation) {
|
||||||
|
|
@ -622,7 +623,6 @@ public class ReadUtils {
|
||||||
HashMap<Integer, HashSet<GATKSAMRecord>> locusToReadMap = new HashMap<Integer, HashSet<GATKSAMRecord>>(2*(stopLocation - startLocation + 1), 0.5f);
|
HashMap<Integer, HashSet<GATKSAMRecord>> locusToReadMap = new HashMap<Integer, HashSet<GATKSAMRecord>>(2*(stopLocation - startLocation + 1), 0.5f);
|
||||||
HashMap<GATKSAMRecord, Boolean[]> readToLocusMap = new HashMap<GATKSAMRecord, Boolean[]>(2*readList.size(), 0.5f);
|
HashMap<GATKSAMRecord, Boolean[]> readToLocusMap = new HashMap<GATKSAMRecord, Boolean[]>(2*readList.size(), 0.5f);
|
||||||
|
|
||||||
|
|
||||||
for (int i = startLocation; i <= stopLocation; i++)
|
for (int i = startLocation; i <= stopLocation; i++)
|
||||||
locusToReadMap.put(i, new HashSet<GATKSAMRecord>()); // Initialize the locusToRead map with empty lists
|
locusToReadMap.put(i, new HashSet<GATKSAMRecord>()); // Initialize the locusToRead map with empty lists
|
||||||
|
|
||||||
|
|
@ -631,7 +631,7 @@ public class ReadUtils {
|
||||||
|
|
||||||
int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation);
|
int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation);
|
||||||
|
|
||||||
for (int i=0; i<readCoverage.length; i++) {
|
for (int i = 0; i < readCoverage.length; i++) {
|
||||||
int refLocation = i + startLocation;
|
int refLocation = i + startLocation;
|
||||||
if (readCoverage[i] > 0) {
|
if (readCoverage[i] > 0) {
|
||||||
// Update the hash for this locus
|
// Update the hash for this locus
|
||||||
|
|
@ -649,6 +649,55 @@ public class ReadUtils {
|
||||||
return new Pair<HashMap<Integer, HashSet<GATKSAMRecord>>, HashMap<GATKSAMRecord, Boolean[]>>(locusToReadMap, readToLocusMap);
|
return new Pair<HashMap<Integer, HashSet<GATKSAMRecord>>, HashMap<GATKSAMRecord, Boolean[]>>(locusToReadMap, readToLocusMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create random read qualities
|
||||||
|
*
|
||||||
|
* @param length the length of the read
|
||||||
|
* @return an array with randomized base qualities between 0 (inclusive) and 50 (exclusive)
|
||||||
|
*/
|
||||||
|
public static byte[] createRandomReadQuals(int length) {
|
||||||
|
Random random = GenomeAnalysisEngine.getRandomGenerator();
|
||||||
|
byte[] quals = new byte[length];
|
||||||
|
for (int i = 0; i < length; i++)
|
||||||
|
quals[i] = (byte) random.nextInt(50);
|
||||||
|
return quals;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create random read bases
|
||||||
|
*
|
||||||
|
* @param length the length of the read
|
||||||
|
* @param allowNs whether or not to allow N's in the read
|
||||||
|
* @return an array with randomized bases (A, C, G, T, plus N when allowNs is true), each chosen with equal probability
|
||||||
|
*/
|
||||||
|
public static byte[] createRandomReadBases(int length, boolean allowNs) {
|
||||||
|
Random random = GenomeAnalysisEngine.getRandomGenerator();
|
||||||
|
int numberOfBases = allowNs ? 5 : 4;
|
||||||
|
byte[] bases = new byte[length];
|
||||||
|
for (int i = 0; i < length; i++) {
|
||||||
|
switch (random.nextInt(numberOfBases)) {
|
||||||
|
case 0:
|
||||||
|
bases[i] = 'A';
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
bases[i] = 'C';
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
bases[i] = 'G';
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
bases[i] = 'T';
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
bases[i] = 'N';
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw new ReviewedStingException("Something went wrong, this is just impossible");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return bases;
|
||||||
|
}
|
||||||
|
|
||||||
public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) {
|
public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) {
|
||||||
String[] sequenceRecordNames = new String[sequenceDictionary.size()];
|
String[] sequenceRecordNames = new String[sequenceDictionary.size()];
|
||||||
int sequenceRecordIndex = 0;
|
int sequenceRecordIndex = 0;
|
||||||
|
|
@ -656,4 +705,5 @@ public class ReadUtils {
|
||||||
sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName();
|
sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName();
|
||||||
return Arrays.deepToString(sequenceRecordNames);
|
return Arrays.deepToString(sequenceRecordNames);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -25,13 +25,10 @@
|
||||||
package org.broadinstitute.sting.utils.variantcontext;
|
package org.broadinstitute.sting.utils.variantcontext;
|
||||||
|
|
||||||
import org.broad.tribble.TribbleException;
|
import org.broad.tribble.TribbleException;
|
||||||
import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
|
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||||
import org.jgrapht.util.MathUtil;
|
|
||||||
|
|
||||||
import java.util.EnumMap;
|
import java.util.EnumMap;
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
public class GenotypeLikelihoods {
|
public class GenotypeLikelihoods {
|
||||||
public static final boolean CAP_PLS = false;
|
public static final boolean CAP_PLS = false;
|
||||||
|
|
@ -201,4 +198,118 @@ public class GenotypeLikelihoods {
|
||||||
|
|
||||||
return s.toString();
|
return s.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// -------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Static conversion utilities, going from GL/PL index to allele index and vice versa.
|
||||||
|
//
|
||||||
|
// -------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class representing the 2 alleles (or rather their indexes into VariantContext.getAllele()) corresponding to a specific PL index.
|
||||||
|
* Note that the reference allele is always index=0.
|
||||||
|
*/
|
||||||
|
public static class GenotypeLikelihoodsAllelePair {
|
||||||
|
public final int alleleIndex1, alleleIndex2;
|
||||||
|
|
||||||
|
public GenotypeLikelihoodsAllelePair(final int alleleIndex1, final int alleleIndex2) {
|
||||||
|
this.alleleIndex1 = alleleIndex1;
|
||||||
|
this.alleleIndex2 = alleleIndex2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles
|
||||||
|
*/
|
||||||
|
private static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = new GenotypeLikelihoodsAllelePair[]{ new GenotypeLikelihoodsAllelePair(0, 0) };
|
||||||
|
|
||||||
|
private static void calculatePLcache(final int minIndex) {
|
||||||
|
// how many alternate alleles do we need to calculate for?
|
||||||
|
int altAlleles = 0;
|
||||||
|
int numLikelihoods = 1;
|
||||||
|
while ( numLikelihoods <= minIndex ) {
|
||||||
|
altAlleles++;
|
||||||
|
numLikelihoods += altAlleles + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
PLIndexToAlleleIndex = new GenotypeLikelihoodsAllelePair[numLikelihoods];
|
||||||
|
|
||||||
|
// for all possible combinations of 2 alleles
|
||||||
|
for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) {
|
||||||
|
for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) {
|
||||||
|
PLIndexToAlleleIndex[calculatePLindex(allele1, allele2)] = new GenotypeLikelihoodsAllelePair(allele1, allele2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// how many likelihoods are associated with the given number of alternate alleles?
|
||||||
|
public static int calculateNumLikelihoods(int numAltAlleles) {
|
||||||
|
int numLikelihoods = 1;
|
||||||
|
for ( int i = 1; i <= numAltAlleles; i++ )
|
||||||
|
numLikelihoods += i + 1;
|
||||||
|
return numLikelihoods;
|
||||||
|
}
|
||||||
|
|
||||||
|
// As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j.
|
||||||
|
// In other words, for biallelic sites the ordering is: AA,AB,BB; for triallelic sites the ordering is: AA,AB,BB,AC,BC,CC, etc."
|
||||||
|
// Assumes that allele1Index <= allele2Index
|
||||||
|
public static int calculatePLindex(final int allele1Index, final int allele2Index) {
|
||||||
|
return (allele2Index * (allele2Index+1) / 2) + allele1Index;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the allele index pair for the given PL
|
||||||
|
*
|
||||||
|
* @param PLindex the PL index
|
||||||
|
* @return the allele index pair
|
||||||
|
*/
|
||||||
|
public static GenotypeLikelihoodsAllelePair getAllelePair(final int PLindex) {
|
||||||
|
// make sure that we've cached enough data
|
||||||
|
if ( PLindex >= PLIndexToAlleleIndex.length )
|
||||||
|
calculatePLcache(PLindex);
|
||||||
|
|
||||||
|
return PLIndexToAlleleIndex[PLindex];
|
||||||
|
}
|
||||||
|
|
||||||
|
// An index conversion from the deprecated PL ordering to the new VCF-based ordering for up to 3 alternate alleles
|
||||||
|
protected static int[] PLindexConversion = new int[]{0, 1, 3, 6, 2, 4, 7, 5, 8, 9};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the allele index pair for the given PL using the deprecated PL ordering:
|
||||||
|
* AA,AB,AC,AD,BB,BC,BD,CC,CD,DD instead of AA,AB,BB,AC,BC,CC,AD,BD,CD,DD.
|
||||||
|
* Although it's painful to keep this conversion around, our DiploidSNPGenotypeLikelihoods class uses the deprecated
|
||||||
|
* ordering and I know with certainty that external users have built code on top of it; changing it now would
|
||||||
|
* cause a whole lot of heartache for our collaborators, so for now at least there's a standard conversion method.
|
||||||
|
* This method assumes at most 3 alternate alleles.
|
||||||
|
* TODO -- address this issue at the source by updating DiploidSNPGenotypeLikelihoods.
|
||||||
|
*
|
||||||
|
* @param PLindex the PL index
|
||||||
|
* @return the allele index pair
|
||||||
|
*/
|
||||||
|
public static GenotypeLikelihoodsAllelePair getAllelePairUsingDeprecatedOrdering(final int PLindex) {
|
||||||
|
final int convertedIndex = PLindexConversion[PLindex];
|
||||||
|
|
||||||
|
// make sure that we've cached enough data
|
||||||
|
if ( convertedIndex >= PLIndexToAlleleIndex.length )
|
||||||
|
calculatePLcache(convertedIndex);
|
||||||
|
|
||||||
|
return PLIndexToAlleleIndex[convertedIndex];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the PL indexes (AA, AB, BB) for the given allele pair; assumes allele1Index <= allele2Index.
|
||||||
|
*
|
||||||
|
* @param allele1Index the index in VariantContext.getAllele() of the first allele
|
||||||
|
* @param allele2Index the index in VariantContext.getAllele() of the second allele
|
||||||
|
* @return the PL indexes
|
||||||
|
*/
|
||||||
|
public static int[] getPLIndecesOfAlleles(final int allele1Index, final int allele2Index) {
|
||||||
|
|
||||||
|
final int[] indexes = new int[3];
|
||||||
|
indexes[0] = calculatePLindex(allele1Index, allele1Index);
|
||||||
|
indexes[1] = calculatePLindex(allele1Index, allele2Index);
|
||||||
|
indexes[2] = calculatePLindex(allele2Index, allele2Index);
|
||||||
|
return indexes;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -656,12 +656,21 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
||||||
return alleles.get(i+1);
|
return alleles.get(i+1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param other VariantContext whose alleles to compare against
|
||||||
|
* @return true if this VariantContext has the same alleles (both ref and alts) as other,
|
||||||
|
* regardless of ordering. Otherwise returns false.
|
||||||
|
*/
|
||||||
|
public boolean hasSameAllelesAs ( final VariantContext other ) {
|
||||||
|
return hasSameAlternateAllelesAs(other) && other.getReference().equals(getReference(), false);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param other VariantContext whose alternate alleles to compare against
|
* @param other VariantContext whose alternate alleles to compare against
|
||||||
* @return true if this VariantContext has the same alternate alleles as other,
|
* @return true if this VariantContext has the same alternate alleles as other,
|
||||||
* regardless of ordering. Otherwise returns false.
|
* regardless of ordering. Otherwise returns false.
|
||||||
*/
|
*/
|
||||||
public boolean hasSameAlternateAllelesAs ( VariantContext other ) {
|
public boolean hasSameAlternateAllelesAs ( final VariantContext other ) {
|
||||||
List<Allele> thisAlternateAlleles = getAlternateAlleles();
|
List<Allele> thisAlternateAlleles = getAlternateAlleles();
|
||||||
List<Allele> otherAlternateAlleles = other.getAlternateAlleles();
|
List<Allele> otherAlternateAlleles = other.getAlternateAlleles();
|
||||||
|
|
||||||
|
|
@ -1246,40 +1255,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
||||||
return best;
|
return best;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int[] getGLIndecesOfAllele(Allele inputAllele) {
|
public int[] getGLIndecesOfAlternateAllele(Allele targetAllele) {
|
||||||
|
|
||||||
// TODO -- this information is cached statically by the UnifiedGenotyperEngine; pull it out into a common utils class for all to use
|
int index = 1;
|
||||||
|
for ( Allele allele : getAlternateAlleles() ) {
|
||||||
int[] idxVector = new int[3];
|
if ( allele.equals(targetAllele) )
|
||||||
int numAlleles = this.getAlleles().size();
|
|
||||||
|
|
||||||
int idxDiag = numAlleles;
|
|
||||||
int incr = numAlleles - 1;
|
|
||||||
int k=1;
|
|
||||||
for (Allele a: getAlternateAlleles()) {
|
|
||||||
// multi-allelic approximation, part 1: Ideally
|
|
||||||
// for each alt allele compute marginal (suboptimal) posteriors -
|
|
||||||
// compute indices for AA,AB,BB for current allele - genotype likelihoods are a linear vector that can be thought of
|
|
||||||
// as a row-wise upper triangular matrix of likelihoods.
|
|
||||||
// So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC.
|
|
||||||
// 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD
|
|
||||||
|
|
||||||
int idxAA = 0;
|
|
||||||
int idxAB = k++;
|
|
||||||
// yy is always element on the diagonal.
|
|
||||||
// 2 alleles: BBelement 2
|
|
||||||
// 3 alleles: BB element 3. CC element 5
|
|
||||||
// 4 alleles:
|
|
||||||
int idxBB = idxDiag;
|
|
||||||
|
|
||||||
if (a.equals(inputAllele)) {
|
|
||||||
idxVector[0] = idxAA;
|
|
||||||
idxVector[1] = idxAB;
|
|
||||||
idxVector[2] = idxBB;
|
|
||||||
break;
|
break;
|
||||||
}
|
index++;
|
||||||
idxDiag += incr--;
|
|
||||||
}
|
}
|
||||||
return idxVector;
|
|
||||||
|
return GenotypeLikelihoods.getPLIndecesOfAlleles(0, index);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,7 @@ import com.google.java.contract.*;
|
||||||
import org.broad.tribble.Feature;
|
import org.broad.tribble.Feature;
|
||||||
import org.broad.tribble.TribbleException;
|
import org.broad.tribble.TribbleException;
|
||||||
import org.broad.tribble.util.ParsingUtils;
|
import org.broad.tribble.util.ParsingUtils;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
|
@ -344,6 +345,21 @@ public class VariantContextBuilder {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tells us that the resulting VariantContext should have the specified location
|
||||||
|
* @param loc the genomic location whose contig, start, and stop are copied into this builder
|
||||||
|
* @return this builder
|
||||||
|
*/
|
||||||
|
@Requires({"loc.getContig() != null", "loc.getStart() >= 0", "loc.getStop() >= 0"})
|
||||||
|
public VariantContextBuilder loc(final GenomeLoc loc) {
|
||||||
|
this.contig = loc.getContig();
|
||||||
|
this.start = loc.getStart();
|
||||||
|
this.stop = loc.getStop();
|
||||||
|
toValidate.add(VariantContext.Validation.ALLELES);
|
||||||
|
toValidate.add(VariantContext.Validation.REF_PADDING);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tells us that the resulting VariantContext should have the specified contig chr
|
* Tells us that the resulting VariantContext should have the specified contig chr
|
||||||
* @param contig
|
* @param contig
|
||||||
|
|
|
||||||
|
|
@ -458,7 +458,7 @@ public class VariantContextUtils {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
|
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
|
||||||
* If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
|
* If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
|
||||||
* the sample name
|
* the sample name
|
||||||
*
|
*
|
||||||
* @param genomeLocParser loc parser
|
* @param genomeLocParser loc parser
|
||||||
|
|
@ -492,11 +492,11 @@ public class VariantContextUtils {
|
||||||
if ( genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE )
|
if ( genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE )
|
||||||
verifyUniqueSampleNames(unsortedVCs);
|
verifyUniqueSampleNames(unsortedVCs);
|
||||||
|
|
||||||
List<VariantContext> prepaddedVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions);
|
final List<VariantContext> prepaddedVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions);
|
||||||
// Make sure all variant contexts are padded with reference base in case of indels if necessary
|
// Make sure all variant contexts are padded with reference base in case of indels if necessary
|
||||||
List<VariantContext> VCs = new ArrayList<VariantContext>();
|
final List<VariantContext> VCs = new ArrayList<VariantContext>();
|
||||||
|
|
||||||
for (VariantContext vc : prepaddedVCs) {
|
for (final VariantContext vc : prepaddedVCs) {
|
||||||
// also a reasonable place to remove filtered calls, if needed
|
// also a reasonable place to remove filtered calls, if needed
|
||||||
if ( ! filteredAreUncalled || vc.isNotFiltered() )
|
if ( ! filteredAreUncalled || vc.isNotFiltered() )
|
||||||
VCs.add(createVariantContextWithPaddedAlleles(vc, false));
|
VCs.add(createVariantContextWithPaddedAlleles(vc, false));
|
||||||
|
|
@ -531,7 +531,7 @@ public class VariantContextUtils {
|
||||||
|
|
||||||
// cycle through and add info from the other VCs, making sure the loc/reference matches
|
// cycle through and add info from the other VCs, making sure the loc/reference matches
|
||||||
|
|
||||||
for ( VariantContext vc : VCs ) {
|
for ( final VariantContext vc : VCs ) {
|
||||||
if ( loc.getStart() != vc.getStart() ) // || !first.getReference().equals(vc.getReference()) )
|
if ( loc.getStart() != vc.getStart() ) // || !first.getReference().equals(vc.getReference()) )
|
||||||
throw new ReviewedStingException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString());
|
throw new ReviewedStingException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString());
|
||||||
|
|
||||||
|
|
@ -581,13 +581,13 @@ public class VariantContextUtils {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (Map.Entry<String, Object> p : vc.getAttributes().entrySet()) {
|
for (final Map.Entry<String, Object> p : vc.getAttributes().entrySet()) {
|
||||||
String key = p.getKey();
|
String key = p.getKey();
|
||||||
// if we don't like the key already, don't go anywhere
|
// if we don't like the key already, don't go anywhere
|
||||||
if ( ! inconsistentAttributes.contains(key) ) {
|
if ( ! inconsistentAttributes.contains(key) ) {
|
||||||
boolean alreadyFound = attributes.containsKey(key);
|
boolean alreadyFound = attributes.containsKey(key);
|
||||||
Object boundValue = attributes.get(key);
|
Object boundValue = attributes.get(key);
|
||||||
boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4);
|
final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4);
|
||||||
|
|
||||||
if ( alreadyFound && ! boundValue.equals(p.getValue()) && ! boundIsMissingValue ) {
|
if ( alreadyFound && ! boundValue.equals(p.getValue()) && ! boundIsMissingValue ) {
|
||||||
// we found the value but we're inconsistent, put it in the exclude list
|
// we found the value but we're inconsistent, put it in the exclude list
|
||||||
|
|
@ -604,7 +604,7 @@ public class VariantContextUtils {
|
||||||
|
|
||||||
// if we have more alternate alleles in the merged VC than in one or more of the
|
// if we have more alternate alleles in the merged VC than in one or more of the
|
||||||
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF
|
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF
|
||||||
for ( VariantContext vc : VCs ) {
|
for ( final VariantContext vc : VCs ) {
|
||||||
if (vc.alleles.size() == 1)
|
if (vc.alleles.size() == 1)
|
||||||
continue;
|
continue;
|
||||||
if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) {
|
if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) {
|
||||||
|
|
@ -634,11 +634,11 @@ public class VariantContextUtils {
|
||||||
setValue = MERGE_INTERSECTION;
|
setValue = MERGE_INTERSECTION;
|
||||||
else if ( nFiltered == VCs.size() ) // everything was filtered out
|
else if ( nFiltered == VCs.size() ) // everything was filtered out
|
||||||
setValue = MERGE_FILTER_IN_ALL;
|
setValue = MERGE_FILTER_IN_ALL;
|
||||||
else if ( variantSources.isEmpty() ) // everyone was reference
|
else if ( variantSources.isEmpty() ) // everyone was reference
|
||||||
setValue = MERGE_REF_IN_ALL;
|
setValue = MERGE_REF_IN_ALL;
|
||||||
else {
|
else {
|
||||||
LinkedHashSet<String> s = new LinkedHashSet<String>();
|
final LinkedHashSet<String> s = new LinkedHashSet<String>();
|
||||||
for ( VariantContext vc : VCs )
|
for ( final VariantContext vc : VCs )
|
||||||
if ( vc.isVariant() )
|
if ( vc.isVariant() )
|
||||||
s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() );
|
s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() );
|
||||||
setValue = Utils.join("-", s);
|
setValue = Utils.join("-", s);
|
||||||
|
|
@ -663,7 +663,7 @@ public class VariantContextUtils {
|
||||||
builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes);
|
builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes);
|
||||||
|
|
||||||
// Trim the padded bases of all alleles if necessary
|
// Trim the padded bases of all alleles if necessary
|
||||||
VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make());
|
final VariantContext merged = createVariantContextWithTrimmedAlleles(builder.make());
|
||||||
if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged);
|
if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged);
|
||||||
return merged;
|
return merged;
|
||||||
}
|
}
|
||||||
|
|
@ -724,7 +724,7 @@ public class VariantContextUtils {
|
||||||
|
|
||||||
Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
|
Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
|
||||||
|
|
||||||
for (Allele a : inputVC.getAlleles()) {
|
for (final Allele a : inputVC.getAlleles()) {
|
||||||
if (a.isSymbolic()) {
|
if (a.isSymbolic()) {
|
||||||
alleles.add(a);
|
alleles.add(a);
|
||||||
originalToTrimmedAlleleMap.put(a, a);
|
originalToTrimmedAlleleMap.put(a, a);
|
||||||
|
|
@ -741,11 +741,9 @@ public class VariantContextUtils {
|
||||||
// example: mixed records such as {TA*,TGA,TG}
|
// example: mixed records such as {TA*,TGA,TG}
|
||||||
boolean hasNullAlleles = false;
|
boolean hasNullAlleles = false;
|
||||||
|
|
||||||
for (Allele a: originalToTrimmedAlleleMap.values()) {
|
for (final Allele a: originalToTrimmedAlleleMap.values()) {
|
||||||
if (a.isNull())
|
if (a.isNull())
|
||||||
hasNullAlleles = true;
|
hasNullAlleles = true;
|
||||||
if (a.isReference())
|
|
||||||
refAllele = a;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!hasNullAlleles)
|
if (!hasNullAlleles)
|
||||||
|
|
@ -755,7 +753,7 @@ public class VariantContextUtils {
|
||||||
|
|
||||||
List<Allele> originalAlleles = genotype.getAlleles();
|
List<Allele> originalAlleles = genotype.getAlleles();
|
||||||
List<Allele> trimmedAlleles = new ArrayList<Allele>();
|
List<Allele> trimmedAlleles = new ArrayList<Allele>();
|
||||||
for ( Allele a : originalAlleles ) {
|
for ( final Allele a : originalAlleles ) {
|
||||||
if ( a.isCalled() )
|
if ( a.isCalled() )
|
||||||
trimmedAlleles.add(originalToTrimmedAlleleMap.get(a));
|
trimmedAlleles.add(originalToTrimmedAlleleMap.get(a));
|
||||||
else
|
else
|
||||||
|
|
@ -837,7 +835,6 @@ public class VariantContextUtils {
|
||||||
public AlleleMapper(Map<Allele, Allele> map) { this.map = map; }
|
public AlleleMapper(Map<Allele, Allele> map) { this.map = map; }
|
||||||
public boolean needsRemapping() { return this.map != null; }
|
public boolean needsRemapping() { return this.map != null; }
|
||||||
public Collection<Allele> values() { return map != null ? map.values() : vc.getAlleles(); }
|
public Collection<Allele> values() { return map != null ? map.values() : vc.getAlleles(); }
|
||||||
|
|
||||||
public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; }
|
public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; }
|
||||||
|
|
||||||
public List<Allele> remap(List<Allele> as) {
|
public List<Allele> remap(List<Allele> as) {
|
||||||
|
|
|
||||||
|
|
@ -47,6 +47,7 @@ import java.util.*;
|
||||||
|
|
||||||
public class WalkerTest extends BaseTest {
|
public class WalkerTest extends BaseTest {
|
||||||
private static final boolean ENABLE_PHONE_HOME_FOR_TESTS = false;
|
private static final boolean ENABLE_PHONE_HOME_FOR_TESTS = false;
|
||||||
|
private static final boolean ENABLE_ON_THE_FLY_CHECK_FOR_VCF_INDEX = false;
|
||||||
|
|
||||||
@BeforeMethod
|
@BeforeMethod
|
||||||
public void initializeRandomGenerator() {
|
public void initializeRandomGenerator() {
|
||||||
|
|
@ -58,6 +59,9 @@ public class WalkerTest extends BaseTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void maybeValidateSupplementaryFile(final String name, final File resultFile) {
|
public void maybeValidateSupplementaryFile(final String name, final File resultFile) {
|
||||||
|
if ( !ENABLE_ON_THE_FLY_CHECK_FOR_VCF_INDEX )
|
||||||
|
return;
|
||||||
|
|
||||||
File indexFile = Tribble.indexFile(resultFile);
|
File indexFile = Tribble.indexFile(resultFile);
|
||||||
//System.out.println("Putative index file is " + indexFile);
|
//System.out.println("Putative index file is " + indexFile);
|
||||||
if ( indexFile.exists() ) {
|
if ( indexFile.exists() ) {
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,60 @@
|
||||||
|
package org.broadinstitute.sting.gatk.filters;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||||
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
import org.testng.Assert;
|
||||||
|
import org.testng.annotations.BeforeClass;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks that the Bad Cigar filter works for all kinds of wonky cigars
|
||||||
|
*
|
||||||
|
* @author Mauricio Carneiro
|
||||||
|
* @since 3/20/12
|
||||||
|
*/
|
||||||
|
public class BadCigarFilterUnitTest {
|
||||||
|
|
||||||
|
BadCigarFilter filter;
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public void init() {
|
||||||
|
filter = new BadCigarFilter();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testWonkyCigars () {
|
||||||
|
byte[] bases = {'A', 'A', 'A', 'A'};
|
||||||
|
byte[] quals = {30, 30, 30, 30};
|
||||||
|
GATKSAMRecord read;
|
||||||
|
// starting with multiple deletions
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2D4M");
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M2D"); // ending with multiple deletions
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "3M1I1D"); // adjacent indels AND ends in deletion
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1D2M"); // adjacent indels I->D
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1D2I1M"); // adjacent indels D->I
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I2M1D"); // ends in single deletion with insertion in the middle
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "4M1D"); // ends in single deletion
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1D4M"); // starts with single deletion
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "2M1D1D2M"); // adjacent D's
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(bases, quals, "1M1I1I1M"); // adjacent I's
|
||||||
|
Assert.assertTrue(filter.filterOut(read), read.getCigarString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -80,11 +80,15 @@ public class GATKReportUnitTest extends BaseTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testSimpleGATKReport() {
|
public void testSimpleGATKReport() {
|
||||||
GATKReport report = GATKReport.newSimpleReport("TableName", "a", "b", "Roger", "is", "Awesome");
|
// Create a new simple GATK report named "TableName" with columns: Roger, is, and Awesome
|
||||||
report.addRow("a", 'F', 12, 23.45, true);
|
GATKReport report = GATKReport.newSimpleReport("TableName", "Roger", "is", "Awesome");
|
||||||
report.addRow("ans", '3', 24.5, 456L, 2345);
|
|
||||||
report.addRow("hi", null, null, "", 2.3);
|
|
||||||
|
|
||||||
|
// Add data to simple GATK report
|
||||||
|
report.addRow( 12, 23.45, true);
|
||||||
|
report.addRow("ans", '3', 24.5);
|
||||||
|
report.addRow("hi", "", 2.3);
|
||||||
|
|
||||||
|
// Print the report to console
|
||||||
//report.print(System.out);
|
//report.print(System.out);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.activeregionqc;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.WalkerTest;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests CountReadsInActiveRegions
|
||||||
|
*/
|
||||||
|
public class CountReadsInActiveRegionsIntegrationTest extends WalkerTest {
|
||||||
|
@Test
|
||||||
|
public void basicTest() {
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
"-T CountReadsInActiveRegions -R " + b37KGReference + " -I " + b37GoodNA12878BAM + " -L 20:10,000,000-10,200,000 -o %s",
|
||||||
|
1,
|
||||||
|
Arrays.asList("fcd581aa6befe85c7297509fa7b34edf"));
|
||||||
|
executeTest("CountReadsInActiveRegions:", spec);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,29 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author Mauricio Carneiro
|
||||||
|
* @since 3/7/12
|
||||||
|
*/
|
||||||
|
public class BQSRGathererUnitTest {
|
||||||
|
RecalibrationArgumentCollection RAC;
|
||||||
|
|
||||||
|
private static File recal1 = new File("public/testdata/exampleCSV.csv");
|
||||||
|
private static File recal2 = new File("public/testdata/exampleCSV.2.csv");
|
||||||
|
|
||||||
|
@Test(enabled = false)
|
||||||
|
public void testCombineTwoFiles() {
|
||||||
|
BQSRGatherer gatherer = new BQSRGatherer();
|
||||||
|
List<File> recalFiles = new LinkedList<File> ();
|
||||||
|
File output = new File("foo.csv");
|
||||||
|
|
||||||
|
recalFiles.add(recal1);
|
||||||
|
recalFiles.add(recal2);
|
||||||
|
gatherer.gather(recalFiles, output);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1,9 +1,11 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
import org.broadinstitute.sting.utils.clipping.ClippingRepresentation;
|
||||||
|
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||||
import org.testng.Assert;
|
import org.testng.Assert;
|
||||||
import org.testng.annotations.BeforeClass;
|
import org.testng.annotations.BeforeClass;
|
||||||
import org.testng.annotations.Test;
|
import org.testng.annotations.Test;
|
||||||
|
|
@ -12,37 +14,13 @@ import java.util.BitSet;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Short one line description of the walker.
|
|
||||||
*
|
|
||||||
* <p>
|
|
||||||
* [Long description of the walker]
|
|
||||||
* </p>
|
|
||||||
*
|
|
||||||
*
|
|
||||||
* <h2>Input</h2>
|
|
||||||
* <p>
|
|
||||||
* [Description of the Input]
|
|
||||||
* </p>
|
|
||||||
*
|
|
||||||
* <h2>Output</h2>
|
|
||||||
* <p>
|
|
||||||
* [Description of the Output]
|
|
||||||
* </p>
|
|
||||||
*
|
|
||||||
* <h2>Examples</h2>
|
|
||||||
* <pre>
|
|
||||||
* java
|
|
||||||
* -jar GenomeAnalysisTK.jar
|
|
||||||
* -T [walker name]
|
|
||||||
* </pre>
|
|
||||||
*
|
|
||||||
* @author Mauricio Carneiro
|
* @author Mauricio Carneiro
|
||||||
* @since 3/1/12
|
* @since 3/1/12
|
||||||
*/
|
*/
|
||||||
public class ContextCovariateUnitTest {
|
public class ContextCovariateUnitTest {
|
||||||
ContextCovariate covariate;
|
ContextCovariate covariate;
|
||||||
RecalibrationArgumentCollection RAC;
|
RecalibrationArgumentCollection RAC;
|
||||||
Random random;
|
Random random;
|
||||||
|
|
||||||
@BeforeClass
|
@BeforeClass
|
||||||
public void init() {
|
public void init() {
|
||||||
|
|
@ -55,49 +33,33 @@ public class ContextCovariateUnitTest {
|
||||||
|
|
||||||
@Test(enabled = true)
|
@Test(enabled = true)
|
||||||
public void testSimpleContexts() {
|
public void testSimpleContexts() {
|
||||||
byte [] quals = createRandomReadQuals(101);
|
byte[] quals = ReadUtils.createRandomReadQuals(10000);
|
||||||
byte [] bbases = createRandomReadBases(101);
|
byte[] bbases = ReadUtils.createRandomReadBases(10000, true);
|
||||||
String bases = stringFrom(bbases);
|
|
||||||
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
|
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
|
||||||
|
GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS);
|
||||||
CovariateValues values = covariate.getValues(read);
|
CovariateValues values = covariate.getValues(read);
|
||||||
verifyCovariateArray((BitSet []) values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, bases);
|
verifyCovariateArray(values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases()));
|
||||||
verifyCovariateArray((BitSet []) values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, bases);
|
verifyCovariateArray(values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases()));
|
||||||
verifyCovariateArray((BitSet []) values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, bases);
|
verifyCovariateArray(values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, stringFrom(clippedRead.getReadBases()));
|
||||||
}
|
}
|
||||||
|
|
||||||
private void verifyCovariateArray(BitSet[] values, int contextSize, String bases) {
|
private void verifyCovariateArray(BitSet[] values, int contextSize, String bases) {
|
||||||
for (int i=0; i<values.length; i++) {
|
for (int i = 0; i < values.length; i++) {
|
||||||
if (i >= contextSize)
|
String expectedContext = null;
|
||||||
Assert.assertEquals(MathUtils.dnaFrom(values[i]), bases.substring(i-contextSize, i));
|
if (i >= contextSize) {
|
||||||
else
|
String context = bases.substring(i - contextSize, i);
|
||||||
Assert.assertNull(values[i]);
|
if (!context.contains("N"))
|
||||||
|
expectedContext = context;
|
||||||
|
}
|
||||||
|
Assert.assertEquals(covariate.keyFromBitSet(values[i]), expectedContext);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private String stringFrom(byte [] array) {
|
private String stringFrom(byte[] array) {
|
||||||
String s = "";
|
String s = "";
|
||||||
for (byte value : array)
|
for (byte value : array)
|
||||||
s += (char) value;
|
s += (char) value;
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
private byte [] createRandomReadQuals(int length) {
|
|
||||||
byte [] quals = new byte[length];
|
|
||||||
for (int i=0; i<length; i++)
|
|
||||||
quals[i] = (byte) random.nextInt(50);
|
|
||||||
return quals;
|
|
||||||
}
|
|
||||||
|
|
||||||
private byte [] createRandomReadBases(int length) {
|
|
||||||
byte [] bases = new byte[length];
|
|
||||||
for (int i=0; i<length; i++) {
|
|
||||||
switch(random.nextInt(4)) {
|
|
||||||
case 0: bases[i] = 'A'; break;
|
|
||||||
case 1: bases[i] = 'C'; break;
|
|
||||||
case 2: bases[i] = 'G'; break;
|
|
||||||
case 3: bases[i] = 'T'; break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return bases;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,59 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||||
|
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||||
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||||
|
import org.testng.Assert;
|
||||||
|
import org.testng.annotations.BeforeClass;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import java.util.BitSet;
|
||||||
|
import java.util.Random;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author Mauricio Carneiro
|
||||||
|
* @since 3/1/12
|
||||||
|
*/
|
||||||
|
public class CycleCovariateUnitTest {
|
||||||
|
CycleCovariate covariate;
|
||||||
|
RecalibrationArgumentCollection RAC;
|
||||||
|
Random random;
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public void init() {
|
||||||
|
RAC = new RecalibrationArgumentCollection();
|
||||||
|
covariate = new CycleCovariate();
|
||||||
|
random = GenomeAnalysisEngine.getRandomGenerator();
|
||||||
|
covariate.initialize(RAC);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(enabled = true)
|
||||||
|
public void testSimpleCycles() {
|
||||||
|
short readLength = 10;
|
||||||
|
byte[] quals = ReadUtils.createRandomReadQuals(readLength);
|
||||||
|
byte[] bbases = ReadUtils.createRandomReadBases(readLength, true);
|
||||||
|
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M");
|
||||||
|
read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID"));
|
||||||
|
read.getReadGroup().setPlatform("illumina");
|
||||||
|
|
||||||
|
CovariateValues values = covariate.getValues(read);
|
||||||
|
verifyCovariateArray(values.getMismatches(), (short) 1, (short) 1);
|
||||||
|
|
||||||
|
read.setReadNegativeStrandFlag(true);
|
||||||
|
values = covariate.getValues(read);
|
||||||
|
verifyCovariateArray(values.getMismatches(), readLength, (short) -1);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private void verifyCovariateArray(BitSet[] values, short init, short increment) {
|
||||||
|
for (short i = 0; i < values.length; i++) {
|
||||||
|
short actual = Short.decode(covariate.keyFromBitSet(values[i]));
|
||||||
|
int expected = init + (increment * i);
|
||||||
|
// System.out.println(String.format("%d: %d, %d", i, actual, expected));
|
||||||
|
Assert.assertEquals(actual, expected);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -50,8 +50,8 @@ public class DiffObjectsIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
@DataProvider(name = "data")
|
@DataProvider(name = "data")
|
||||||
public Object[][] createData() {
|
public Object[][] createData() {
|
||||||
new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "dac62fcd25e1052bf18b5707700dda7e");
|
new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "dba5eab2b9587c1062721b164e4fd9a6");
|
||||||
new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "e10c48dd294fb257802d4e73bb50580d");
|
new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "de35c93450b46db5fc5516af3c55d62a");
|
||||||
return TestParams.getTests(TestParams.class);
|
return TestParams.getTests(TestParams.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -27,8 +27,8 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
|
||||||
BB1 = new double[]{-20.0, -20.0, 0.0};
|
BB1 = new double[]{-20.0, -20.0, 0.0};
|
||||||
AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0};
|
AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0};
|
||||||
AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0};
|
AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0};
|
||||||
AC2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0};
|
AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0};
|
||||||
BB2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0};
|
BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0};
|
||||||
BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0};
|
BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0};
|
||||||
CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0};
|
CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0};
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -5,8 +5,10 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.testng.annotations.Test;
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
// ********************************************************************************** //
|
// ********************************************************************************** //
|
||||||
|
|
@ -60,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
public void testMultipleSNPAlleles() {
|
public void testMultipleSNPAlleles() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1,
|
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1,
|
||||||
Arrays.asList("5af005255240a2186f04cb50851b8b6f"));
|
Arrays.asList("0de4aeed6a52f08ed86a7642c812478b"));
|
||||||
executeTest("test Multiple SNP alleles", spec);
|
executeTest("test Multiple SNP alleles", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -277,52 +279,53 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testWithIndelAllelesPassedIn1() {
|
public void testWithIndelAllelesPassedIn1() {
|
||||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
|
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
|
||||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
|
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
|
||||||
Arrays.asList("9cd08dc412a007933381e9c76c073899"));
|
Arrays.asList("9cd08dc412a007933381e9c76c073899"));
|
||||||
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1);
|
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testWithIndelAllelesPassedIn2() {
|
public void testWithIndelAllelesPassedIn2() {
|
||||||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
|
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
|
||||||
+ validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
|
+ validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
|
||||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
|
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
|
||||||
Arrays.asList("5ef1f007d3ef77c1b8f31e5e036eff53"));
|
Arrays.asList("5ef1f007d3ef77c1b8f31e5e036eff53"));
|
||||||
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2);
|
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testWithIndelAllelesPassedIn3() {
|
public void testMultiSampleIndels() {
|
||||||
|
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||||
|
baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
|
||||||
|
Arrays.asList("52340d578a708fa709b69ce48987bc9d"));
|
||||||
|
List<File> result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
|
||||||
|
|
||||||
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation +
|
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
|
||||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1,
|
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
|
||||||
Arrays.asList("2609675a356f2dfc86f8a1d911210978"));
|
Arrays.asList("9566c7abef5ee5829a516d90445b347f"));
|
||||||
executeTest("test MultiSample Pilot2 indels with complicated records", spec3);
|
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testWithIndelAllelesPassedIn4() {
|
public void testGGAwithNoEvidenceInReads() {
|
||||||
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
|
|
||||||
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
|
|
||||||
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
|
|
||||||
Arrays.asList("4fdd8da77167881b71b3547da5c13f94"));
|
|
||||||
executeTest("test MultiSample Phase1 indels with complicated records", spec4);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testWithIndelAllelesPassedIn5() {
|
|
||||||
final String vcf = "small.indel.test.vcf";
|
final String vcf = "small.indel.test.vcf";
|
||||||
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + vcf + " -I " + validationDataLocation +
|
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + vcf + " -I " + validationDataLocation +
|
||||||
"NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1,
|
"NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1,
|
||||||
Arrays.asList("7d069596597aee5e0d562964036141eb"));
|
Arrays.asList("7d069596597aee5e0d562964036141eb"));
|
||||||
executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec4);
|
executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// testing SnpEff
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testSnpEffAnnotationRequestedWithoutRodBinding() {
|
public void testSnpEffAnnotationRequestedWithoutRodBinding() {
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
|
|
|
||||||
|
|
@ -110,7 +110,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
|
||||||
" -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" +
|
" -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" +
|
||||||
" -genotypeMergeOptions UNIQUIFY -L 1"),
|
" -genotypeMergeOptions UNIQUIFY -L 1"),
|
||||||
1,
|
1,
|
||||||
Arrays.asList("ab72f4bfb16d3894942149173a087647"));
|
Arrays.asList("ee43a558fd3faeaa447acab89f0001d5"));
|
||||||
executeTest("threeWayWithRefs", spec);
|
executeTest("threeWayWithRefs", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,75 @@
|
||||||
|
package org.broadinstitute.sting.utils;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
import org.testng.Assert;
|
||||||
|
import org.testng.annotations.BeforeClass;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import java.util.Random;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author Mauricio Carneiro
|
||||||
|
* @since 3/5/12
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class BitSetUtilsUnitTest {
|
||||||
|
private static int RANDOM_NUMBERS_TO_TRY = 87380;
|
||||||
|
private static Random random;
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public void init() {
|
||||||
|
random = GenomeAnalysisEngine.getRandomGenerator();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(enabled = true)
|
||||||
|
public void testLongBitSet() {
|
||||||
|
long[] numbers = {0L, 1L, 428L, 65536L, 239847L, 4611686018427387903L, Long.MAX_VALUE, Long.MIN_VALUE, -1L, -2L, -7L, -128L, -65536L, -100000L};
|
||||||
|
for (long n : numbers)
|
||||||
|
Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(n)), n);
|
||||||
|
|
||||||
|
for (int i = 0; i < RANDOM_NUMBERS_TO_TRY; i++) {
|
||||||
|
long n = random.nextLong();
|
||||||
|
Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(n)), n); // Because class Random uses a seed with only 48 bits, this algorithm will not return all possible long values.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(enabled = true)
|
||||||
|
public void testShortBitSet() {
|
||||||
|
short[] numbers = {0, 1, 428, 25934, 23847, 16168, Short.MAX_VALUE, Short.MIN_VALUE, -1, -2, -7, -128, -12312, -31432};
|
||||||
|
for (long n : numbers)
|
||||||
|
Assert.assertEquals(BitSetUtils.shortFrom(BitSetUtils.bitSetFrom(n)), n);
|
||||||
|
|
||||||
|
for (int i = 0; i < RANDOM_NUMBERS_TO_TRY; i++) {
|
||||||
|
short n = (short) random.nextInt();
|
||||||
|
Assert.assertEquals(BitSetUtils.shortFrom(BitSetUtils.bitSetFrom(n)), n);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(enabled = true)
|
||||||
|
public void testDNAAndBitSetConversion() {
|
||||||
|
String[] dna = {"AGGTGTTGT", "CCCCCCCCCCCCCC", "GGGGGGGGGGGGGG", "TTTTTTTTTTTTTT", "GTAGACCGATCTCAGCTAGT", "AACGTCAATGCAGTCAAGTCAGACGTGGGTT", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"};
|
||||||
|
|
||||||
|
// Test all contexts of size 1-8.
|
||||||
|
for (long n = 0; n < RANDOM_NUMBERS_TO_TRY; n++)
|
||||||
|
Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(BitSetUtils.dnaFrom(BitSetUtils.bitSetFrom(n)))), n);
|
||||||
|
|
||||||
|
// Test the special cases listed in the dna array
|
||||||
|
for (String d : dna)
|
||||||
|
Assert.assertEquals(BitSetUtils.dnaFrom(BitSetUtils.bitSetFrom(d)), d);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(enabled = true)
|
||||||
|
public void testNumberOfBitsToRepresent() {
|
||||||
|
Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(0), 0); // Make sure 0 elements need 0 bits to be represented
|
||||||
|
Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(1), 1); // Make sure 1 element needs 1 bit to be represented
|
||||||
|
Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(3), 2); // Make sure 3 elements need 2 bit to be represented
|
||||||
|
|
||||||
|
for (int i = 1; i < 63; i++) { // Can't test i == 63 because n1 is a negative number
|
||||||
|
long n1 = 1L << i;
|
||||||
|
long n2 = Math.abs(random.nextLong()) % n1;
|
||||||
|
long n3 = n1 | n2;
|
||||||
|
Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(n3), (n3 == n1) ? i : i + 1);
|
||||||
|
Assert.assertEquals(BitSetUtils.numberOfBitsToRepresent(n1), i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -25,7 +25,6 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.utils;
|
package org.broadinstitute.sting.utils;
|
||||||
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.BaseTest;
|
import org.broadinstitute.sting.BaseTest;
|
||||||
import org.testng.Assert;
|
import org.testng.Assert;
|
||||||
import org.testng.annotations.BeforeClass;
|
import org.testng.annotations.BeforeClass;
|
||||||
|
|
@ -131,7 +130,8 @@ public class MathUtilsUnitTest extends BaseTest {
|
||||||
int[] numbers = {1, 2, 4, 5, 3, 128, 25678, -24};
|
int[] numbers = {1, 2, 4, 5, 3, 128, 25678, -24};
|
||||||
MathUtils.RunningAverage r = new MathUtils.RunningAverage();
|
MathUtils.RunningAverage r = new MathUtils.RunningAverage();
|
||||||
|
|
||||||
for (int i = 0; i < numbers.length; i++) r.add((double) numbers[i]);
|
for (int i = 0; i < numbers.length; i++)
|
||||||
|
r.add((double) numbers[i]);
|
||||||
|
|
||||||
Assert.assertEquals((long) numbers.length, r.observationCount());
|
Assert.assertEquals((long) numbers.length, r.observationCount());
|
||||||
Assert.assertTrue(r.mean() - 3224.625 < 2e-10);
|
Assert.assertTrue(r.mean() - 3224.625 < 2e-10);
|
||||||
|
|
@ -223,37 +223,14 @@ public class MathUtilsUnitTest extends BaseTest {
|
||||||
return set.isEmpty();
|
return set.isEmpty();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(enabled = true)
|
|
||||||
public void testIntAndBitSetConversion() {
|
|
||||||
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(428)), 428);
|
|
||||||
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(239847)), 239847);
|
|
||||||
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(12726)), 12726);
|
|
||||||
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(0)), 0);
|
|
||||||
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(1)), 1);
|
|
||||||
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(65536)), 65536);
|
|
||||||
Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(Long.MAX_VALUE)), Long.MAX_VALUE);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test(enabled = true)
|
|
||||||
public void testDNAAndBitSetConversion() {
|
|
||||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("ACGT")), "ACGT");
|
|
||||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AGGTGTTGT")), "AGGTGTTGT");
|
|
||||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("A")), "A");
|
|
||||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("C")), "C");
|
|
||||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("G")), "G");
|
|
||||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("T")), "T");
|
|
||||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("CC")), "CC");
|
|
||||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AA")), "AA");
|
|
||||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AAAA")), "AAAA");
|
|
||||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("CCCCCCCCCCCCCC")), "CCCCCCCCCCCCCC");
|
|
||||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("GGGGGGGGGGGGGG")), "GGGGGGGGGGGGGG");
|
|
||||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("TTTTTTTTTTTTTT")), "TTTTTTTTTTTTTT");
|
|
||||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("GTAGACCGATCTCAGCTAGT")), "GTAGACCGATCTCAGCTAGT");
|
|
||||||
Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AACGTCAATGCAGTCAAGTCAGACGTGGGTT")), "AACGTCAATGCAGTCAAGTCAGACGTGGGTT"); // testing max precision (length == 31)
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testApproximateLog10SumLog10() {
|
public void testApproximateLog10SumLog10() {
|
||||||
|
|
||||||
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0}), 0.0, 1e-3);
|
||||||
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-5.15}), -5.15, 1e-3);
|
||||||
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {130.0}), 130.0, 1e-3);
|
||||||
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.145}), -0.145, 1e-3);
|
||||||
|
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 1e-3);
|
||||||
|
|
@ -266,55 +243,57 @@ public class MathUtilsUnitTest extends BaseTest {
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), 1e-3);
|
||||||
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, Double.NEGATIVE_INFINITY), -0.12345, 1e-3);
|
||||||
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, Double.NEGATIVE_INFINITY), -15.7654, 1e-3);
|
||||||
|
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-0.12345, -0.23456}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), 1e-3);
|
||||||
|
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, -1.0, -2.5}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0, -2.5}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), 1e-3);
|
||||||
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), 1e-3);
|
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), 1e-3);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testNormalizeFromLog10() {
|
public void testNormalizeFromLog10() {
|
||||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{0.0, 0.0, -1.0, -1.1, -7.8}, false, true), new double[]{0.0, 0.0, -1.0, -1.1, -7.8}));
|
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {0.0, 0.0, -1.0, -1.1, -7.8}, false, true), new double[] {0.0, 0.0, -1.0, -1.1, -7.8}));
|
||||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -1.0, -1.0, -1.1, -7.8}, false, true), new double[]{0.0, 0.0, 0.0, -0.1, -6.8}));
|
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -1.0, -1.0, -1.1, -7.8}, false, true), new double[] {0.0, 0.0, 0.0, -0.1, -6.8}));
|
||||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-10.0, -7.8, -10.5, -1.1, -10.0}, false, true), new double[]{-8.9, -6.7, -9.4, 0.0, -8.9}));
|
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-10.0, -7.8, -10.5, -1.1, -10.0}, false, true), new double[] {-8.9, -6.7, -9.4, 0.0, -8.9}));
|
||||||
|
|
||||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -1.0, -1.0, -1.0}), new double[]{0.25, 0.25, 0.25, 0.25}));
|
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -1.0, -1.0, -1.0}), new double[] {0.25, 0.25, 0.25, 0.25}));
|
||||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -3.0, -1.0, -1.0}), new double[]{0.1 * 1.0 / 0.301, 0.001 * 1.0 / 0.301, 0.1 * 1.0 / 0.301, 0.1 * 1.0 / 0.301}));
|
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -3.0, -1.0, -1.0}), new double[] {0.1 * 1.0 / 0.301, 0.001 * 1.0 / 0.301, 0.1 * 1.0 / 0.301, 0.1 * 1.0 / 0.301}));
|
||||||
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -3.0, -1.0, -2.0}), new double[]{0.1 * 1.0 / 0.211, 0.001 * 1.0 / 0.211, 0.1 * 1.0 / 0.211, 0.01 * 1.0 / 0.211}));
|
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -3.0, -1.0, -2.0}), new double[] {0.1 * 1.0 / 0.211, 0.001 * 1.0 / 0.211, 0.1 * 1.0 / 0.211, 0.01 * 1.0 / 0.211}));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Private function used by testNormalizeFromLog10()
|
* Private function used by testNormalizeFromLog10()
|
||||||
*/
|
*/
|
||||||
private boolean compareDoubleArrays(double[] b1, double[] b2) {
|
private boolean compareDoubleArrays(double[] b1, double[] b2) {
|
||||||
if( b1.length != b2.length ) {
|
if (b1.length != b2.length) {
|
||||||
return false; // sanity check
|
return false; // sanity check
|
||||||
}
|
}
|
||||||
|
|
||||||
for( int i=0; i < b1.length; i++ ){
|
for (int i = 0; i < b1.length; i++) {
|
||||||
if ( MathUtils.compareDoubles(b1[i], b2[i]) != 0 )
|
if (MathUtils.compareDoubles(b1[i], b2[i]) != 0)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,66 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.utils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: rpoplin
|
||||||
|
* Date: 3/21/12
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.testng.Assert;
|
||||||
|
import org.testng.annotations.BeforeClass;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Basic unit test for QualityUtils class
|
||||||
|
*/
|
||||||
|
public class QualityUtilsUnitTest extends BaseTest {
|
||||||
|
@BeforeClass
|
||||||
|
public void init() {
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testQualCaches() {
|
||||||
|
Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 20), 0.01, 1e-6);
|
||||||
|
Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 20), -2.0, 1e-6);
|
||||||
|
Assert.assertEquals(QualityUtils.qualToProb((byte) 20), 0.99, 1e-6);
|
||||||
|
Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 20), -0.0043648054, 1e-6);
|
||||||
|
|
||||||
|
Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 30), 0.001, 1e-6);
|
||||||
|
Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 30), -3.0, 1e-6);
|
||||||
|
Assert.assertEquals(QualityUtils.qualToProb((byte) 30), 0.999, 1e-6);
|
||||||
|
Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 30), -0.000434511774, 1e-6);
|
||||||
|
|
||||||
|
Assert.assertEquals(QualityUtils.qualToErrorProb((byte) 40), 0.0001, 1e-6);
|
||||||
|
Assert.assertEquals(QualityUtils.qualToErrorProbLog10((byte) 40), -4.0, 1e-6);
|
||||||
|
Assert.assertEquals(QualityUtils.qualToProb((byte) 40), 0.9999, 1e-6);
|
||||||
|
Assert.assertEquals(QualityUtils.qualToProbLog10((byte) 40), -4.34316198e-5, 1e-6);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,148 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2012, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// our package
|
||||||
|
package org.broadinstitute.sting.utils.activeregion;
|
||||||
|
|
||||||
|
|
||||||
|
// the imports for unit testing.
|
||||||
|
|
||||||
|
|
||||||
|
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||||
|
import org.broadinstitute.sting.utils.recalibration.QualQuantizer;
|
||||||
|
import org.testng.Assert;
|
||||||
|
import org.testng.annotations.BeforeClass;
|
||||||
|
import org.testng.annotations.BeforeSuite;
|
||||||
|
import org.testng.annotations.DataProvider;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.lang.reflect.Array;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
|
public class ActivityProfileUnitTest extends BaseTest {
|
||||||
|
private GenomeLocParser genomeLocParser;
|
||||||
|
private GenomeLoc startLoc;
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public void init() throws FileNotFoundException {
|
||||||
|
// sequence
|
||||||
|
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
|
||||||
|
genomeLocParser = new GenomeLocParser(seq);
|
||||||
|
startLoc = genomeLocParser.createGenomeLoc("chr1", 1, 1, 100);
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Basic tests Provider
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
private class BasicActivityProfileTestProvider extends TestDataProvider {
|
||||||
|
List<Double> probs;
|
||||||
|
List<ActiveRegion> expectedRegions;
|
||||||
|
int extension = 0;
|
||||||
|
GenomeLoc regionStart = startLoc;
|
||||||
|
|
||||||
|
public BasicActivityProfileTestProvider(final List<Double> probs, final List<ActiveRegion> expectedRegions) {
|
||||||
|
super(BasicActivityProfileTestProvider.class);
|
||||||
|
this.probs = probs;
|
||||||
|
this.expectedRegions = expectedRegions;
|
||||||
|
setName(getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
public BasicActivityProfileTestProvider(final List<Double> probs, boolean startActive, int ... startsAndStops) {
|
||||||
|
super(BasicActivityProfileTestProvider.class);
|
||||||
|
this.probs = probs;
|
||||||
|
this.expectedRegions = toRegions(startActive, startsAndStops);
|
||||||
|
setName(getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getName() {
|
||||||
|
return String.format("probs=%s expectedRegions=%s", Utils.join(",", probs), Utils.join(",", expectedRegions));
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<ActiveRegion> toRegions(boolean isActive, int[] startsAndStops) {
|
||||||
|
List<ActiveRegion> l = new ArrayList<ActiveRegion>();
|
||||||
|
for ( int i = 0; i < startsAndStops.length - 1; i++) {
|
||||||
|
int start = regionStart.getStart() + startsAndStops[i];
|
||||||
|
int end = regionStart.getStart() + startsAndStops[i+1] - 1;
|
||||||
|
GenomeLoc activeLoc = genomeLocParser.createGenomeLoc(regionStart.getContig(), start, end);
|
||||||
|
ActiveRegion r = new ActiveRegion(activeLoc, isActive, genomeLocParser, extension);
|
||||||
|
l.add(r);
|
||||||
|
isActive = ! isActive;
|
||||||
|
}
|
||||||
|
return l;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@DataProvider(name = "BasicActivityProfileTestProvider")
|
||||||
|
public Object[][] makeQualIntervalTestProvider() {
|
||||||
|
new BasicActivityProfileTestProvider(Arrays.asList(1.0), true, 0, 1);
|
||||||
|
new BasicActivityProfileTestProvider(Arrays.asList(1.0, 0.0), true, 0, 1, 2);
|
||||||
|
new BasicActivityProfileTestProvider(Arrays.asList(0.0, 1.0), false, 0, 1, 2);
|
||||||
|
new BasicActivityProfileTestProvider(Arrays.asList(1.0, 0.0, 1.0), true, 0, 1, 2, 3);
|
||||||
|
new BasicActivityProfileTestProvider(Arrays.asList(1.0, 1.0, 1.0), true, 0, 3);
|
||||||
|
|
||||||
|
return BasicActivityProfileTestProvider.getTests(BasicActivityProfileTestProvider.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(dataProvider = "BasicActivityProfileTestProvider")
|
||||||
|
public void testBasicActivityProfile(BasicActivityProfileTestProvider cfg) {
|
||||||
|
ActivityProfile profile = new ActivityProfile(genomeLocParser, false);
|
||||||
|
|
||||||
|
Assert.assertEquals(profile.parser, genomeLocParser);
|
||||||
|
|
||||||
|
for ( int i = 0; i < cfg.probs.size(); i++ ) {
|
||||||
|
double p = cfg.probs.get(i);
|
||||||
|
GenomeLoc loc = genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart() + i, cfg.regionStart.getStart() + i);
|
||||||
|
profile.add(loc, p);
|
||||||
|
}
|
||||||
|
Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() ));
|
||||||
|
|
||||||
|
Assert.assertEquals(profile.size(), cfg.probs.size());
|
||||||
|
Assert.assertEquals(profile.isActiveList, cfg.probs);
|
||||||
|
|
||||||
|
assertRegionsAreEqual(profile.createActiveRegions(0), cfg.expectedRegions);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertRegionsAreEqual(List<ActiveRegion> actual, List<ActiveRegion> expected) {
|
||||||
|
Assert.assertEquals(actual.size(), expected.size());
|
||||||
|
for ( int i = 0; i < actual.size(); i++ ) {
|
||||||
|
Assert.assertTrue(actual.get(i).equalExceptReads(expected.get(i)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// todo -- test extensions
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,21 @@
|
||||||
|
package org.broadinstitute.sting.utils.recalibration;
|
||||||
|
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Unit tests for on-the-fly recalibration.
|
||||||
|
*
|
||||||
|
* @author Mauricio Carneiro
|
||||||
|
* @since 3/16/12
|
||||||
|
*/
|
||||||
|
public class BaseRecalibrationUnitTest {
|
||||||
|
|
||||||
|
@Test(enabled=true)
|
||||||
|
public void testReadingCSV() {
|
||||||
|
File csv = new File("public/testdata/exampleCSV.csv");
|
||||||
|
BaseRecalibration baseRecalibration = new BaseRecalibration(csv);
|
||||||
|
System.out.println("Success");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -42,8 +42,8 @@ public class GATKSAMRecordUnitTest extends BaseTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testReducedReadPileupElement() {
|
public void testReducedReadPileupElement() {
|
||||||
PileupElement readp = new PileupElement(read, 0, false, false, false, false);
|
PileupElement readp = new PileupElement(read, 0, false, false, false, false, false, false);
|
||||||
PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false, false);
|
PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false, false, false, false);
|
||||||
|
|
||||||
Assert.assertFalse(readp.getRead().isReducedRead());
|
Assert.assertFalse(readp.getRead().isReducedRead());
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -117,7 +117,7 @@ public class GenotypeLikelihoodsUnitTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testgetQualFromLikelihoods(){
|
public void testgetQualFromLikelihoods() {
|
||||||
double[] likelihoods = new double[]{-1, 0, -2};
|
double[] likelihoods = new double[]{-1, 0, -2};
|
||||||
// qual values we expect for each possible "best" genotype
|
// qual values we expect for each possible "best" genotype
|
||||||
double[] expectedQuals = new double[]{-0.04100161, -1, -0.003930294};
|
double[] expectedQuals = new double[]{-0.04100161, -1, -0.003930294};
|
||||||
|
|
@ -134,4 +134,33 @@ public class GenotypeLikelihoodsUnitTest {
|
||||||
Assert.assertEquals(v1[i], v2[i], 1e-6);
|
Assert.assertEquals(v1[i], v2[i], 1e-6);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCalculatePLindex(){
|
||||||
|
int counter = 0;
|
||||||
|
for ( int i = 0; i <= 3; i++ ) {
|
||||||
|
for ( int j = i; j <= 3; j++ ) {
|
||||||
|
Assert.assertEquals(GenotypeLikelihoods.calculatePLindex(i, j), GenotypeLikelihoods.PLindexConversion[counter++], "PL index of alleles " + i + "," + j + " was not calculated correctly");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetAllelePair(){
|
||||||
|
allelePairTest(0, 0, 0);
|
||||||
|
allelePairTest(1, 0, 1);
|
||||||
|
allelePairTest(2, 1, 1);
|
||||||
|
allelePairTest(3, 0, 2);
|
||||||
|
allelePairTest(4, 1, 2);
|
||||||
|
allelePairTest(5, 2, 2);
|
||||||
|
allelePairTest(6, 0, 3);
|
||||||
|
allelePairTest(7, 1, 3);
|
||||||
|
allelePairTest(8, 2, 3);
|
||||||
|
allelePairTest(9, 3, 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void allelePairTest(int PLindex, int allele1, int allele2) {
|
||||||
|
Assert.assertEquals(GenotypeLikelihoods.getAllelePair(PLindex).alleleIndex1, allele1, "allele index " + allele1 + " from PL index " + PLindex + " was not calculated correctly");
|
||||||
|
Assert.assertEquals(GenotypeLikelihoods.getAllelePair(PLindex).alleleIndex2, allele2, "allele index " + allele2 + " from PL index " + PLindex + " was not calculated correctly");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -236,6 +236,16 @@ public class VariantContextUnitTest extends BaseTest {
|
||||||
Assert.assertEquals(vc.getSampleNames().size(), 0);
|
Assert.assertEquals(vc.getSampleNames().size(), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMatchingAlleles() {
|
||||||
|
List<Allele> alleles = Arrays.asList(ATCref, del);
|
||||||
|
VariantContext vc = new VariantContextBuilder("test", delLoc, delLocStart, delLocStop, alleles).referenceBaseForIndel((byte)'A').make();
|
||||||
|
VariantContext vc2 = new VariantContextBuilder("test2", delLoc, delLocStart+12, delLocStop+12, alleles).referenceBaseForIndel((byte)'A').make();
|
||||||
|
|
||||||
|
Assert.assertTrue(vc.hasSameAllelesAs(vc2));
|
||||||
|
Assert.assertTrue(vc.hasSameAlternateAllelesAs(vc2));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testCreatingInsertionVariantContext() {
|
public void testCreatingInsertionVariantContext() {
|
||||||
List<Allele> alleles = Arrays.asList(delRef, ATC);
|
List<Allele> alleles = Arrays.asList(delRef, ATC);
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue