Merge branch 'master' of ssh://gsa2.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Ryan Poplin 2012-06-18 08:51:48 -04:00
commit 5ec737f008
181 changed files with 108461 additions and 4276 deletions

View File

@ -2,6 +2,7 @@ library(gsalib)
library(ggplot2)
library(gplots)
library(tools)
library(reshape)
#
# Standard command line switch. Can be loaded interactively for development

View File

@ -59,6 +59,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import java.io.File;
import java.io.FileInputStream;
@ -221,6 +222,10 @@ public class GenomeAnalysisEngine {
if (this.getArguments().nonDeterministicRandomSeed)
resetRandomGenerator(System.currentTimeMillis());
// TODO -- REMOVE ME WHEN WE STOP BCF testing
if ( this.getArguments().USE_SLOW_GENOTYPES )
GenotypeBuilder.MAKE_FAST_BY_DEFAULT = false;
// if the use specified an input BQSR recalibration table then enable on the fly recalibration
if (this.getArguments().BQSR_RECAL_FILE != null)
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE, this.getArguments().quantizationLevels);

View File

@ -51,11 +51,6 @@ public class ReadProperties {
return includeReadsWithDeletionAtLoci;
}
@Deprecated
public boolean generateExtendedEvents() {
return false;
}
/**
* Gets a list of the files acting as sources of reads.
* @return A list of files storing reads data.

View File

@ -336,6 +336,11 @@ public class GATKArgumentCollection {
public boolean generateShadowBCF = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
@Argument(fullName="useSlowGenotypes",shortName = "useSlowGenotypes",doc="",required=false)
@Hidden
public boolean USE_SLOW_GENOTYPES = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
/**
* The file pointed to by this argument must be a VCF file. The GATK will read in just the header of this file
* and then use the INFO, FORMAT, and FILTER field values from this file to repair the header file of any other

View File

@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.contexts;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -89,36 +88,9 @@ public class AlignmentContext implements HasGenomeLocation {
* @return
*/
public ReadBackedPileup getBasePileup() {
if(!hasBasePileup())
throw new ReviewedStingException("No base pileup is available. Please check for a base pileup with hasBasePileup() before attempting to retrieve a pileup.");
return basePileup;
}
/** Returns extended event (indel) pileup over the current genomic location. May return null if this context keeps
* only base pileup.
* @return
*/
@Deprecated
public ReadBackedExtendedEventPileup getExtendedEventPileup() {
if(!hasExtendedEventPileup())
throw new ReviewedStingException("No extended event pileup is present.");
return (ReadBackedExtendedEventPileup)basePileup;
}
/**
* Returns true if this alignment context keeps base pileup over the current genomic location.
* TODO: Syntax of AlignmentContext uses hasBasePileup() / hasExtendedEventPileup() as an enumeration mechanism. Change this to a more sensible interface.
* @return
*/
public boolean hasBasePileup() { return !(basePileup instanceof ReadBackedExtendedEventPileup); }
/** Returns true if this alignment context keeps extended event (indel) pileup over the current genomic location.
*
* @return
*/
@Deprecated
public boolean hasExtendedEventPileup() { return basePileup instanceof ReadBackedExtendedEventPileup; }
/**
* Returns true if any reads have been filtered out of the pileup due to excess DoC.
* @return True if reads have been filtered out. False otherwise.

View File

@ -116,19 +116,15 @@ public class AlignmentContextUtils {
*
**/
public static Map<SAMReadGroupRecord, AlignmentContext> splitContextByReadGroup(AlignmentContext context, Collection<SAMReadGroupRecord> readGroups) {
if ( ! context.hasBasePileup() ) {
return Collections.emptyMap();
} else {
HashMap<SAMReadGroupRecord, AlignmentContext> contexts = new HashMap<SAMReadGroupRecord, AlignmentContext>();
HashMap<SAMReadGroupRecord, AlignmentContext> contexts = new HashMap<SAMReadGroupRecord, AlignmentContext>();
for (SAMReadGroupRecord rg : readGroups) {
ReadBackedPileup rgPileup = context.getBasePileup().getPileupForReadGroup(rg.getReadGroupId());
if ( rgPileup != null ) // there we some reads for RG
contexts.put(rg, new AlignmentContext(context.getLocation(), rgPileup));
}
return contexts;
for (SAMReadGroupRecord rg : readGroups) {
ReadBackedPileup rgPileup = context.getBasePileup().getPileupForReadGroup(rg.getReadGroupId());
if ( rgPileup != null ) // there we some reads for RG
contexts.put(rg, new AlignmentContext(context.getLocation(), rgPileup));
}
return contexts;
}
public static Map<String, AlignmentContext> splitContextBySampleName(ReadBackedPileup pileup) {
@ -139,32 +135,16 @@ public class AlignmentContextUtils {
public static AlignmentContext joinContexts(Collection<AlignmentContext> contexts) {
// validation
GenomeLoc loc = contexts.iterator().next().getLocation();
boolean isExtended = contexts.iterator().next().basePileup instanceof ReadBackedExtendedEventPileup;
for(AlignmentContext context: contexts) {
if(!loc.equals(context.getLocation()))
throw new ReviewedStingException("Illegal attempt to join contexts from different genomic locations");
if(isExtended != (context.basePileup instanceof ReadBackedExtendedEventPileup))
throw new ReviewedStingException("Illegal attempt to join simple and extended contexts");
}
AlignmentContext jointContext;
if(isExtended) {
List<ExtendedEventPileupElement> pe = new ArrayList<ExtendedEventPileupElement>();
for(AlignmentContext context: contexts) {
for(PileupElement pileupElement: context.basePileup)
pe.add((ExtendedEventPileupElement)pileupElement);
}
jointContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc,pe));
List<PileupElement> pe = new ArrayList<PileupElement>();
for(AlignmentContext context: contexts) {
for(PileupElement pileupElement: context.basePileup)
pe.add(pileupElement);
}
else {
List<PileupElement> pe = new ArrayList<PileupElement>();
for(AlignmentContext context: contexts) {
for(PileupElement pileupElement: context.basePileup)
pe.add(pileupElement);
}
jointContext = new AlignmentContext(loc, new ReadBackedPileupImpl(loc,pe));
}
return jointContext;
return new AlignmentContext(loc, new ReadBackedPileupImpl(loc,pe));
}
}

View File

@ -0,0 +1,76 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import java.util.Collection;
import java.util.List;
/**
 * The basic downsampler API, with no reads-specific operations
 *
 * A downsampler is a stateful consumer: items are pushed in via submit(), and the subset of
 * items that survives the downsampling process is pulled back out via consumeDownsampledItems().
 *
 * @author David Roazen
 */
public interface Downsampler<T> {
    /**
     * Submit one item to the downsampler for consideration. Some downsamplers will be able to determine
     * immediately whether the item survives the downsampling process, while others will need to see
     * more items before making that determination.
     *
     * @param item the individual item to submit
     */
    public void submit( T item );

    /**
     * Submit a collection of items to the downsampler for consideration.
     *
     * @param items the items to submit
     */
    public void submit( Collection<T> items );

    /**
     * Are there items that have survived the downsampling process waiting to be retrieved?
     *
     * @return true if surviving items are ready for retrieval, otherwise false
     */
    public boolean hasDownsampledItems();

    /**
     * Return (and remove) all items that have survived downsampling and are waiting to be retrieved.
     *
     * @return the surviving items; this downsampler no longer holds them afterwards
     */
    public List<T> consumeDownsampledItems();

    /**
     * Are there items stored in this downsampler that it doesn't yet know whether they will
     * ultimately survive the downsampling process?
     *
     * @return true if items with an undecided fate are being held, otherwise false
     */
    public boolean hasPendingItems();

    /**
     * Used to tell the downsampler that no more items will be submitted to it, and that it should
     * finalize any pending items.
     */
    public void signalEndOfInput();

    /**
     * Reset the downsampler to a clean state, devoid of any pending/downsampled items or tracked state
     * information.
     */
    public void clear();
}

View File

@ -0,0 +1,98 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import java.util.Collection;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
 * StingSAMIterator wrapper around our generic reads downsampler interface.
 *
 * Reads are pulled lazily from the wrapped iterator, pushed through the downsampler,
 * and only the survivors are handed back to callers of next().
 *
 * @author David Roazen
 */
public class DownsamplingReadsIterator implements StingSAMIterator {

    // the underlying source of (not-yet-downsampled) reads
    private final StingSAMIterator sourceReads;

    // the downsampler every read is routed through
    private final ReadsDownsampler<SAMRecord> downsampler;

    // reads that survived downsampling and have not yet been returned
    private Collection<SAMRecord> survivingReads;
    private Iterator<SAMRecord> survivingReadsIterator;

    /**
     * @param iter        source of reads to downsample
     * @param downsampler the downsampling strategy to apply
     */
    public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler<SAMRecord> downsampler ) {
        sourceReads = iter;
        this.downsampler = downsampler;

        // prime the cache so hasNext()/next() can rely on the iterator being non-null
        refillCache();
    }

    public boolean hasNext() {
        if ( survivingReadsIterator.hasNext() ) {
            return true;
        }

        // cache exhausted: more reads exist only if the source has some AND at least one survives
        return sourceReads.hasNext() && refillCache();
    }

    public SAMRecord next() {
        if ( ! survivingReadsIterator.hasNext() && ! refillCache() ) {
            throw new NoSuchElementException("next() called when there are no more items");
        }

        return survivingReadsIterator.next();
    }

    /**
     * Feed source reads to the downsampler until it has survivors (or the source is dry),
     * then take ownership of those survivors.
     *
     * @return true if the refreshed cache contains at least one read
     */
    private boolean refillCache() {
        while ( sourceReads.hasNext() && ! downsampler.hasDownsampledItems() ) {
            downsampler.submit(sourceReads.next());
        }

        // no more input: let the downsampler flush anything it was still holding
        if ( ! sourceReads.hasNext() ) {
            downsampler.signalEndOfInput();
        }

        survivingReads = downsampler.consumeDownsampledItems();
        survivingReadsIterator = survivingReads.iterator();

        return survivingReadsIterator.hasNext();
    }

    public void remove() {
        throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
    }

    public void close() {
        sourceReads.close();
    }

    public Iterator<SAMRecord> iterator() {
        return this;
    }
}

View File

@ -0,0 +1,94 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
 * Fractional Downsampler: selects a specified fraction of the reads for inclusion.
 *
 * Each read is kept independently with probability equal to the requested fraction
 * (quantized to a resolution of 1/RANDOM_POOL_SIZE), using the engine's shared
 * random number generator.
 *
 * @author David Roazen
 */
public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {

    // reads selected so far, waiting to be consumed
    private ArrayList<T> selectedReads;

    // a read survives when a uniform draw from [0, RANDOM_POOL_SIZE) falls below this cutoff
    private final int cutoffForInclusion;

    // resolution of the inclusion probability: both the cutoff and the random draw use this range
    private static final int RANDOM_POOL_SIZE = 10000;

    /**
     * @param fraction fraction of reads to keep, between 0.0 and 1.0 inclusive
     * @throws ReviewedStingException if fraction is outside [0.0, 1.0]
     */
    public FractionalDownsampler( double fraction ) {
        if ( fraction < 0.0 || fraction > 1.0 ) {
            throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive");
        }

        cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE);
        clear();
    }

    public void submit( T newRead ) {
        // draw from RANDOM_POOL_SIZE (not a duplicated literal) so the draw range can
        // never drift out of sync with the cutoff computed in the constructor
        if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(RANDOM_POOL_SIZE) < cutoffForInclusion ) {
            selectedReads.add(newRead);
        }
    }

    public void submit( Collection<T> newReads ) {
        for ( T read : newReads ) {
            submit(read);
        }
    }

    public boolean hasDownsampledItems() {
        return selectedReads.size() > 0;
    }

    public List<T> consumeDownsampledItems() {
        // hand over the internal list and start a fresh one, rather than copying
        List<T> downsampledItems = selectedReads;
        clear();
        return downsampledItems;
    }

    public boolean hasPendingItems() {
        // each read's fate is decided immediately upon submission, so nothing is ever pending
        return false;
    }

    public void signalEndOfInput() {
        // NO-OP: there are never pending items to finalize
    }

    public void clear() {
        selectedReads = new ArrayList<T>();
    }

    public boolean requiresCoordinateSortOrder() {
        return false;
    }
}

View File

@ -0,0 +1,259 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
/**
 * Positional Downsampler: When eliminating reads, try to do so evenly based on the alignment start positions
 *
 * Reads must be submitted in coordinate order (see requiresCoordinateSortOrder()). Reads sharing
 * an alignment start are first thinned by a ReservoirDownsampler; then groups of reads from
 * overlapping start positions are downsampled together towards the coverage target.
 *
 * @author David Roazen
 */
public class PositionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {

    // desired coverage to downsample towards at each position
    private int targetCoverage;

    // thins the reads sharing the current alignment start down to at most targetCoverage
    private ReservoirDownsampler<T> reservoir;

    // contig index of the most recently submitted read
    private int currentContigIndex;

    // alignment start of the most recently submitted read
    private int currentAlignmentStart;

    // one grouping per already-seen alignment start whose reads may still overlap later positions
    private LinkedList<PositionalReadGrouping> pendingReads;

    // reads whose fate is decided, ready to be consumed (in coordinate order)
    private ArrayList<T> finalizedReads;

    /**
     * @param targetCoverage desired coverage to downsample towards
     */
    public PositionalDownsampler ( int targetCoverage ) {
        this.targetCoverage = targetCoverage;
        clear();
    }

    /**
     * Submit one read for consideration; reads must arrive in coordinate order.
     *
     * @param newRead the read to submit
     */
    public void submit ( T newRead ) {
        // a new position: flush the reservoir for the previous position into the pending
        // groupings and re-downsample across all overlapping groupings
        if ( readIsPastCurrentPosition(newRead) ) {
            updateAndDownsamplePendingReads();
        }

        reservoir.submit(newRead);
        updateCurrentPosition(newRead);
    }

    public void submit ( Collection<T> newReads ) {
        for ( T read : newReads ) {
            submit(read);
        }
    }

    public boolean hasDownsampledItems() {
        return finalizedReads.size() > 0;
    }

    public List<T> consumeDownsampledItems() {
        // hand over the internal list rather than copying
        List<T> toReturn = finalizedReads;
        finalizedReads = new ArrayList<T>();
        return toReturn;
    }

    public boolean hasPendingItems() {
        return pendingReads.size() > 0;
    }

    public void signalEndOfInput() {
        // flush the reservoir for the final position, then finalize everything still pending
        updateAndDownsamplePendingReads();

        for ( PositionalReadGrouping group : pendingReads ) {
            group.finalizeAllActiveReads();
            finalizedReads.addAll(group.getFinalizedReads());
        }

        pendingReads.clear();
    }

    public void clear() {
        reservoir = new ReservoirDownsampler<T>(targetCoverage);
        pendingReads = new LinkedList<PositionalReadGrouping>();
        finalizedReads = new ArrayList<T>();
    }

    public boolean requiresCoordinateSortOrder() {
        return true;
    }

    // record the position of the most recently submitted read
    private void updateCurrentPosition ( T read ) {
        currentContigIndex = read.getReferenceIndex();
        currentAlignmentStart = read.getAlignmentStart();
    }

    // true if the read starts strictly after the last recorded position (new contig or later start)
    private boolean readIsPastCurrentPosition ( T read ) {
        return read.getReferenceIndex() != currentContigIndex || read.getAlignmentStart() > currentAlignmentStart;
    }

    // Move out-of-scope reads to the finalized list, turn the reservoir's reads for the position
    // just left into a new pending grouping, and re-downsample across overlapping groupings.
    private void updateAndDownsamplePendingReads() {
        finalizeOutOfScopeReads();

        List<T> oldLocusReads = reservoir.consumeDownsampledItems();
        pendingReads.add(new PositionalReadGrouping(oldLocusReads, currentContigIndex, currentAlignmentStart));

        downsampleOverlappingGroups();
    }

    // Finalize reads that end before the current position. A fully-finalized grouping is only
    // emitted (removed and appended to finalizedReads) when no earlier grouping is still
    // unfinalized, which preserves coordinate order in the output.
    private void finalizeOutOfScopeReads() {
        Iterator<PositionalReadGrouping> iter = pendingReads.iterator();
        boolean noPrecedingUnfinalizedGroups = true;

        while ( iter.hasNext() ) {
            PositionalReadGrouping currentGroup = iter.next();
            currentGroup.finalizeActiveReadsBeforePosition(currentContigIndex, currentAlignmentStart);

            if ( currentGroup.isFinalized() && noPrecedingUnfinalizedGroups ) {
                iter.remove();
                finalizedReads.addAll(currentGroup.getFinalizedReads());
            }
            else {
                noPrecedingUnfinalizedGroups = false;
            }
        }
    }

    // Reduce total active coverage across all pending groupings towards targetCoverage,
    // removing reads as evenly as possible across the groupings.
    private void downsampleOverlappingGroups() {
        // count the active reads in each grouping (finalized groupings contribute 0)
        int[] groupReadCounts = new int[pendingReads.size()];
        int totalCoverage = 0;
        int numActiveGroups = 0;

        int currentGroup = 0;
        for ( PositionalReadGrouping group : pendingReads ) {
            groupReadCounts[currentGroup] = group.numActiveReads();
            totalCoverage += groupReadCounts[currentGroup];

            if ( groupReadCounts[currentGroup] > 0 ) {
                numActiveGroups++;
            }

            currentGroup++;
        }

        if ( totalCoverage <= targetCoverage ) {
            return;
        }

        // removal budget is capped so every active grouping keeps at least one read;
        // this also guarantees the round-robin loop below terminates
        int numReadsToRemove = Math.min(totalCoverage - targetCoverage, totalCoverage - numActiveGroups);
        currentGroup = 0;

        // spread removals round-robin, only taking from groupings that have more than one read left
        while ( numReadsToRemove > 0 ) {
            if ( groupReadCounts[currentGroup] > 1 ) {
                groupReadCounts[currentGroup]--;
                numReadsToRemove--;
            }

            currentGroup = (currentGroup + 1) % groupReadCounts.length;
        }

        // apply the adjusted per-grouping counts
        currentGroup = 0;
        for ( PositionalReadGrouping group : pendingReads ) {
            if ( ! group.isFinalized() ) {
                group.downsampleActiveReads(groupReadCounts[currentGroup]);
            }

            currentGroup++;
        }
    }

    /**
     * All reads sharing one alignment start. Reads move from "active" (may still be removed by
     * downsampling) to "finalized" (guaranteed to be emitted) as the current position moves past
     * their alignment ends.
     */
    private class PositionalReadGrouping {
        // reads still eligible for removal by downsampling
        private List<T> activeReads;

        // reads guaranteed to survive, waiting to be collected via getFinalizedReads()
        private List<T> finalizedReads;

        // alignment start position shared by all reads in this grouping
        private int contig;
        private int alignmentStart;

        public PositionalReadGrouping( Collection<T> reads, int contig, int alignmentStart ) {
            activeReads = new LinkedList<T>(reads);
            finalizedReads = new ArrayList<T>();
            this.contig = contig;
            this.alignmentStart = alignmentStart;
        }

        public int numActiveReads() {
            return activeReads.size();
        }

        // a grouping is finalized once no active reads remain
        public boolean isFinalized() {
            return activeReads.size() == 0;
        }

        public List<T> getFinalizedReads() {
            return finalizedReads;
        }

        /**
         * Finalize active reads that end before the given position (all of them if the
         * position is on a different contig).
         */
        public void finalizeActiveReadsBeforePosition( int contig, int position ) {
            if ( this.contig != contig ) {
                finalizeAllActiveReads();
                return;
            }

            Iterator<T> iter = activeReads.iterator();
            while ( iter.hasNext() ) {
                T read = iter.next();
                if ( read.getAlignmentEnd() < position ) {
                    iter.remove();
                    finalizedReads.add(read);
                }
            }
        }

        public void finalizeAllActiveReads() {
            finalizedReads.addAll(activeReads);
            activeReads.clear();
        }

        /**
         * Randomly discard active reads so that exactly numReadsToKeep remain.
         *
         * @param numReadsToKeep number of active reads to retain; must be in [0, numActiveReads()]
         */
        public void downsampleActiveReads( int numReadsToKeep ) {
            if ( numReadsToKeep > activeReads.size() || numReadsToKeep < 0 ) {
                throw new ReviewedStingException(String.format("Cannot retain %d reads out of %d total reads",
                numReadsToKeep, activeReads.size()));
            }

            // choose which slots survive via sampling without replacement
            BitSet itemsToKeep = new BitSet(activeReads.size());
            for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(activeReads.size(), numReadsToKeep) ) {
                itemsToKeep.set(selectedIndex);
            }

            int currentIndex = 0;
            Iterator<T> iter = activeReads.iterator();
            while ( iter.hasNext() ) {
                T read = iter.next();
                if ( ! itemsToKeep.get(currentIndex) ) {
                    iter.remove();
                }
                currentIndex++;
            }
        }
    }
}

View File

@ -0,0 +1,40 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
/**
 * An extension of the basic downsampler API with reads-specific operations
 *
 * @author David Roazen
 */
public interface ReadsDownsampler<T extends SAMRecord> extends Downsampler<T> {
    /**
     * Does this downsampler require that reads be fed to it in coordinate order?
     *
     * @return true if reads must be submitted in coordinate sort order, otherwise false
     */
    public boolean requiresCoordinateSortOrder();
}

View File

@ -0,0 +1,106 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
 * Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with
 * every read in the stream having an equal chance of being selected for inclusion.
 *
 * An implementation of "Algorithm R" from the paper "Random Sampling with a Reservoir" (Jeffrey Scott Vitter, 1985)
 *
 * @author David Roazen
 */
public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {

    // the reservoir of reads selected so far (at most maxSamples of them)
    private ArrayList<T> samples;

    // maximum number of reads to retain
    private int maxSamples;

    // how many reads have been offered to this downsampler since the last clear()
    private int readsSeen;

    /**
     * @param targetSampleSize number of reads to select; must be positive
     * @throws ReviewedStingException if targetSampleSize is not positive
     */
    public ReservoirDownsampler ( int targetSampleSize ) {
        if ( targetSampleSize <= 0 ) {
            throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0");
        }

        maxSamples = targetSampleSize;
        clear();
    }

    public void submit ( T newRead ) {
        readsSeen++;

        // the first maxSamples reads fill the reservoir unconditionally
        if ( readsSeen <= maxSamples ) {
            samples.add(newRead);
            return;
        }

        // afterwards, read i replaces a random occupant with probability maxSamples / i
        int candidateSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(readsSeen);
        if ( candidateSlot < maxSamples ) {
            samples.set(candidateSlot, newRead);
        }
    }

    public void submit ( Collection<T> newReads ) {
        for ( T read : newReads ) {
            submit(read);
        }
    }

    public boolean hasDownsampledItems() {
        return ! samples.isEmpty();
    }

    public List<T> consumeDownsampledItems() {
        // hand over the internal list and start a fresh reservoir, rather than copying
        List<T> result = samples;
        clear();
        return result;
    }

    public boolean hasPendingItems() {
        // every read's fate is decided on submission; nothing is ever pending
        return false;
    }

    public void signalEndOfInput() {
        // NO-OP: there are never pending items to finalize
    }

    public void clear() {
        samples = new ArrayList<T>(maxSamples);
        readsSeen = 0;
    }

    public boolean requiresCoordinateSortOrder() {
        return false;
    }
}

View File

@ -38,10 +38,7 @@ import org.broadinstitute.sting.utils.variantcontext.writer.Options;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.*;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.List;
@ -58,6 +55,8 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
*/
private static Logger logger = Logger.getLogger(VariantContextWriterStorage.class);
private final static int BUFFER_SIZE = 1048576;
protected final File file;
protected OutputStream stream;
protected final VariantContextWriter writer;
@ -93,7 +92,7 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
if ( stub.isCompressed() )
stream = new BlockCompressedOutputStream(file);
else
stream = new PrintStream(file);
stream = new PrintStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE));
}
catch(IOException ex) {
throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex);

View File

@ -51,6 +51,8 @@ import java.util.List;
* @version 0.1
*/
public class VariantContextWriterStub implements Stub<VariantContextWriter>, VariantContextWriter {
public final static boolean UPDATE_CONTIG_HEADERS = true;
/**
* The engine, central to the GATK's processing.
*/
@ -215,7 +217,8 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
vcfHeader.addMetaDataLine(commandLineArgHeaderLine);
}
//vcfHeader = VCFUtils.withUpdatedContigs(vcfHeader, engine);
if ( UPDATE_CONTIG_HEADERS )
vcfHeader = VCFUtils.withUpdatedContigs(vcfHeader, engine);
}
outputTracker.getStorage(this).writeHeader(vcfHeader);

View File

@ -40,9 +40,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.ReservoirDownsampler;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileupImpl;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
@ -63,7 +61,6 @@ public class LocusIteratorByState extends LocusIterator {
// member fields
//
// -----------------------------------------------------------------------------------------------------------------
private boolean hasExtendedEvents = false; // will be set to true if at least one read had an indel right before the current position
/**
* Used to create new GenomeLocs.
@ -92,26 +89,10 @@ public class LocusIteratorByState extends LocusIterator {
// stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended
// events immediately preceding the current reference base).
boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases?
// the only purpose of this flag is to shield away a few additional lines of code
// when extended piles are not needed, it may not be even worth it...
byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels)
int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events
byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the
// current base on the ref. We use a counter-like variable here since clearing the indel event is
// delayed by one base, so we need to remember how long ago we have seen the actual event
int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the
// event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly,
// we cache it here mainly for convenience
public SAMRecordState(SAMRecord read, boolean extended) {
public SAMRecordState(SAMRecord read) {
this.read = read;
cigar = read.getCigar();
nCigarElements = cigar.numCigarElements();
generateExtendedEvents = extended;
//System.out.printf("Creating a SAMRecordState: %s%n", this);
}
@ -150,27 +131,6 @@ public class LocusIteratorByState extends LocusIterator {
return curElement.getOperator();
}
/**
* Returns true if we just stepped over insertion/into a deletion prior to the last return from stepForwardOnGenome.
*
* @return
*/
public boolean hadIndel() {
return (eventLength > 0);
}
public int getEventLength() {
return eventLength;
}
public byte[] getEventBases() {
return insertedBases;
}
public int getReadEventStartOffset() {
return eventStart;
}
public String toString() {
return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement);
}
@ -208,19 +168,6 @@ public class LocusIteratorByState extends LocusIterator {
genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here:
// we do step forward on the ref, and by returning null we also indicate that we are past the read end.
if (generateExtendedEvents && eventDelayedFlag > 0) {
// if we had an indel right before the read ended (i.e. insertion was the last cigar element),
// we keep it until next reference base; then we discard it and this will allow the LocusIterator to
// finally discard this read
eventDelayedFlag--;
if (eventDelayedFlag == 0) {
eventLength = -1; // reset event when we are past it
insertedBases = null;
eventStart = -1;
}
}
return null;
}
}
@ -232,17 +179,6 @@ public class LocusIteratorByState extends LocusIterator {
cigarElementCounter = curElement.getLength();
break;
case I: // insertion w.r.t. the reference
if (generateExtendedEvents) {
// we see insertions only once, when we step right onto them; the position on the read is scrolled
// past the insertion right after that
if (eventDelayedFlag > 1)
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength());
eventLength = curElement.getLength();
eventStart = readOffset;
eventDelayedFlag = 2; // insertion causes re-entry into stepForwardOnGenome, so we set the delay to 2
// System.out.println("Inserted "+(new String (insertedBases)) +" after "+readOffset);
} // continue onto the 'S' case !
case S: // soft clip
cigarElementCounter = curElement.getLength();
readOffset += curElement.getLength();
@ -250,19 +186,6 @@ public class LocusIteratorByState extends LocusIterator {
case D: // deletion w.r.t. the reference
if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads starting in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar");
if (generateExtendedEvents) {
if (cigarElementCounter == 1) {
// generate an extended event only if we just stepped into the deletion (i.e. don't
// generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!)
if (eventDelayedFlag > 1)
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
eventLength = curElement.getLength();
eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only
eventStart = readOffset;
insertedBases = null;
// System.out.println("Deleted "+eventLength +" bases after "+readOffset);
}
}
// should be the same as N case
genomeOffset++;
done = true;
@ -280,21 +203,6 @@ public class LocusIteratorByState extends LocusIterator {
throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
}
if (generateExtendedEvents) {
if (eventDelayedFlag > 0 && done) {
// if we did make a successful step on the ref, decrement delayed flag. If, upon the decrementing the,
// the flag is 1, we are standing on the reference base right after the indel (so we have to keep it).
// Otherwise, we are away from the previous indel and have to clear our memories...
eventDelayedFlag--; // when we notice an indel, we set delayed flag to 2, so now
// if eventDelayedFlag == 1, an indel occured right before the current base
if (eventDelayedFlag == 0) {
eventLength = -1; // reset event when we are past it
insertedBases = null;
eventStart = -1;
}
}
}
return done ? curElement.getOperator() : stepForwardOnGenome();
}
}
@ -374,147 +282,69 @@ public class LocusIteratorByState extends LocusIterator {
// this call will set hasExtendedEvents to true if it picks up a read with indel right before the current position on the ref:
readStates.collectPendingReads();
int size = 0;
int nDeletions = 0;
int nInsertions = 0;
int nMQ0Reads = 0;
final GenomeLoc location = getLocation();
final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
boolean hasBeenSampled = false;
for (final String sample : samples) {
final Iterator<SAMRecordState> iterator = readStates.iterator(sample);
final List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
int size = 0; // number of elements in this sample's pileup
int nDeletions = 0; // number of deletions in this sample's pileup
int nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
// if extended events are requested, and if previous traversal step brought us over an indel in
// at least one read, we emit extended pileup (making sure that it is associated with the previous base,
// i.e. the one right *before* the indel) and do NOT shift the current position on the ref.
// In this case, the subsequent call to next() will emit the normal pileup at the current base
// and shift the position.
if (readInfo.generateExtendedEvents() && hasExtendedEvents) {
Map<String, ReadBackedExtendedEventPileupImpl> fullExtendedEventPileup = new HashMap<String, ReadBackedExtendedEventPileupImpl>();
while (iterator.hasNext()) {
final SAMRecordState state = iterator.next(); // state object with the read/offset information
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element
final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator
final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator
final int readOffset = state.getReadOffset(); // the base offset on this read
// get current location on the reference and decrement it by 1: the indels we just stepped over
// are associated with the *previous* reference base
GenomeLoc loc = genomeLocParser.incPos(getLocation(), -1);
final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION;
final boolean isAfterDeletion = lastOp == CigarOperator.DELETION;
final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION;
final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION;
final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart());
boolean hasBeenSampled = false;
for (final String sample : samples) {
Iterator<SAMRecordState> iterator = readStates.iterator(sample);
List<ExtendedEventPileupElement> indelPile = new ArrayList<ExtendedEventPileupElement>(readStates.size(sample));
hasBeenSampled |= loc.getStart() <= readStates.getDownsamplingExtent(sample);
int nextElementLength = nextElement.getLength();
size = 0;
nDeletions = 0;
nInsertions = 0;
nMQ0Reads = 0;
int maxDeletionLength = 0;
if (op == CigarOperator.N) // N's are never added to any pileup
continue;
while (iterator.hasNext()) {
final SAMRecordState state = iterator.next();
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
final int readOffset = state.getReadOffset(); // the base offset on this read
final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began.
final int eventLength = state.getEventLength();
if (op == CigarOperator.N) // N's are never added to any pileup
continue;
if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref
if (op == CigarOperator.D) {
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
size++;
ExtendedEventPileupElement pileupElement;
if (state.getEventBases() == null) { // Deletion event
nDeletions++;
maxDeletionLength = Math.max(maxDeletionLength, state.getEventLength());
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength);
}
else { // Insertion event
nInsertions++;
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength, state.getEventBases());
}
if (read.getMappingQuality() == 0)
nMQ0Reads++;
indelPile.add(pileupElement);
}
// this read has no indel so add it to the pileup as a NOEVENT:
// a deletion that didn't start here (therefore, not an extended event)
// we add (mis)matches as no events.
else if (op != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci()) {
size++;
indelPile.add(new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), readOffset));
nDeletions++;
if (read.getMappingQuality() == 0)
nMQ0Reads++;
}
}
else {
if (!filterBaseInRead(read, location.getStart())) {
String insertedBaseString = null;
if (nextOp == CigarOperator.I)
insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength()));
if (indelPile.size() != 0)
fullExtendedEventPileup.put(sample, new ReadBackedExtendedEventPileupImpl(loc, indelPile, size, maxDeletionLength, nInsertions, nDeletions, nMQ0Reads));
}
hasExtendedEvents = false; // we are done with extended events prior to current ref base
nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled);
}
else { // this is a regular event pileup (not extended)
final GenomeLoc location = getLocation();
final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
boolean hasBeenSampled = false;
for (final String sample : samples) {
final Iterator<SAMRecordState> iterator = readStates.iterator(sample);
final List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
size = 0; // number of elements in this sample's pileup
nDeletions = 0; // number of deletions in this sample's pileup
nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
while (iterator.hasNext()) {
final SAMRecordState state = iterator.next(); // state object with the read/offset information
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element
final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator
final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator
final int readOffset = state.getReadOffset(); // the base offset on this read
final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION;
final boolean isAfterDeletion = lastOp == CigarOperator.DELETION;
final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION;
final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION;
final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart());
int nextElementLength = nextElement.getLength();
if (op == CigarOperator.N) // N's are never added to any pileup
continue;
if (op == CigarOperator.D) {
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
size++;
nDeletions++;
if (read.getMappingQuality() == 0)
nMQ0Reads++;
}
}
else {
if (!filterBaseInRead(read, location.getStart())) {
String insertedBaseString = null;
if (nextOp == CigarOperator.I)
insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength()));
pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength));
size++;
if (read.getMappingQuality() == 0)
nMQ0Reads++;
}
pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength));
size++;
if (read.getMappingQuality() == 0)
nMQ0Reads++;
}
}
if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup
fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads));
}
updateReadStates(); // critical - must be called after we get the current state offsets and location
if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup
fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads));
}
updateReadStates(); // critical - must be called after we get the current state offsets and location
if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
}
}
@ -546,9 +376,7 @@ public class LocusIteratorByState extends LocusIterator {
while (it.hasNext()) {
SAMRecordState state = it.next();
CigarOperator op = state.stepForwardOnGenome();
if (state.hadIndel() && readInfo.generateExtendedEvents())
hasExtendedEvents = true;
else if (op == null) {
if (op == null) {
// we discard the read only when we are past its end AND indel at the end of the read (if any) was
// already processed. Keeping the read state that retunred null upon stepForwardOnGenome() is safe
// as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag.
@ -757,12 +585,9 @@ public class LocusIteratorByState extends LocusIterator {
int readCount = 0;
for (SAMRecord read : reads) {
if (readCount < maxReads) {
SAMRecordState state = new SAMRecordState(read, readInfo.generateExtendedEvents());
SAMRecordState state = new SAMRecordState(read);
state.stepForwardOnGenome();
newReadStates.add(state);
// TODO: What if we downsample the extended events away?
if (state.hadIndel())
hasExtendedEvents = true;
readCount++;
}
}

View File

@ -251,7 +251,7 @@ public class VariantContextAdaptors {
Map<String, Object> attributes = new HashMap<String, Object>();
Collection<Genotype> genotypes = new ArrayList<Genotype>();
Genotype call = new Genotype(name, genotypeAlleles);
Genotype call = GenotypeBuilder.create(name, genotypeAlleles);
// add the call to the genotype list, and then use this list to create a VariantContext
genotypes.add(call);
@ -344,7 +344,7 @@ public class VariantContextAdaptors {
alleles.add(allele2);
}
Genotype g = new Genotype(samples[i], myAlleles);
Genotype g = GenotypeBuilder.create(samples[i], myAlleles);
genotypes.add(g);
}

View File

@ -53,19 +53,6 @@ public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,Locu
dataProvider.getShard().getReadMetrics().incrementNumIterations();
if ( locus.hasExtendedEventPileup() ) {
// if the alignment context we received holds an "extended" pileup (i.e. pileup of insertions/deletions
// associated with the current site), we need to update the location. The updated location still starts
// at the current genomic position, but it has to span the length of the longest deletion (if any).
location = engine.getGenomeLocParser().setStop(location,location.getStop()+locus.getExtendedEventPileup().getMaxDeletionLength());
// it is possible that the new expanded location spans the current shard boundary; the next method ensures
// that when it is the case, the reference sequence held by the ReferenceView will be reloaded so that
// the view has all the bases we are gonna need. If the location fits within the current view bounds,
// the next call will not do anything to the view:
referenceView.expandBoundsToAccomodateLoc(location);
}
// create reference context. Note that if we have a pileup of "extended events", the context will
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
ReferenceContext refContext = referenceView.getReferenceContext(location);

View File

@ -34,9 +34,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.io.PrintStream;
@ -79,13 +77,11 @@ public class PileupWalker extends LocusWalker<Integer, Integer> implements TreeR
String rods = getReferenceOrderedData( tracker );
if ( context.hasBasePileup() ) {
ReadBackedPileup basePileup = context.getBasePileup();
out.printf("%s %s", basePileup.getPileupString((char)ref.getBase()), rods);
if ( SHOW_VERBOSE )
out.printf(" %s", createVerboseOutput(basePileup));
out.println();
}
ReadBackedPileup basePileup = context.getBasePileup();
out.printf("%s %s", basePileup.getPileupString((char)ref.getBase()), rods);
if ( SHOW_VERBOSE )
out.printf(" %s", createVerboseOutput(basePileup));
out.println();
return 1;
}

View File

@ -30,11 +30,9 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
@ -72,7 +70,7 @@ public class AlleleBalance extends InfoFieldAnnotation {
// we care only about het calls
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
if ( context == null || !context.hasBasePileup() )
if ( context == null )
continue;
final ReadBackedPileup pileup = context.getBasePileup();

View File

@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
@ -21,15 +22,12 @@ import java.util.*;
*/
public class AlleleBalanceBySample extends GenotypeAnnotation implements ExperimentalAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {
public void annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g, final GenotypeBuilder gb) {
Double ratio = annotateSNP(stratifiedContext, vc, g);
if (ratio == null)
return null;
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%.2f", ratio.doubleValue()));
return map;
return;
gb.attribute(getKeyNames().get(0), Double.valueOf(String.format("%.2f", ratio.doubleValue())));
}
private Double annotateSNP(AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {
@ -51,9 +49,6 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim
if ( altAlleles.size() == 0 )
return null;
if ( !stratifiedContext.hasBasePileup() )
return null;
final String bases = new String(stratifiedContext.getBasePileup().getBases());
if ( bases.length() == 0 )
return null;

View File

@ -59,8 +59,6 @@ public class BaseCounts extends InfoFieldAnnotation {
int[] counts = new int[4];
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
if ( !sample.getValue().hasBasePileup() )
continue;
for (byte base : sample.getValue().getBasePileup().getBases() ) {
int index = BaseUtils.simpleBaseToBaseIndex(base);
if ( index != -1 )

View File

@ -44,7 +44,7 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno
int depth = 0;
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() )
depth += sample.getValue().hasBasePileup() ? sample.getValue().getBasePileup().depthOfCoverage() : 0;
depth += sample.getValue().getBasePileup().depthOfCoverage();
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%d", depth));
return map;

View File

@ -1,12 +1,12 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
@ -14,6 +14,7 @@ import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
@ -44,22 +45,17 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
private static final String DEL = "DEL"; // constant, for speed: no need to create a key string for deletion allele every time
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {
public void annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g, GenotypeBuilder gb) {
if ( g == null || !g.isCalled() )
return null;
return;
if ( vc.isSNP() )
return annotateSNP(stratifiedContext, vc);
if ( vc.isIndel() )
return annotateIndel(stratifiedContext, vc);
return null;
annotateSNP(stratifiedContext, vc, gb);
else if ( vc.isIndel() )
annotateIndel(stratifiedContext, vc, gb);
}
private Map<String,Object> annotateSNP(AlignmentContext stratifiedContext, VariantContext vc) {
if ( ! stratifiedContext.hasBasePileup() )
return null;
private void annotateSNP(AlignmentContext stratifiedContext, VariantContext vc, GenotypeBuilder gb) {
HashMap<Byte, Integer> alleleCounts = new HashMap<Byte, Integer>();
for ( Allele allele : vc.getAlleles() )
@ -72,22 +68,18 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
}
// we need to add counts in the correct order
Integer[] counts = new Integer[alleleCounts.size()];
int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(vc.getReference().getBases()[0]);
for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]);
return toADAnnotation(counts);
gb.AD(counts);
}
private Map<String,Object> annotateIndel(AlignmentContext stratifiedContext, VariantContext vc) {
if ( ! stratifiedContext.hasBasePileup() )
return null;
private void annotateIndel(AlignmentContext stratifiedContext, VariantContext vc, GenotypeBuilder gb) {
ReadBackedPileup pileup = stratifiedContext.getBasePileup();
if ( pileup == null )
return null;
return;
final HashMap<String, Integer> alleleCounts = new HashMap<String, Integer>();
alleleCounts.put(REF_ALLELE, 0);
@ -123,16 +115,12 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
}
}
Integer[] counts = new Integer[alleleCounts.size()];
int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(REF_ALLELE);
for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
counts[i+1] = alleleCounts.get( getAlleleRepresentation(vc.getAlternateAllele(i)) );
return toADAnnotation(counts);
}
private final Map<String, Object> toADAnnotation(final Integer[] counts) {
return Collections.singletonMap(getKeyNames().get(0), (Object)Arrays.asList(counts));
gb.AD(counts);
}
private String getAlleleRepresentation(Allele allele) {
@ -145,7 +133,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
}
// public String getIndelBases()
public List<String> getKeyNames() { return Arrays.asList("AD"); }
public List<String> getKeyNames() { return Arrays.asList(VCFConstants.GENOTYPE_ALLELE_DEPTHS); }
public List<VCFFormatHeaderLine> getDescriptions() {
return Arrays.asList(

View File

@ -296,7 +296,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
for ( String sample : stratifiedContexts.keySet() ) {
final AlignmentContext context = stratifiedContexts.get(sample);
if ( context == null || !context.hasBasePileup() )
if ( context == null )
continue;
final ReadBackedPileup pileup = context.getBasePileup();

View File

@ -74,9 +74,6 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
final int locus = ref.getLocus().getStart() + (ref.getLocus().getStop() - ref.getLocus().getStart()) / 2;
if ( !context.hasBasePileup() )
return null;
final ReadBackedPileup pileup = context.getBasePileup();
// Compute all haplotypes consistent with the current read pileup
@ -86,7 +83,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
if (haplotypes != null) {
for (final Genotype genotype : vc.getGenotypes()) {
final AlignmentContext thisContext = stratifiedContexts.get(genotype.getSampleName());
if (thisContext != null && thisContext.hasBasePileup()) {
if (thisContext != null) {
final ReadBackedPileup thisPileup = thisContext.getBasePileup();
if (vc.isSNP())
scoreRA.add(scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus)); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense

View File

@ -31,9 +31,6 @@ public class LowMQ extends InfoFieldAnnotation {
double total = 0;
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() )
{
if ( !sample.getValue().hasBasePileup() )
continue;
for ( PileupElement p : sample.getValue().getBasePileup() )
{
if ( p.getMappingQual() == 0 ) { mq0 += 1; }

View File

@ -31,12 +31,10 @@ public class MappingQualityZero extends InfoFieldAnnotation implements StandardA
int mq0 = 0;
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
final AlignmentContext context = sample.getValue();
if ( context.hasBasePileup() ) {
final ReadBackedPileup pileup = context.getBasePileup();
for (PileupElement p : pileup ) {
if ( p.getMappingQual() == 0 )
mq0++;
}
final ReadBackedPileup pileup = context.getBasePileup();
for (PileupElement p : pileup ) {
if ( p.getMappingQual() == 0 )
mq0++;
}
}
Map<String, Object> map = new HashMap<String, Object>();

View File

@ -36,33 +36,30 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Count for each sample of mapping quality zero reads
*/
public class MappingQualityZeroBySample extends GenotypeAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker,
AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext context, VariantContext vc, Genotype g) {
public void annotate(RefMetaDataTracker tracker,
AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext context,
VariantContext vc, Genotype g, GenotypeBuilder gb) {
if ( g == null || !g.isCalled() )
return null;
return;
int mq0 = 0;
if ( context.hasBasePileup() ) {
final ReadBackedPileup pileup = context.getBasePileup();
for (PileupElement p : pileup ) {
if ( p.getMappingQual() == 0 )
mq0++;
}
final ReadBackedPileup pileup = context.getBasePileup();
for (PileupElement p : pileup ) {
if ( p.getMappingQual() == 0 )
mq0++;
}
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%d", mq0));
return map;
gb.attribute(getKeyNames().get(0), mq0);
}
public List<String> getKeyNames() { return Arrays.asList(VCFConstants.MAPPING_QUALITY_ZERO_KEY); }

View File

@ -31,12 +31,10 @@ public class MappingQualityZeroFraction extends InfoFieldAnnotation implements E
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
AlignmentContext context = sample.getValue();
depth += context.size();
if ( context.hasBasePileup() ) {
final ReadBackedPileup pileup = context.getBasePileup();
for (PileupElement p : pileup ) {
if ( p.getMappingQual() == 0 )
mq0++;
}
final ReadBackedPileup pileup = context.getBasePileup();
for (PileupElement p : pileup ) {
if ( p.getMappingQual() == 0 )
mq0++;
}
}
if (depth > 0) {

View File

@ -28,15 +28,13 @@ public class NBaseCount extends InfoFieldAnnotation {
int countRegularBaseSolid = 0;
for( final AlignmentContext context : stratifiedContexts.values() ) {
if ( context.hasBasePileup() ) { // must be called as getBasePileup may throw error when pileup has no bases
for( final PileupElement p : context.getBasePileup()) {
final String platform = p.getRead().getReadGroup().getPlatform();
if( platform != null && platform.toUpperCase().contains("SOLID") ) {
if( BaseUtils.isNBase( p.getBase() ) ) {
countNBaseSolid++;
} else if( BaseUtils.isRegularBase( p.getBase() ) ) {
countRegularBaseSolid++;
}
for( final PileupElement p : context.getBasePileup()) {
final String platform = p.getRead().getReadGroup().getPlatform();
if( platform != null && platform.toUpperCase().contains("SOLID") ) {
if( BaseUtils.isNBase( p.getBase() ) ) {
countNBaseSolid++;
} else if( BaseUtils.isRegularBase( p.getBase() ) ) {
countRegularBaseSolid++;
}
}
}

View File

@ -48,7 +48,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
if ( context == null )
continue;
depth += context.hasBasePileup() ? context.getBasePileup().depthOfCoverage() : 0;
depth += context.getBasePileup().depthOfCoverage();
}
if ( depth == 0 )

View File

@ -42,12 +42,10 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
AlignmentContext context = sample.getValue();
if ( context.hasBasePileup() ) {
final ReadBackedPileup pileup = context.getBasePileup();
for (PileupElement p : pileup ) {
if ( p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE )
qualities[index++] = p.getMappingQual();
}
final ReadBackedPileup pileup = context.getBasePileup();
for (PileupElement p : pileup ) {
if ( p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE )
qualities[index++] = p.getMappingQual();
}
}

View File

@ -63,9 +63,6 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
continue;
}
if (!context.hasBasePileup())
continue;
final ReadBackedPileup pileup = context.getBasePileup();
if (pileup == null)
continue;

View File

@ -35,11 +35,9 @@ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAn
int depth = 0;
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
AlignmentContext context = sample.getValue();
if ( context.hasBasePileup() ) {
final ReadBackedPileup pileup = context.getBasePileup();
deletions += pileup.getNumberOfDeletions();
depth += pileup.getNumberOfElements();
}
final ReadBackedPileup pileup = context.getBasePileup();
deletions += pileup.getNumberOfDeletions();
depth += pileup.getNumberOfElements();
}
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%.2f", depth == 0 ? 0.0 : (double)deletions/(double)depth));

View File

@ -39,18 +39,16 @@ public class TechnologyComposition extends InfoFieldAnnotation implements Experi
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
AlignmentContext context = sample.getValue();
if ( context.hasBasePileup() ) {
final ReadBackedPileup pileup = context.getBasePileup();
for ( PileupElement p : pileup ) {
if(ReadUtils.is454Read(p.getRead()))
reads454++;
else if (ReadUtils.isSOLiDRead(p.getRead()))
readsSolid++;
else if (ReadUtils.isIlluminaRead(p.getRead()))
readsIllumina++;
else
readsOther++;
}
final ReadBackedPileup pileup = context.getBasePileup();
for ( PileupElement p : pileup ) {
if(ReadUtils.is454Read(p.getRead()))
reads454++;
else if (ReadUtils.isSOLiDRead(p.getRead()))
readsSolid++;
else if (ReadUtils.isIlluminaRead(p.getRead()))
readsIllumina++;
else
readsOther++;
}
}

View File

@ -305,12 +305,10 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
// if the reference base is not ambiguous, we can annotate
Map<String, AlignmentContext> stratifiedContexts;
if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
if ( context.hasBasePileup() ) {
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup());
annotatedVCs = new ArrayList<VariantContext>(VCs.size());
for ( VariantContext vc : VCs )
annotatedVCs.add(engine.annotateContext(tracker, ref, stratifiedContexts, vc));
}
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup());
annotatedVCs = new ArrayList<VariantContext>(VCs.size());
for ( VariantContext vc : VCs )
annotatedVCs.add(engine.annotateContext(tracker, ref, stratifiedContexts, vc));
}
for ( VariantContext annotatedVC : annotatedVCs )

View File

@ -261,24 +261,22 @@ public class VariantAnnotatorEngine {
}
private GenotypesContext annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if ( requestedGenotypeAnnotations.size() == 0 )
if ( requestedGenotypeAnnotations.isEmpty() )
return vc.getGenotypes();
GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples());
final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples());
for ( final Genotype genotype : vc.getGenotypes() ) {
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
if ( context == null ) {
genotypes.add(genotype);
continue;
} else {
final GenotypeBuilder gb = new GenotypeBuilder(genotype);
for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
annotation.annotate(tracker, walker, ref, context, vc, genotype, gb);
}
genotypes.add(gb.make());
}
Map<String, Object> genotypeAnnotations = new HashMap<String, Object>(genotype.getAttributes());
for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
Map<String, Object> result = annotation.annotate(tracker, walker, ref, context, vc, genotype);
if ( result != null )
genotypeAnnotations.putAll(result);
}
genotypes.add(new Genotype(genotype.getSampleName(), genotype.getAlleles(), genotype.getLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.isPhased()));
}
return genotypes;

View File

@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.List;
@ -13,8 +14,9 @@ import java.util.Map;
public abstract class GenotypeAnnotation extends VariantAnnotatorAnnotation {
// return annotations for the given contexts/genotype split by sample
public abstract Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker,
ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g);
public abstract void annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker,
ReferenceContext ref, AlignmentContext stratifiedContext,
VariantContext vc, Genotype g, GenotypeBuilder gb );
// return the descriptions used for the VCF FORMAT meta field
public abstract List<VCFFormatHeaderLine> getDescriptions();

View File

@ -204,8 +204,6 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
}
for ( final Genotype g : vc_input.getGenotypes() ) {
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
boolean genotypeIsPhased = true;
String sample = g.getSampleName();
@ -271,7 +269,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
// Compute new GQ field = -10*log10Pr(Genotype call is wrong)
// Beagle gives probability that genotype is AA, AB and BB.
// Which, by definition, are prob of hom ref, het and hom var.
Double probWrongGenotype, genotypeQuality;
double probWrongGenotype, genotypeQuality;
Double homRefProbability = Double.valueOf(beagleProbabilities.get(0));
Double hetProbability = Double.valueOf(beagleProbabilities.get(1));
Double homVarProbability = Double.valueOf(beagleProbabilities.get(2));
@ -300,7 +298,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
else
genotypeQuality = log10(probWrongGenotype);
HashMap<String,Object> originalAttributes = new HashMap<String,Object>(g.getAttributes());
HashMap<String,Object> originalAttributes = new HashMap<String,Object>(g.getExtendedAttributes());
// get original encoding and add to keynotype attributes
String a1, a2, og;
@ -328,7 +326,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
else {
originalAttributes.put("OG",".");
}
Genotype imputedGenotype = new Genotype(g.getSampleName(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased);
Genotype imputedGenotype = new GenotypeBuilder(g).alleles(alleles).log10PError(genotypeQuality).attributes(originalAttributes).phased(genotypeIsPhased).make();
if ( imputedGenotype.isHet() || imputedGenotype.isHomVar() ) {
beagleVarCounts++;
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.*;
@ -26,147 +27,135 @@ import java.util.*;
*/
public class BQSRKeyManager {
private final List<Covariate> requiredCovariates;
private final List<Covariate> optionalCovariates;
private final List<RequiredCovariateInfo> requiredCovariatesInfo;
private final List<OptionalCovariateInfo> optionalCovariatesInfo;
private final Covariate[] requiredCovariates;
private final Covariate[] optionalCovariates;
private final RequiredCovariateInfo[] requiredCovariatesInfo;
private final OptionalCovariateInfo[] optionalCovariatesInfo;
private final Map<String, Short> covariateNameToIDMap;
private int nRequiredBits; // Number of bits used to represent the required covariates
private int nOptionalBits; // Number of bits used to represent the standard covaraites
private final int nOptionalIDBits; // Number of bits used to represent the optional covariates IDs
private final int totalNumberOfBits; // Sum of all of the above plus the event bits
private final BitSet optionalCovariateMask; // Standard mask for optional covariates bitset
private final BitSet optionalCovariateIDMask; // Standard mask for optional covariates order bitset
private final int optionalCovariateOffset;
private final int optionalCovariateIDOffset;
private final long optionalCovariateMask; // Standard mask for optional covariates key
private final long optionalCovariateIDMask; // Standard mask for optional covariates order key
private final long eventIDMask; // Standard mask for event ID
/**
* Initializes the KeyManager with the total number of covariates to use
*
* @param requiredCovariates the ordered list of required covariates
* @param optionalCovariates the ordered list of optional covariates
*/
public BQSRKeyManager(List<Covariate> requiredCovariates, List<Covariate> optionalCovariates) {
this.requiredCovariates = new ArrayList<Covariate>(requiredCovariates);
this.optionalCovariates = new ArrayList<Covariate>(optionalCovariates);
requiredCovariatesInfo = new ArrayList<RequiredCovariateInfo>(requiredCovariates.size()); // initialize the required covariates list
optionalCovariatesInfo = new ArrayList<OptionalCovariateInfo>(optionalCovariates.size()); // initialize the optional covariates list (size may be 0, it's okay)
public BQSRKeyManager(final List<Covariate> requiredCovariates, final List<Covariate> optionalCovariates) {
this.requiredCovariates = new Covariate[requiredCovariates.size()];
this.optionalCovariates = new Covariate[optionalCovariates.size()];
requiredCovariatesInfo = new RequiredCovariateInfo[requiredCovariates.size()]; // initialize the required covariates list
optionalCovariatesInfo = new OptionalCovariateInfo[optionalCovariates.size()]; // initialize the optional covariates list (size may be 0, it's okay)
covariateNameToIDMap = new HashMap<String, Short>(optionalCovariates.size()*2); // the map from covariate name to covariate id (when reading GATK Reports, we get the IDs as names of covariates)
nRequiredBits = 0;
for (Covariate required : requiredCovariates) { // create a list of required covariates with the extra information for key management
int nBits = required.numberOfBits(); // number of bits used by this covariate
BitSet mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate
requiredCovariatesInfo.add(new RequiredCovariateInfo(nRequiredBits, mask, required)); // Create an object for this required covariate
for (int i = 0; i < requiredCovariates.size(); i++) { // create a list of required covariates with the extra information for key management
final Covariate required = requiredCovariates.get(i);
final int nBits = required.numberOfBits(); // number of bits used by this covariate
final long mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate
this.requiredCovariates[i] = required;
requiredCovariatesInfo[i] = new RequiredCovariateInfo(nBits, nRequiredBits, mask, required); // Create an object for this required covariate
nRequiredBits += nBits;
}
final int bitsInEventType = numberOfBitsToRepresent(EventType.values().length);
eventIDMask = genericMask(nRequiredBits, bitsInEventType);
short id = 0;
nOptionalBits = 0;
for (Covariate optional : optionalCovariates) {
int nBits = optional.numberOfBits(); // number of bits used by this covariate
nOptionalBits = Math.max(nOptionalBits, nBits); // optional covariates are represented by the number of bits needed by biggest covariate
BitSet optionalID = bitSetFromId(id); // calculate the optional covariate ID for this covariate
optionalCovariatesInfo.add(new OptionalCovariateInfo(optionalID, optional)); // optional covariates have standardized mask and number of bits, so no need to store in the RequiredCovariateInfo object
String covariateName = optional.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport
int nOptionalBits = 0;
for (int i = 0; i < optionalCovariates.size(); i++) {
final Covariate optional = optionalCovariates.get(i);
nOptionalBits = Math.max(nOptionalBits, optional.numberOfBits()); // optional covariates are represented by the number of bits needed by biggest covariate
this.optionalCovariates[i] = optional;
optionalCovariatesInfo[i] = new OptionalCovariateInfo(id, optional);
final String covariateName = optional.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport
covariateNameToIDMap.put(covariateName, id);
id++;
}
nOptionalIDBits = BitSetUtils.numberOfBitsToRepresent(optionalCovariates.size()); // number of bits used to represent the covariate ID
optionalCovariateMask = genericMask(nRequiredBits, nOptionalBits); // the generic mask to extract optional covariate bits from the combined bitset
optionalCovariateIDMask = genericMask(nRequiredBits + nOptionalBits, nOptionalIDBits); // the generic mask to extract optional covariate ID bits from the combined bitset
totalNumberOfBits = nRequiredBits + nOptionalBits + nOptionalIDBits + bitsInEventType(); // total number of bits used in the final key
optionalCovariateOffset = nRequiredBits + bitsInEventType;
optionalCovariateMask = genericMask(optionalCovariateOffset, nOptionalBits); // the generic mask to extract optional covariate bits from the combined bitset
optionalCovariateIDOffset = nRequiredBits + bitsInEventType + nOptionalBits;
final int nOptionalIDBits = numberOfBitsToRepresent(optionalCovariates.size()); // number of bits used to represent the covariate ID
optionalCovariateIDMask = genericMask(optionalCovariateIDOffset, nOptionalIDBits); // the generic mask to extract optional covariate ID bits from the combined bitset
final int totalNumberOfBits = optionalCovariateIDOffset + nOptionalIDBits; // total number of bits used in the final key
if ( totalNumberOfBits > 64 )
throw new UserException.BadInput("The total number of bits used for the master BQSR key is greater than 64 and cannot be represented in a long");
}
/**
* Generates one key per optional covariate.
* Generates one key given the optional covariate (or none if it is null)
*
* Keys include all required covariates, the standard covariate and the event type.
*
* Example allKeys:
* RG, QUAL, CYCLE, CONTEXT
*
* List of BitSets returned by this example (given eventType):
* RG, QUAL, CYCLE, EVENT
* RG, QUAL, CONTEXT, EVENT
*
* Note: If there are no optional covariates, only one bitset key will be returned with all the required covariates and the event type
*
* @param allKeys The keys in bitset representation for each covariate
* @param eventType The type of event described by this keyset (e.g. mismatches, insertions, deletions)
* @return one key in bitset representation per covariate
* @param allKeys The keys in long representation for each covariate (includes all optional covariates, not just the one requested)
* @param eventType The type of event described by this keyset (e.g. mismatches, insertions, deletions)
* @return one key in long representation (non-negative) or -1 for a bad key
*/
public List<BitSet> bitSetsFromAllKeys(BitSet[] allKeys, EventType eventType) {
List<BitSet> allBitSets = new ArrayList<BitSet>(); // Generate one key per optional covariate
public long createMasterKey(final long[] allKeys, final EventType eventType, final int optionalCovariateIndex) {
BitSet eventBitSet = bitSetFromEvent(eventType); // create a bitset with the event type
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // Location in the bit set to add the event type bits
int covariateIndex = 0;
BitSet requiredKey = new BitSet(nRequiredBits); // This will be a bitset holding all the required keys, to replicate later on
int keyIndex = 0;
long masterKey = 0L; // This will be a master key holding all the required keys, to replicate later on
for (RequiredCovariateInfo infoRequired : requiredCovariatesInfo)
addBitSetToKeyAtLocation(requiredKey, allKeys[covariateIndex++], infoRequired.bitsBefore); // Add all the required covariates to the key set
masterKey |= (allKeys[keyIndex++] << infoRequired.offset);
for (OptionalCovariateInfo infoOptional : optionalCovariatesInfo) {
BitSet covariateKey = allKeys[covariateIndex++]; // get the bitset from all keys
if (covariateKey == null)
continue; // do not add nulls to the final set of keys.
final long eventKey = keyFromEvent(eventType); // create a key for the event type
masterKey |= (eventKey << nRequiredBits);
BitSet optionalKey = new BitSet(totalNumberOfBits); // create a new key for this optional covariate
optionalKey.or(requiredKey); // import all the required covariates
addBitSetToKeyAtLocation(optionalKey, covariateKey, nRequiredBits); // add the optional covariate right after the required covariates
addBitSetToKeyAtLocation(optionalKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite
addBitSetToKeyAtLocation(optionalKey, eventBitSet, eventTypeBitIndex); // Add the event type
allBitSets.add(optionalKey); // add this key to the list of keys
if (optionalCovariateIndex >= 0 && optionalCovariateIndex < optionalCovariates.length) {
final long covariateKey = allKeys[keyIndex + optionalCovariateIndex];
if (covariateKey < 0) // do not add "nulls" to the final set of keys
return -1;
masterKey |= (covariateKey << optionalCovariateOffset);
masterKey |= (optionalCovariatesInfo[optionalCovariateIndex].covariateID << optionalCovariateIDOffset);
}
if (optionalCovariatesInfo.size() == 0) { // special case when we have no optional covariates, add the event type to the required key (our only key)
addBitSetToKeyAtLocation(requiredKey, eventBitSet, eventTypeBitIndex); // Add the event type
allBitSets.add(requiredKey); // add this key to the list of keys
}
return allBitSets;
return masterKey;
}
/**
* Generates one bitset key for the covariates represented in Object[] key
* Generates one key for the covariates represented in Object[] key
*
* The covariates will have the actual objects produced by the covariates (probably read from the recalibration data file)
* and will contain all required covariates and one (or none) optional covariates. Therefore, the product is one bitset key, not many.
* and will contain all required covariates and one (or none) optional covariates. Therefore, the product is one key, not many.
*
* Example key:
* RG, QUAL, CYCLE, CYCLE_ID, EventType
*
* @param key list of objects produced by the required covariates followed by one or zero optional covariates.
* @return a bitset key representing these objects. Bitset encryption is done using the covariate's interface.
* @return a key representing these objects.
*/
public BitSet bitSetFromKey(Object[] key) {
BitSet bitSetKey = new BitSet(totalNumberOfBits);
public long longFromKey(Object[] key) {
int requiredCovariate = 0;
for (RequiredCovariateInfo infoRequired : requiredCovariatesInfo) {
BitSet covariateBitSet = infoRequired.covariate.bitSetFromKey(key[requiredCovariate++]); // create a bitset from the object key provided using the required covariate's interface
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, infoRequired.bitsBefore); // add it to the bitset key
}
if (optionalCovariatesInfo.size() > 0) {
int optionalCovariate = requiredCovariatesInfo.size(); // the optional covariate index in the key array
int covariateIDIndex = optionalCovariate + 1; // the optional covariate ID index is right after the optional covariate's
int covariateID = parseCovariateID(key[covariateIDIndex]); // when reading the GATK Report the ID may come in a String instead of an index
OptionalCovariateInfo infoOptional = optionalCovariatesInfo.get(covariateID); // so we can get the optional covariate information
BitSet covariateBitSet = infoOptional.covariate.bitSetFromKey(key[optionalCovariate]); // convert the optional covariate key into a bitset using the covariate's interface
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, nRequiredBits); // add the optional covariate right after the required covariates
addBitSetToKeyAtLocation(bitSetKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite
}
int eventIndex = key.length - 1; // the event type is always the last key
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // location in the bit set to add the event type bits
BitSet eventBitSet = bitSetFromEvent((EventType) key[eventIndex]); // get the bit set representation of the event type
addBitSetToKeyAtLocation(bitSetKey, eventBitSet, eventTypeBitIndex); // add the event type
long masterKey = 0L; // This will be a master key holding all the required keys, to replicate later on
for (RequiredCovariateInfo infoRequired : requiredCovariatesInfo)
masterKey |= (infoRequired.covariate.longFromKey(key[requiredCovariate++]) << infoRequired.offset);
return bitSetKey;
final int eventIndex = key.length - 1; // the event type is always the last key
final long eventKey = keyFromEvent((EventType) key[eventIndex]); // create a key for the event type
masterKey |= (eventKey << nRequiredBits);
if (optionalCovariatesInfo.length > 0) {
final int covariateIndex = requiredCovariatesInfo.length; // the optional covariate index in the key array
final int covariateIDIndex = covariateIndex + 1; // the optional covariate ID index is right after the optional covariate's
final short covariateID = parseCovariateID(key[covariateIDIndex]); // when reading the GATK Report the ID may come in a String instead of an index
final OptionalCovariateInfo infoOptional = optionalCovariatesInfo[covariateID]; // so we can get the optional covariate information
final long covariateKey = infoOptional.covariate.longFromKey(key[covariateIndex]); // convert the optional covariate key into a bitset using the covariate's interface
masterKey |= (covariateKey << optionalCovariateOffset);
masterKey |= (infoOptional.covariateID << optionalCovariateIDOffset);
}
return masterKey;
}
/**
@ -176,116 +165,82 @@ public class BQSRKeyManager {
* @param id the string or short representation of the optional covariate id
* @return the short representation of the optional covariate id.
*/
private short parseCovariateID(Object id) {
private short parseCovariateID(final Object id) {
return (id instanceof String) ? covariateNameToIDMap.get(id.toString()) : (Short) id;
}
/**
* Generates a key set of objects from a combined bitset key.
* Generates a key set of objects from a combined master key.
*
* Masks out each covariate independently and decodes their values (Object) into a keyset
*
* @param key the bitset representation of the keys
* @param master the master representation of the keys
* @return an object array with the values for each key
*/
public List<Object> keySetFrom(BitSet key) {
List<Object> objectKeys = new ArrayList<Object>();
public List<Object> keySetFrom(final long master) {
final List<Object> objectKeys = new ArrayList<Object>();
for (RequiredCovariateInfo info : requiredCovariatesInfo) {
BitSet covariateBitSet = extractBitSetFromKey(key, info.mask, info.bitsBefore); // get the covariate's bitset
objectKeys.add(info.covariate.keyFromBitSet(covariateBitSet)); // convert the bitset to object using covariate's interface
final long covariateKey = extractKeyFromMaster(master, info.mask, info.offset); // get the covariate's key
objectKeys.add(info.covariate.formatKey(covariateKey)); // convert the key to object using covariate's interface
}
if (optionalCovariatesInfo.size() > 0) {
BitSet covBitSet = extractBitSetFromKey(key, optionalCovariateMask, nRequiredBits); // mask out the covariate bit set
BitSet idbs = extractBitSetFromKey(key, optionalCovariateIDMask, nRequiredBits + nOptionalBits); // mask out the covariate order (to identify which covariate this is)
short id = BitSetUtils.shortFrom(idbs); // covert the id bitset into a short
Covariate covariate = optionalCovariatesInfo.get(id).covariate; // get the corresponding optional covariate object
objectKeys.add(covariate.keyFromBitSet(covBitSet)); // add the optional covariate to the key set
if (optionalCovariatesInfo.length > 0) {
final long covKey = extractKeyFromMaster(master, optionalCovariateMask, optionalCovariateOffset); // get the covariate's key
final int covIDKey = (int)extractKeyFromMaster(master, optionalCovariateIDMask, optionalCovariateIDOffset); // get the covariate's id (to identify which covariate this is)
Covariate covariate = optionalCovariatesInfo[(short)covIDKey].covariate; // get the corresponding optional covariate object
objectKeys.add(covariate.formatKey(covKey)); // add the optional covariate key to the key set
objectKeys.add(covariate.getClass().getSimpleName().split("Covariate")[0]); // add the covariate name using the id
}
objectKeys.add(eventFromBitSet(key)); // add the event type object to the key set
objectKeys.add(EventType.eventFrom((int)extractKeyFromMaster(master, eventIDMask, nRequiredBits))); // add the event type object to the key set
return objectKeys;
}
public List<Covariate> getRequiredCovariates() {
public Covariate[] getRequiredCovariates() {
return requiredCovariates;
}
public List<Covariate> getOptionalCovariates() {
public Covariate[] getOptionalCovariates() {
return optionalCovariates;
}
/**
* Translates a masked bitset into a bitset starting at 0
*
* @param key the masked out bitset
* @param n the number of bits to chop
* @return a translated bitset starting at 0 for the covariate machinery to decode
*/
private BitSet chopNBitsFrom(BitSet key, int n) {
BitSet choppedKey = new BitSet();
for (int i = key.nextSetBit(0); i >= 0; i = key.nextSetBit(i + 1))
choppedKey.set(i - n); // Set every bit translocated to the beginning of the BitSet
return choppedKey;
public int getNumRequiredCovariates() {
return requiredCovariates.length;
}
public int getNumOptionalCovariates() {
return optionalCovariates.length;
}
/**
* Creates a mask for the requested covariate to extract the relevant bitset from a combined bitset key
* Creates a mask for the requested covariate to extract the relevant key from a combined master key
*
* @param leadingBits the index of the covariate in the ordered covariate list
* @param nBits the number of bits needed by the Covariate to represent its values in BitSet form
* @return the bitset relevant to the covariate
* @param offset the offset into the master key
* @param nBits the number of bits needed by the Covariate to represent its values
* @return the mask relevant to the covariate
*/
private BitSet genericMask(int leadingBits, int nBits) {
BitSet mask = new BitSet(leadingBits + nBits);
mask.set(leadingBits, leadingBits + nBits);
private long genericMask(final int offset, final int nBits) {
long mask = 0L;
for ( int i = 0; i < nBits; i++ )
mask |= 1L << (offset+i);
return mask;
}
/**
* Decodes the event type (enum) from the full bitset key
*
* @param fullKey the full key of all covariates + event type
* @return the decoded event type.
*/
private EventType eventFromBitSet(BitSet fullKey) {
BitSet eventKey = new BitSet();
int firstBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits;
for (int i = fullKey.nextSetBit(firstBitIndex); i >= 0; i = fullKey.nextSetBit(i + 1))
eventKey.set(i - firstBitIndex);
return EventType.eventFrom(BitSetUtils.shortFrom(eventKey));
private long extractKeyFromMaster(final long master, final long mask, final int offset) {
long key = master & mask;
return key >> offset;
}
// cache the BitSet representing an event since it's otherwise created a massive amount of times
private static final Map<EventType, BitSet> eventTypeCache = new HashMap<EventType, BitSet>(EventType.values().length);
// cache the key representing an event since it's otherwise created a massive amount of times
private static final long[] eventTypeCache = new long[EventType.values().length]; // event IDs must be longs so that bit-fiddling works
static {
for (final EventType eventType : EventType.values())
eventTypeCache.put(eventType, BitSetUtils.bitSetFrom(eventType.index));
eventTypeCache[eventType.index] = (long)eventType.index;
}
private BitSet bitSetFromEvent(final EventType eventType) {
return eventTypeCache.get(eventType);
}
private BitSet bitSetFromId(final short id) {
return BitSetUtils.bitSetFrom(id);
}
private int bitsInEventType() {
return BitSetUtils.numberOfBitsToRepresent(EventType.values().length);
}
private void addBitSetToKeyAtLocation(BitSet key, BitSet bitSet, int location) {
for (int j = bitSet.nextSetBit(0); j >= 0; j = bitSet.nextSetBit(j + 1))
key.set(j + location); // translate the bits set in the key to their corresponding position in the full key
}
private BitSet extractBitSetFromKey (BitSet key, BitSet mask, int leadingBits) {
BitSet bitSet = (BitSet) key.clone();
bitSet.and(mask);
return chopNBitsFrom(bitSet, leadingBits);
private long keyFromEvent(final EventType eventType) {
return eventTypeCache[eventType.index];
}
@Override
@ -297,22 +252,22 @@ public class BQSRKeyManager {
if (this == other)
return true;
if (requiredCovariatesInfo.size() != other.requiredCovariatesInfo.size() ||
optionalCovariatesInfo.size() != other.optionalCovariatesInfo.size())
if (requiredCovariatesInfo.length != other.requiredCovariatesInfo.length ||
optionalCovariatesInfo.length != other.optionalCovariatesInfo.length)
return false;
for (int i = 0; i < requiredCovariates.size(); i++) {
Covariate myRequiredCovariate = requiredCovariates.get(i);
Covariate otherRequiredCovariate = other.requiredCovariates.get(i);
for (int i = 0; i < requiredCovariates.length; i++) {
Covariate myRequiredCovariate = requiredCovariates[i];
Covariate otherRequiredCovariate = other.requiredCovariates[i];
String thisName = myRequiredCovariate.getClass().getSimpleName();
String otherName = otherRequiredCovariate.getClass().getSimpleName();
if (!thisName.equals(otherName))
return false;
}
for (int i = 0; i < optionalCovariates.size(); i++) {
Covariate myOptionalCovariate = optionalCovariates.get(i);
Covariate otherOptionalCovariate = other.optionalCovariates.get(i);
for (int i = 0; i < optionalCovariates.length; i++) {
Covariate myOptionalCovariate = optionalCovariates[i];
Covariate otherOptionalCovariate = other.optionalCovariates[i];
String thisName = myOptionalCovariate.getClass().getSimpleName();
String otherName = otherOptionalCovariate.getClass().getSimpleName();
if (!thisName.equals(otherName))
@ -322,27 +277,50 @@ public class BQSRKeyManager {
return true;
}
/**
* Calculates the number of bits necessary to represent a given number of elements
*
* @param numberOfElements the number of elements to represent (must be positive)
* @return the number of bits necessary to represent this many elements
*/
public static int numberOfBitsToRepresent(long numberOfElements) {
if (numberOfElements < 0)
throw new ReviewedStingException("Number of elements must be positive: " + numberOfElements);
if (numberOfElements == 1L)
return 1; // special case
int n = 0;
numberOfElements--;
while (numberOfElements > 0) {
numberOfElements = numberOfElements >> 1;
n++;
}
return n;
}
/**
* Aggregate information for each Covariate
*/
class RequiredCovariateInfo {
public final int bitsBefore; // number of bits before this covariate in the combined bitset key
public final BitSet mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits )
private static class RequiredCovariateInfo {
public final int nBits; // number of bits for this key
public final int offset; // the offset into the master key
public final long mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits )
public final Covariate covariate; // this allows reverse lookup of the Covariates in order
RequiredCovariateInfo(int bitsBefore, BitSet mask, Covariate covariate) {
this.bitsBefore = bitsBefore;
RequiredCovariateInfo(final int nBits, final int offset, final long mask, final Covariate covariate) {
this.nBits = nBits;
this.offset = offset;
this.mask = mask;
this.covariate = covariate;
}
}
class OptionalCovariateInfo {
public final BitSet covariateID; // cache the covariate ID
private static class OptionalCovariateInfo {
public final long covariateID; // cache the covariate ID (must be a long so that bit-fiddling works)
public final Covariate covariate;
OptionalCovariateInfo(BitSet covariateID, Covariate covariate) {
OptionalCovariateInfo(final long covariateID, final Covariate covariate) {
this.covariateID = covariateID;
this.covariate = covariate;
}

View File

@ -26,15 +26,12 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.clipping.ClippingRepresentation;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Arrays;
import java.util.BitSet;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
@ -44,8 +41,7 @@ import java.util.BitSet;
public class ContextCovariate implements StandardCovariate {
private int mismatchesContextSize;
private int insertionsContextSize;
private int deletionsContextSize;
private int indelsContextSize;
private byte LOW_QUAL_TAIL;
@ -53,42 +49,33 @@ public class ContextCovariate implements StandardCovariate {
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {
mismatchesContextSize = RAC.MISMATCHES_CONTEXT_SIZE;
insertionsContextSize = RAC.INSERTIONS_CONTEXT_SIZE;
deletionsContextSize = RAC.DELETIONS_CONTEXT_SIZE;
indelsContextSize = RAC.INDELS_CONTEXT_SIZE;
if (mismatchesContextSize > MAX_DNA_CONTEXT)
throw new UserException.BadArgumentValue("mismatches_context_size", String.format("context size cannot be bigger than %d, but was %d", MAX_DNA_CONTEXT, mismatchesContextSize));
if (indelsContextSize > MAX_DNA_CONTEXT)
throw new UserException.BadArgumentValue("indels_context_size", String.format("context size cannot be bigger than %d, but was %d", MAX_DNA_CONTEXT, indelsContextSize));
LOW_QUAL_TAIL = RAC.LOW_QUAL_TAIL;
if (mismatchesContextSize <= 0 || insertionsContextSize <= 0 || deletionsContextSize <= 0)
throw new UserException(String.format("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Insertions: %d Deletions:%d", mismatchesContextSize, insertionsContextSize, deletionsContextSize));
if (mismatchesContextSize <= 0 || indelsContextSize <= 0)
throw new UserException(String.format("Context size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Indels: %d", mismatchesContextSize, indelsContextSize));
}
@Override
public CovariateValues getValues(final GATKSAMRecord read) {
int l = read.getReadLength();
BitSet[] mismatches = new BitSet[l];
BitSet[] insertions = new BitSet[l];
BitSet[] deletions = new BitSet[l];
public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); // Write N's over the low quality tail of the reads to avoid adding them into the context
final GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); // Write N's over the low quality tail of the reads to avoid adding them into the context
final boolean negativeStrand = clippedRead.getReadNegativeStrandFlag();
byte[] bases = clippedRead.getReadBases();
if (negativeStrand)
bases = BaseUtils.simpleReverseComplement(bases);
for (int i = 0; i < clippedRead.getReadLength(); i++) {
mismatches[i] = contextWith(bases, i, mismatchesContextSize);
insertions[i] = contextWith(bases, i, insertionsContextSize);
deletions[i] = contextWith(bases, i, deletionsContextSize);
final int readLength = clippedRead.getReadLength();
for (int i = 0; i < readLength; i++) {
final long indelKey = contextWith(bases, i, indelsContextSize);
values.addCovariate(contextWith(bases, i, mismatchesContextSize), indelKey, indelKey, (negativeStrand ? readLength - i - 1 : i));
}
if (negativeStrand) {
reverse(mismatches);
reverse(insertions);
reverse(deletions);
}
return new CovariateValues(mismatches, insertions, deletions);
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@ -98,21 +85,21 @@ public class ContextCovariate implements StandardCovariate {
}
@Override
public String keyFromBitSet(BitSet key) {
if (key == null) // this can only happen in test routines because we do not propagate null keys to the csv file
public String formatKey(final long key) {
if (key == -1) // this can only happen in test routines because we do not propagate null keys to the csv file
return null;
return BitSetUtils.dnaFrom(key);
return contextFromKey(key);
}
@Override
public BitSet bitSetFromKey(Object key) {
return BitSetUtils.bitSetFrom((String) key);
public long longFromKey(Object key) {
return keyFromContext((String) key);
}
@Override
public int numberOfBits() {
return Long.bitCount(-1L);
return Integer.bitCount(Integer.MAX_VALUE);
}
/**
@ -121,29 +108,132 @@ public class ContextCovariate implements StandardCovariate {
* @param bases the bases in the read to build the context from
* @param offset the position in the read to calculate the context for
* @param contextSize context size to use building the context
* @return the bitSet representing the Context
* @return the key representing the context
*/
private BitSet contextWith(byte[] bases, int offset, int contextSize) {
BitSet result = null;
if (offset - contextSize + 1 >= 0) {
final byte[] context = Arrays.copyOfRange(bases, offset - contextSize + 1, offset + 1);
if (!BaseUtils.containsBase(context, BaseUtils.N))
result = BitSetUtils.bitSetFrom(context);
}
private long contextWith(final byte[] bases, final int offset, final int contextSize) {
final int start = offset - contextSize + 1;
final long result;
if (start >= 0)
result = keyFromContext(bases, start, offset + 1);
else
result = -1L;
return result;
}
public static long keyFromContext(final String dna) {
return keyFromContext(dna.getBytes(), 0, dna.length());
}
/**
* Reverses the given array in place.
* Creates a long representation of a given dna string.
*
* @param array any array
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
* be longer than 31 bases.
*
* The bit representation of a dna string is the simple:
* 0 A 4 AA 8 CA
* 1 C 5 AC ...
* 2 G 6 AG 1343 TTGGT
* 3 T 7 AT 1364 TTTTT
*
* To convert from dna to number, we convert the dna string to base10 and add all combinations that
* preceded the string (with smaller lengths).
*
* @param dna the dna sequence
* @return the key representing the dna sequence
*/
private static void reverse(final Object[] array) {
final int arrayLength = array.length;
for (int l = 0, r = arrayLength - 1; l < r; l++, r--) {
final Object temp = array[l];
array[l] = array[r];
array[r] = temp;
public static long keyFromContext(final byte[] dna, final int start, final int end) {
final long preContext = combinationsPerLength[end - start - 1]; // the sum of all combinations that preceded the length of the dna string
long baseTen = 0L; // the number in base_10 that we are going to use to generate the bit set
for (int i = start; i < end; i++) {
baseTen = (baseTen << 2); // multiply by 4
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(dna[i]);
if (baseIndex == -1) // ignore non-ACGT bases
return -1L;
baseTen += (long)baseIndex;
}
return baseTen + preContext; // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
}
static final private int MAX_DNA_CONTEXT = 31; // the maximum context size (number of bases) permitted in the "long bitset" implementation of the DNA <=> BitSet conversion.
static final long[] combinationsPerLength = new long[MAX_DNA_CONTEXT + 1]; // keeps the memoized table with the number of combinations for each given DNA context length
static {
for (int i = 0; i < MAX_DNA_CONTEXT + 1; i++)
computeCombinationsFor(i);
}
/**
* The sum of all combinations of a context of a given length from length = 0 to length.
*
* Memoized implementation of sum(4^i) , where i=[0,length]
*
* @param length the length of the DNA context
*/
private static void computeCombinationsFor(final int length) {
long combinations = 0L;
for (int i = 1; i <= length; i++)
combinations += (1L << 2 * i); // add all combinations with 4^i ( 4^i is the same as 2^(2*i) )
combinationsPerLength[length] = combinations;
}
/**
* Converts a key into the dna string representation.
*
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
* be longer than 31 bases.
*
* We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the
* base_10 representation of the sequence. This is important for us to know how to bring the number
* to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented
* as 0's and leading 0's are omitted).
*
* quasi-canonical because A is represented by a 0, therefore,
* instead of : 0, 1, 2, 3, 10, 11, 12, ...
* we have : 0, 1, 2, 3, 00, 01, 02, ...
*
* but we can correctly decode it because we know the final length.
*
* @param key the key representing the dna sequence
* @return the dna sequence represented by the key
*/
public static String contextFromKey(long key) {
if (key < 0)
throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?");
final int length = contextLengthFor(key); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls)
key -= combinationsPerLength[length - 1]; // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation
StringBuilder dna = new StringBuilder();
while (key > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical)
final byte base = (byte) (key & 3); // equivalent to (key % 4)
dna.append((char)BaseUtils.baseIndexToSimpleBase(base));
key = key >> 2; // divide by 4
}
for (int j = dna.length(); j < length; j++)
dna.append('A'); // add leading A's as necessary (due to the "quasi" canonical status, see description above)
return dna.reverse().toString(); // make sure to reverse the string since we should have been pre-pending all along
}
/**
* Calculates the length of the DNA context for a given base 10 number
*
* It is important to know the length given the base 10 number to calculate the number of combinations
* and to disambiguate the "quasi-canonical" state.
*
* This method also calculates the number of combinations as a by-product, but since it memoizes the
* results, a subsequent call to combinationsFor(length) is O(1).
*
* @param number the base 10 representation of the key
* @return the length of the DNA context represented by this number
*/
private static int contextLengthFor(final long number) {
int length = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet.
long combinations = combinationsPerLength[length]; // the next context (we advance it so we know which one was preceding it).
while (combinations <= number) { // find the length of the dna string (length)
length++;
combinations = combinationsPerLength[length]; // calculate the next context
}
return length;
}
}

View File

@ -2,8 +2,6 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.BitSet;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -45,15 +43,15 @@ public interface Covariate {
*
* @param RAC the recalibration argument collection
*/
public void initialize(RecalibrationArgumentCollection RAC);
public void initialize(final RecalibrationArgumentCollection RAC);
/**
* Calculates covariate values for all positions in the read.
*
* @param read the read to calculate the covariates on.
* @return all the covariate values for every base in the read.
* @param read the read to calculate the covariates on.
* @param values the object to record the covariate values for every base in the read.
*/
public CovariateValues getValues(GATKSAMRecord read);
public void recordValues(final GATKSAMRecord read, final ReadCovariates values);
/**
* Used to get the covariate's value from input csv file during on-the-fly recalibration
@ -61,26 +59,26 @@ public interface Covariate {
* @param str the key in string type (read from the csv)
* @return the key in it's correct type.
*/
public Object getValue(String str);
public Object getValue(final String str);
/**
* Converts the bitset representation of the key (used internally for table indexing) to String format for file output.
* Converts the internal representation of the key to String format for file output.
*
* @param key the bitset representation of the key
* @param key the long representation of the key
* @return a string representation of the key
*/
public String keyFromBitSet(BitSet key);
public String formatKey(final long key);
/**
* Converts a key into a bitset
* Converts an Object key into a long key using only the lowest numberOfBits() bits
*
* Only necessary for on-the-fly recalibration when you have the object, but need to store it in memory in bitset format. For counting covariates
* the getValues method already returns all values in BitSet format.
* Only necessary for on-the-fly recalibration when you have the object, but need to store it in memory in long format. For counting covariates
* the getValues method already returns all values in long format.
*
* @param key the object corresponding to the covariate
* @return a bitset representation of the object
* @return a long representation of the object
*/
public BitSet bitSetFromKey(Object key);
public long longFromKey(final Object key);
/**
* Each covariate should determine how many bits are necessary to encode it's data

View File

@ -1,39 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import java.util.BitSet;
/**
* An object to hold the different covariate values for all bases in the read.
*
* Currently we have three different covariates for each read:
* - Mismatch
* - Insertion
* - Deletion
*
* @author Mauricio Carneiro
* @since 2/8/12
*/
public class CovariateValues {
private final BitSet[] mismatches;
private final BitSet[] insertions;
private final BitSet[] deletions;
public CovariateValues(BitSet[] mismatch, BitSet[] insertion, BitSet[] deletion) {
this.mismatches = mismatch;
this.insertions = insertion;
this.deletions = deletion;
}
public BitSet[] getMismatches() {
return mismatches;
}
public BitSet[] getInsertions() {
return insertions;
}
public BitSet[] getDeletions() {
return deletions;
}
}

View File

@ -1,12 +1,10 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.NGSPlatform;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.BitSet;
import java.util.EnumSet;
/*
@ -60,18 +58,18 @@ public class CycleCovariate implements StandardCovariate {
// Used to pick out the covariate's value from attributes of the read
@Override
public CovariateValues getValues(final GATKSAMRecord read) {
BitSet[] cycles = new BitSet[read.getReadLength()];
public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
final int readLength = read.getReadLength();
final NGSPlatform ngsPlatform = read.getNGSPlatform();
// Discrete cycle platforms
if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) {
final short readOrderFactor = read.getReadPairedFlag() && read.getSecondOfPairFlag() ? (short) -1 : 1;
final short increment;
short cycle;
final int readOrderFactor = read.getReadPairedFlag() && read.getSecondOfPairFlag() ? -1 : 1;
final int increment;
int cycle;
if (read.getReadNegativeStrandFlag()) {
cycle = (short) (read.getReadLength() * readOrderFactor);
increment = (short) (-1 * readOrderFactor);
cycle = readLength * readOrderFactor;
increment = -1 * readOrderFactor;
}
else {
cycle = readOrderFactor;
@ -79,9 +77,10 @@ public class CycleCovariate implements StandardCovariate {
}
final int CUSHION = 4;
final int MAX_CYCLE = read.getReadLength() - CUSHION - 1;
for (int i = 0; i < MAX_CYCLE; i++) {
cycles[i] = (i<CUSHION || i>MAX_CYCLE) ? null : BitSetUtils.bitSetFrom(cycle);
final int MAX_CYCLE = readLength - CUSHION - 1;
for (int i = 0; i < readLength; i++) {
final long key = (i<CUSHION || i>MAX_CYCLE) ? -1L : keyFromCycle(cycle);
values.addCovariate(key, key, key, i);
cycle += increment;
}
}
@ -89,7 +88,6 @@ public class CycleCovariate implements StandardCovariate {
// Flow cycle platforms
else if (FLOW_CYCLE_PLATFORMS.contains(ngsPlatform)) {
final int readLength = read.getReadLength();
final byte[] bases = read.getReadBases();
// Differentiate between first and second of pair.
@ -100,7 +98,7 @@ public class CycleCovariate implements StandardCovariate {
// the current sequential model would consider the effects independently instead of jointly.
final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag();
short cycle = multiplyByNegative1 ? (short) -1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms.
int cycle = multiplyByNegative1 ? -1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms.
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
// For example, AAAAAAA was probably read in two flow cycles but here we count it as one
@ -108,19 +106,23 @@ public class CycleCovariate implements StandardCovariate {
int iii = 0;
while (iii < readLength) {
while (iii < readLength && bases[iii] == (byte) 'T') {
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii++;
}
while (iii < readLength && bases[iii] == (byte) 'A') {
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii++;
}
while (iii < readLength && bases[iii] == (byte) 'C') {
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii++;
}
while (iii < readLength && bases[iii] == (byte) 'G') {
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii++;
}
if (iii < readLength) {
@ -130,7 +132,8 @@ public class CycleCovariate implements StandardCovariate {
cycle++;
}
if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) {
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii++;
}
@ -140,19 +143,23 @@ public class CycleCovariate implements StandardCovariate {
int iii = readLength - 1;
while (iii >= 0) {
while (iii >= 0 && bases[iii] == (byte) 'T') {
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii--;
}
while (iii >= 0 && bases[iii] == (byte) 'A') {
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii--;
}
while (iii >= 0 && bases[iii] == (byte) 'C') {
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii--;
}
while (iii >= 0 && bases[iii] == (byte) 'G') {
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii--;
}
if (iii >= 0) {
@ -162,7 +169,8 @@ public class CycleCovariate implements StandardCovariate {
cycle++;
}
if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) {
cycles[iii] = BitSetUtils.bitSetFrom(cycle);
final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii--;
}
}
@ -173,28 +181,38 @@ public class CycleCovariate implements StandardCovariate {
else {
throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid");
}
return new CovariateValues(cycles, cycles, cycles);
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
public final Object getValue(final String str) {
return Short.parseShort(str);
return Integer.parseInt(str);
}
@Override
public String keyFromBitSet(BitSet key) {
return String.format("%d", BitSetUtils.shortFrom(key));
public String formatKey(final long key) {
long cycle = key >> 1; // shift so we can remove the "sign" bit
if ( (key & 1) != 0 ) // is the last bit set?
cycle *= -1; // then the cycle is negative
return String.format("%d", cycle);
}
@Override
public BitSet bitSetFromKey(Object key) {
return (key instanceof String) ? BitSetUtils.bitSetFrom(Short.parseShort((String) key)) : BitSetUtils.bitSetFrom((Short) key);
public long longFromKey(final Object key) {
return (key instanceof String) ? keyFromCycle(Integer.parseInt((String) key)) : keyFromCycle((Integer) key);
}
@Override
public int numberOfBits() {
return BitSetUtils.numberOfBitsToRepresent(2 * Short.MAX_VALUE); // positive and negative
return Integer.bitCount(Integer.MAX_VALUE);
}
private static long keyFromCycle(final int cycle) {
// no negative values because values must fit into the first few bits of the long
long result = Math.abs(cycle);
result = result << 1; // shift so we can add the "sign" bit
if ( cycle < 0 )
result++; // negative cycles get the lower-most bit set
return result;
}
}

View File

@ -1,11 +1,8 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.BitSet;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -43,28 +40,17 @@ public class QualityScoreCovariate implements RequiredCovariate {
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {
}
public void initialize(final RecalibrationArgumentCollection RAC) {}
@Override
public CovariateValues getValues(final GATKSAMRecord read) {
int readLength = read.getReadLength();
BitSet[] mismatches = new BitSet[readLength];
BitSet[] insertions = new BitSet[readLength];
BitSet[] deletions = new BitSet[readLength];
byte[] baseQualities = read.getBaseQualities();
byte[] baseInsertionQualities = read.getBaseInsertionQualities();
byte[] baseDeletionQualities = read.getBaseDeletionQualities();
public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
final byte[] baseQualities = read.getBaseQualities();
final byte[] baseInsertionQualities = read.getBaseInsertionQualities();
final byte[] baseDeletionQualities = read.getBaseDeletionQualities();
for (int i = 0; i < baseQualities.length; i++) {
mismatches[i] = BitSetUtils.bitSetFrom(baseQualities[i]);
insertions[i] = BitSetUtils.bitSetFrom(baseInsertionQualities[i]);
deletions[i] = BitSetUtils.bitSetFrom(baseDeletionQualities[i]);
values.addCovariate((long)baseQualities[i], (long)baseInsertionQualities[i], (long)baseDeletionQualities[i], i);
}
return new CovariateValues(mismatches, insertions, deletions);
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@ -74,17 +60,17 @@ public class QualityScoreCovariate implements RequiredCovariate {
}
@Override
public String keyFromBitSet(BitSet key) {
return String.format("%d", BitSetUtils.longFrom(key));
public String formatKey(final long key) {
return String.format("%d", key);
}
@Override
public BitSet bitSetFromKey(Object key) {
return (key instanceof String) ? BitSetUtils.bitSetFrom(Byte.parseByte((String) key)) : BitSetUtils.bitSetFrom((Byte) key);
public long longFromKey(final Object key) {
return (key instanceof String) ? (long)Byte.parseByte((String) key) : (long)(Byte) key;
}
@Override
public int numberOfBits() {
return BitSetUtils.numberOfBitsToRepresent(QualityUtils.MAX_QUAL_SCORE);
return BQSRKeyManager.numberOfBitsToRepresent(QualityUtils.MAX_QUAL_SCORE);
}
}

View File

@ -6,7 +6,6 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.recalibration.QualQuantizer;
import java.util.Arrays;
import java.util.BitSet;
import java.util.List;
import java.util.Map;
@ -31,15 +30,15 @@ public class QuantizationInfo {
this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals));
}
public QuantizationInfo(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, int quantizationLevels) {
public QuantizationInfo(Map<BQSRKeyManager, Map<Long, RecalDatum>> keysAndTablesMap, int quantizationLevels) {
final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution
for (int i = 0; i < qualHistogram.length; i++)
qualHistogram[i] = 0L;
Map<BitSet, RecalDatum> qualTable = null; // look for the quality score table
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
Map<Long, RecalDatum> qualTable = null; // look for the quality score table
for (Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
BQSRKeyManager keyManager = entry.getKey();
if (keyManager.getRequiredCovariates().size() == 2) // it should be the only one with 2 required covaraites
if (keyManager.getNumRequiredCovariates() == 2) // it should be the only one with 2 required covariates
qualTable = entry.getValue();
}

View File

@ -2,8 +2,6 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.BitSet;
/**
* The object temporarily held by a read that describes all of it's covariates.
*
@ -13,27 +11,29 @@ import java.util.BitSet;
* @since 2/8/12
*/
public class ReadCovariates {
private final BitSet[][] mismatchesKeySet;
private final BitSet[][] insertionsKeySet;
private final BitSet[][] deletionsKeySet;
private final long[][] mismatchesKeySet;
private final long[][] insertionsKeySet;
private final long[][] deletionsKeySet;
private int nextCovariateIndex;
private int currentCovariateIndex = 0;
public ReadCovariates(int readLength, int numberOfCovariates) {
this.mismatchesKeySet = new BitSet[readLength][numberOfCovariates];
this.insertionsKeySet = new BitSet[readLength][numberOfCovariates];
this.deletionsKeySet = new BitSet[readLength][numberOfCovariates];
this.nextCovariateIndex = 0;
this.mismatchesKeySet = new long[readLength][numberOfCovariates];
this.insertionsKeySet = new long[readLength][numberOfCovariates];
this.deletionsKeySet = new long[readLength][numberOfCovariates];
}
public void addCovariate(CovariateValues covariate) {
transposeCovariateValues(mismatchesKeySet, covariate.getMismatches());
transposeCovariateValues(insertionsKeySet, covariate.getInsertions());
transposeCovariateValues(deletionsKeySet, covariate.getDeletions());
nextCovariateIndex++;
public void setCovariateIndex(final int index) {
currentCovariateIndex = index;
}
public BitSet[] getKeySet(final int readPosition, final EventType errorModel) {
public void addCovariate(final long mismatch, final long insertion, final long deletion, final int readOffset) {
mismatchesKeySet[readOffset][currentCovariateIndex] = mismatch;
insertionsKeySet[readOffset][currentCovariateIndex] = insertion;
deletionsKeySet[readOffset][currentCovariateIndex] = deletion;
}
public long[] getKeySet(final int readPosition, final EventType errorModel) {
switch (errorModel) {
case BASE_SUBSTITUTION:
return getMismatchesKeySet(readPosition);
@ -46,35 +46,30 @@ public class ReadCovariates {
}
}
public BitSet[] getMismatchesKeySet(int readPosition) {
public long[] getMismatchesKeySet(final int readPosition) {
return mismatchesKeySet[readPosition];
}
public BitSet[] getInsertionsKeySet(int readPosition) {
public long[] getInsertionsKeySet(final int readPosition) {
return insertionsKeySet[readPosition];
}
public BitSet[] getDeletionsKeySet(int readPosition) {
public long[] getDeletionsKeySet(final int readPosition) {
return deletionsKeySet[readPosition];
}
private void transposeCovariateValues(BitSet[][] keySet, BitSet[] covariateValues) {
for (int i = 0; i < covariateValues.length; i++)
keySet[i][nextCovariateIndex] = covariateValues[i];
}
/**
* Testing routines
*/
protected BitSet[][] getMismatchesKeySet() {
protected long[][] getMismatchesKeySet() {
return mismatchesKeySet;
}
protected BitSet[][] getInsertionsKeySet() {
protected long[][] getInsertionsKeySet() {
return insertionsKeySet;
}
protected BitSet[][] getDeletionsKeySet() {
protected long[][] getDeletionsKeySet() {
return deletionsKeySet;
}
}

View File

@ -1,11 +1,8 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
/*
@ -43,23 +40,22 @@ import java.util.HashMap;
public class ReadGroupCovariate implements RequiredCovariate {
private final HashMap<String, Short> readGroupLookupTable = new HashMap<String, Short>();
private final HashMap<Short, String> readGroupReverseLookupTable = new HashMap<Short, String>();
private short nextId = 0;
private final HashMap<String, Long> readGroupLookupTable = new HashMap<String, Long>();
private final HashMap<Long, String> readGroupReverseLookupTable = new HashMap<Long, String>();
private long nextId = 0L;
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {
}
public void initialize(final RecalibrationArgumentCollection RAC) {}
@Override
public CovariateValues getValues(final GATKSAMRecord read) {
final int l = read.getReadLength();
public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
final String readGroupId = readGroupValueFromRG(read.getReadGroup());
BitSet rg = bitSetForReadGroup(readGroupId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset
BitSet[] readGroups = new BitSet[l];
Arrays.fill(readGroups, rg);
return new CovariateValues(readGroups, readGroups, readGroups);
final long key = keyForReadGroup(readGroupId);
final int l = read.getReadLength();
for (int i = 0; i < l; i++)
values.addCovariate(key, key, key, i);
}
@Override
@ -68,35 +64,28 @@ public class ReadGroupCovariate implements RequiredCovariate {
}
@Override
public String keyFromBitSet(BitSet key) {
return decodeReadGroup((short) BitSetUtils.longFrom(key));
public String formatKey(final long key) {
return readGroupReverseLookupTable.get(key);
}
@Override
public BitSet bitSetFromKey(Object key) {
return bitSetForReadGroup((String) key);
public long longFromKey(Object key) {
return keyForReadGroup((String) key);
}
@Override
public int numberOfBits() {
return BitSetUtils.numberOfBitsToRepresent(Short.MAX_VALUE);
return BQSRKeyManager.numberOfBitsToRepresent(Short.MAX_VALUE);
}
private String decodeReadGroup(final short id) {
return readGroupReverseLookupTable.get(id);
}
private BitSet bitSetForReadGroup(String readGroupId) {
short shortId;
if (readGroupLookupTable.containsKey(readGroupId))
shortId = readGroupLookupTable.get(readGroupId);
else {
shortId = nextId;
private long keyForReadGroup(final String readGroupId) {
if (!readGroupLookupTable.containsKey(readGroupId)) {
readGroupLookupTable.put(readGroupId, nextId);
readGroupReverseLookupTable.put(nextId, readGroupId);
nextId++;
}
return BitSetUtils.bitSetFrom(shortId);
}
return readGroupLookupTable.get(readGroupId);
}
/**
@ -105,8 +94,8 @@ public class ReadGroupCovariate implements RequiredCovariate {
* @param rg the read group record
* @return platform unit or readgroup id
*/
private String readGroupValueFromRG(GATKSAMReadGroupRecord rg) {
String platformUnit = rg.getPlatformUnit();
private String readGroupValueFromRG(final GATKSAMReadGroupRecord rg) {
final String platformUnit = rg.getPlatformUnit();
return platformUnit == null ? rg.getId() : platformUnit;
}

View File

@ -149,17 +149,17 @@ public class RecalDataManager {
* @param optionalCovariates list of optional covariates (in order)
* @return a map with each key manager and it's corresponding recalibration table properly initialized
*/
public static LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> initializeTables(ArrayList<Covariate> requiredCovariates, ArrayList<Covariate> optionalCovariates) {
final LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> tablesAndKeysMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>();
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
public static LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> initializeTables(ArrayList<Covariate> requiredCovariates, ArrayList<Covariate> optionalCovariates) {
final LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> tablesAndKeysMap = new LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>>();
final ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
final ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
for (Covariate covariate : requiredCovariates) {
requiredCovariatesToAdd.add(covariate);
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(); // initializing a new recal table for each required covariate (cumulatively)
final Map<Long, RecalDatum> recalTable = new HashMap<Long, RecalDatum>(); // initializing a new recal table for each required covariate (cumulatively)
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
}
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(Short.MAX_VALUE); // initializing a new recal table to hold all optional covariates
final Map<Long, RecalDatum> recalTable = new HashMap<Long, RecalDatum>(Short.MAX_VALUE); // initializing a new recal table to hold all optional covariates
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
return tablesAndKeysMap;
@ -181,7 +181,7 @@ public class RecalDataManager {
final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>(RequiredCovariate.class).getPlugins();
final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>(StandardCovariate.class).getPlugins();
ArrayList<Covariate> requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates
final ArrayList<Covariate> requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates
ArrayList<Covariate> optionalCovariates = new ArrayList<Covariate>();
if (argumentCollection.USE_STANDARD_COVARIATES)
optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user
@ -223,7 +223,7 @@ public class RecalDataManager {
logger.info("");
}
private static List<GATKReportTable> generateReportTables(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap) {
private static List<GATKReportTable> generateReportTables(Map<BQSRKeyManager, Map<Long, RecalDatum>> keysAndTablesMap) {
List<GATKReportTable> result = new LinkedList<GATKReportTable>();
int tableIndex = 0;
@ -235,23 +235,23 @@ public class RecalDataManager {
final Pair<String, String> nObservations = new Pair<String, String>(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d");
final Pair<String, String> nErrors = new Pair<String, String>(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME, "%d");
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
BQSRKeyManager keyManager = entry.getKey();
Map<BitSet, RecalDatum> recalTable = entry.getValue();
for (Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
final BQSRKeyManager keyManager = entry.getKey();
final Map<Long, RecalDatum> recalTable = entry.getValue();
boolean isReadGroupTable = tableIndex == 0; // special case for the read group table so we can print the extra column it needs.
final boolean isReadGroupTable = tableIndex == 0; // special case for the read group table so we can print the extra column it needs.
List<Covariate> requiredList = keyManager.getRequiredCovariates(); // ask the key manager what required covariates were used in this recal table
List<Covariate> optionalList = keyManager.getOptionalCovariates(); // ask the key manager what optional covariates were used in this recal table
final Covariate[] requiredList = keyManager.getRequiredCovariates(); // ask the key manager what required covariates were used in this recal table
final Covariate[] optionalList = keyManager.getOptionalCovariates(); // ask the key manager what optional covariates were used in this recal table
ArrayList<Pair<String, String>> columnNames = new ArrayList<Pair<String, String>>(); // initialize the array to hold the column names
final ArrayList<Pair<String, String>> columnNames = new ArrayList<Pair<String, String>>(); // initialize the array to hold the column names
for (Covariate covariate : requiredList) {
String name = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the covariate names and put them in order
for (final Covariate covariate : requiredList) {
final String name = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the covariate names and put them in order
columnNames.add(new Pair<String,String>(name, "%s")); // save the required covariate name so we can reference it in the future
}
if (optionalList.size() > 0) {
if (optionalList.length > 0) {
columnNames.add(covariateValue);
columnNames.add(covariateName);
}
@ -263,30 +263,30 @@ public class RecalDataManager {
columnNames.add(nObservations);
columnNames.add(nErrors);
GATKReportTable reportTable = new GATKReportTable("RecalTable" + tableIndex++, "", columnNames.size());
for (Pair<String, String> columnName : columnNames)
final GATKReportTable reportTable = new GATKReportTable("RecalTable" + tableIndex++, "", columnNames.size());
for (final Pair<String, String> columnName : columnNames)
reportTable.addColumn(columnName.getFirst(), columnName.getSecond()); // every table must have the event type
int rowIndex = 0;
for (Map.Entry<BitSet, RecalDatum> recalTableEntry : recalTable.entrySet()) { // create a map with column name => key value for all covariate keys
BitSet bitSetKey = recalTableEntry.getKey();
Map<String, Object> columnData = new HashMap<String, Object>(columnNames.size());
Iterator<Pair<String, String>> iterator = columnNames.iterator();
for (Object key : keyManager.keySetFrom(bitSetKey)) {
String columnName = iterator.next().getFirst();
for (Map.Entry<Long, RecalDatum> recalTableEntry : recalTable.entrySet()) { // create a map with column name => key value for all covariate keys
final Long bitSetKey = recalTableEntry.getKey();
final Map<String, Object> columnData = new HashMap<String, Object>(columnNames.size());
final Iterator<Pair<String, String>> iterator = columnNames.iterator();
for (final Object key : keyManager.keySetFrom(bitSetKey)) {
final String columnName = iterator.next().getFirst();
columnData.put(columnName, key);
}
RecalDatum datum = recalTableEntry.getValue();
final RecalDatum datum = recalTableEntry.getValue();
columnData.put(iterator.next().getFirst(), datum.getEmpiricalQuality());
if (isReadGroupTable)
columnData.put(iterator.next().getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table
columnData.put(iterator.next().getFirst(), datum.numObservations);
columnData.put(iterator.next().getFirst(), datum.numMismatches);
for (Map.Entry<String, Object> dataEntry : columnData.entrySet()) {
String columnName = dataEntry.getKey();
Object value = dataEntry.getValue();
for (final Map.Entry<String, Object> dataEntry : columnData.entrySet()) {
final String columnName = dataEntry.getKey();
final Object value = dataEntry.getValue();
reportTable.set(rowIndex, columnName, value.toString());
}
rowIndex++;
@ -296,16 +296,16 @@ public class RecalDataManager {
return result;
}
public static void outputRecalibrationReport(RecalibrationArgumentCollection RAC, QuantizationInfo quantizationInfo, Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, PrintStream outputFile) {
public static void outputRecalibrationReport(RecalibrationArgumentCollection RAC, QuantizationInfo quantizationInfo, Map<BQSRKeyManager, Map<Long, RecalDatum>> keysAndTablesMap, PrintStream outputFile) {
outputRecalibrationReport(RAC.generateReportTable(), quantizationInfo.generateReportTable(), generateReportTables(keysAndTablesMap), outputFile);
}
public static void outputRecalibrationReport(GATKReportTable argumentTable, QuantizationInfo quantizationInfo, LinkedHashMap<BQSRKeyManager,Map<BitSet, RecalDatum>> keysAndTablesMap, PrintStream outputFile) {
public static void outputRecalibrationReport(GATKReportTable argumentTable, QuantizationInfo quantizationInfo, LinkedHashMap<BQSRKeyManager,Map<Long, RecalDatum>> keysAndTablesMap, PrintStream outputFile) {
outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(), generateReportTables(keysAndTablesMap), outputFile);
}
private static void outputRecalibrationReport(GATKReportTable argumentTable, GATKReportTable quantizationTable, List<GATKReportTable> recalTables, PrintStream outputFile) {
GATKReport report = new GATKReport();
final GATKReport report = new GATKReport();
report.addTable(argumentTable);
report.addTable(quantizationTable);
report.addTables(recalTables);
@ -328,7 +328,7 @@ public class RecalDataManager {
final File plotFileName = new File(csvFileName + ".pdf");
files.getFirst().close();
RScriptExecutor executor = new RScriptExecutor();
final RScriptExecutor executor = new RScriptExecutor();
executor.addScript(new Resource(SCRIPT_FILE, RecalDataManager.class));
executor.addArgs(csvFileName.getAbsolutePath());
executor.addArgs(plotFileName.getAbsolutePath());
@ -340,34 +340,34 @@ public class RecalDataManager {
}
public static void generateRecalibrationPlot(File filename, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> original, boolean keepIntermediates) {
Pair<PrintStream, File> files = initializeRecalibrationPlot(filename);
public static void generateRecalibrationPlot(File filename, LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> original, boolean keepIntermediates) {
final Pair<PrintStream, File> files = initializeRecalibrationPlot(filename);
writeCSV(files.getFirst(), original, "ORIGINAL", true);
outputRecalibrationPlot(files, keepIntermediates);
}
public static void generateRecalibrationPlot(File filename, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> original, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> recalibrated, boolean keepIntermediates) {
Pair<PrintStream, File> files = initializeRecalibrationPlot(filename);
public static void generateRecalibrationPlot(File filename, LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> original, LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> recalibrated, boolean keepIntermediates) {
final Pair<PrintStream, File> files = initializeRecalibrationPlot(filename);
writeCSV(files.getFirst(), recalibrated, "RECALIBRATED", true);
writeCSV(files.getFirst(), original, "ORIGINAL", false);
outputRecalibrationPlot(files, keepIntermediates);
}
private static void writeCSV(PrintStream deltaTableFile, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> map, String recalibrationMode, boolean printHeader) {
private static void writeCSV(PrintStream deltaTableFile, LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> map, String recalibrationMode, boolean printHeader) {
final int QUALITY_SCORE_COVARIATE_INDEX = 1;
final Map<BitSet, RecalDatum> deltaTable = new HashMap<BitSet, RecalDatum>();
final Map<Long, RecalDatum> deltaTable = new HashMap<Long, RecalDatum>();
BQSRKeyManager deltaKeyManager = null;
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> tableEntry : map.entrySet()) {
BQSRKeyManager keyManager = tableEntry.getKey();
for (Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>> tableEntry : map.entrySet()) {
final BQSRKeyManager keyManager = tableEntry.getKey();
if (keyManager.getOptionalCovariates().size() > 0) { // initialize with the 'all covariates' table
if (keyManager.getNumOptionalCovariates() > 0) { // initialize with the 'all covariates' table
// create a key manager for the delta table
final List<Covariate> requiredCovariates = Arrays.asList(keyManager.getRequiredCovariates().get(0)); // include the read group covariate as the only required covariate
List<Covariate> optionalCovariates = new ArrayList<Covariate>();
optionalCovariates.add(keyManager.getRequiredCovariates().get(1)); // include the quality score covariate as an optional covariate
optionalCovariates.addAll(keyManager.getOptionalCovariates()); // include all optional covariates
final List<Covariate> requiredCovariates = Arrays.asList(keyManager.getRequiredCovariates()[0]); // include the read group covariate as the only required covariate
final List<Covariate> optionalCovariates = new ArrayList<Covariate>();
optionalCovariates.add(keyManager.getRequiredCovariates()[1]); // include the quality score covariate as an optional covariate
optionalCovariates.addAll(Arrays.asList(keyManager.getOptionalCovariates())); // include all optional covariates
deltaKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initialize the key manager
}
}
@ -376,37 +376,37 @@ public class RecalDataManager {
throw new ReviewedStingException ("Couldn't find the covariates table");
boolean readyToPrint = false;
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> tableEntry : map.entrySet()) {
BQSRKeyManager keyManager = tableEntry.getKey();
for (Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>> tableEntry : map.entrySet()) {
final BQSRKeyManager keyManager = tableEntry.getKey();
if (keyManager.getRequiredCovariates().size() == 2 && keyManager.getOptionalCovariates().isEmpty()) { // look for the QualityScore table
Map<BitSet, RecalDatum> table = tableEntry.getValue();
if (keyManager.getNumRequiredCovariates() == 2 && keyManager.getNumOptionalCovariates() == 0) { // look for the QualityScore table
final Map<Long, RecalDatum> table = tableEntry.getValue();
// add the quality score table to the delta table
for (Map.Entry<BitSet, RecalDatum> entry : table.entrySet()) { // go through every element in the covariates table to create the delta table
RecalDatum recalDatum = entry.getValue(); // the current element (recal datum)
for (final Map.Entry<Long, RecalDatum> entry : table.entrySet()) { // go through every element in the covariates table to create the delta table
final RecalDatum recalDatum = entry.getValue(); // the current element (recal datum)
List<Object> covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key
List<Object> newCovs = new ArrayList<Object>(4);
final List<Object> covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key
final List<Object> newCovs = new ArrayList<Object>(4);
newCovs.add(0, covs.get(0)); // replace the covariate value with the quality score
newCovs.add(1, covs.get(1));
newCovs.add(2, "QualityScore"); // replace the covariate name with QualityScore (for the QualityScore covariate)
newCovs.add(3, covs.get(2));
BitSet deltaKey = deltaKeyManager.bitSetFromKey(newCovs.toArray()); // create a new bitset key for the delta table
final long deltaKey = deltaKeyManager.longFromKey(newCovs.toArray()); // create a new bitset key for the delta table
addToDeltaTable(deltaTable, deltaKey, recalDatum); // add this covariate to the delta table
}
}
else if (keyManager.getOptionalCovariates().size() > 0) { // look for the optional covariates table
Map<BitSet, RecalDatum> table = tableEntry.getValue();
else if (keyManager.getNumOptionalCovariates() > 0) { // look for the optional covariates table
final Map<Long, RecalDatum> table = tableEntry.getValue();
// add the optional covariates to the delta table
for (Map.Entry<BitSet, RecalDatum> entry : table.entrySet()) { // go through every element in the covariates table to create the delta table
RecalDatum recalDatum = entry.getValue(); // the current element (recal datum)
for (final Map.Entry<Long, RecalDatum> entry : table.entrySet()) { // go through every element in the covariates table to create the delta table
final RecalDatum recalDatum = entry.getValue(); // the current element (recal datum)
List<Object> covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key
final List<Object> covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key
covs.remove(QUALITY_SCORE_COVARIATE_INDEX); // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS)
BitSet deltaKey = deltaKeyManager.bitSetFromKey(covs.toArray()); // create a new bitset key for the delta table
final long deltaKey = deltaKeyManager.longFromKey(covs.toArray()); // create a new bitset key for the delta table
addToDeltaTable(deltaTable, deltaKey, recalDatum); // add this covariate to the delta table
}
readyToPrint = true;
@ -416,7 +416,7 @@ public class RecalDataManager {
if (readyToPrint) {
if (printHeader) {
List<String> header = new LinkedList<String>();
final List<String> header = new LinkedList<String>();
header.add("ReadGroup");
header.add("CovariateValue");
header.add("CovariateName");
@ -431,9 +431,9 @@ public class RecalDataManager {
}
// print each data line
for(Map.Entry<BitSet, RecalDatum> deltaEntry : deltaTable.entrySet()) {
List<Object> deltaKeys = deltaKeyManager.keySetFrom(deltaEntry.getKey());
RecalDatum deltaDatum = deltaEntry.getValue();
for (final Map.Entry<Long, RecalDatum> deltaEntry : deltaTable.entrySet()) {
final List<Object> deltaKeys = deltaKeyManager.keySetFrom(deltaEntry.getKey());
final RecalDatum deltaDatum = deltaEntry.getValue();
deltaTableFile.print(Utils.join(",", deltaKeys));
deltaTableFile.print("," + deltaDatum.stringForCSV());
deltaTableFile.println("," + recalibrationMode);
@ -453,8 +453,8 @@ public class RecalDataManager {
* @param deltaKey the key to the table
* @param recalDatum the recal datum to combine with the accuracyDatum element in the table
*/
private static void addToDeltaTable(Map<BitSet, RecalDatum> deltaTable, BitSet deltaKey, RecalDatum recalDatum) {
RecalDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key
private static void addToDeltaTable(Map<Long, RecalDatum> deltaTable, Long deltaKey, RecalDatum recalDatum) {
final RecalDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key
if (deltaDatum == null)
deltaTable.put(deltaKey, new RecalDatum(recalDatum)); // if we don't have a key yet, create a new one with the same values as the curent datum
else
@ -611,18 +611,32 @@ public class RecalDataManager {
* @param requestedCovariates The list of requested covariates.
* @return a matrix with all the covariates calculated for every base in the read
*/
public static ReadCovariates computeCovariates(final GATKSAMRecord read, final List<Covariate> requestedCovariates) {
final int numRequestedCovariates = requestedCovariates.size();
final int readLength = read.getReadLength();
final ReadCovariates readCovariates = new ReadCovariates(readLength, numRequestedCovariates);
// Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
for (Covariate covariate : requestedCovariates)
readCovariates.addCovariate(covariate.getValues(read));
public static ReadCovariates computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates) {
final ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), requestedCovariates.length);
computeCovariates(read, requestedCovariates, readCovariates);
return readCovariates;
}
/**
* Computes all requested covariates for every offset in the given read
* by calling covariate.getValues(..).
*
* It populates an array of covariate values where result[i][j] is the covariate
* value for the ith position in the read and the jth covariate in
* reqeustedCovariates list.
*
* @param read The read for which to compute covariate values.
* @param requestedCovariates The list of requested covariates.
* @param readCovariates The object to store the covariate values
*/
public static void computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates, final ReadCovariates readCovariates) {
// Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
for (int i = 0; i < requestedCovariates.length; i++) {
readCovariates.setCovariateIndex(i);
requestedCovariates[i].recordValues(read, readCovariates);
}
}
/**
* Perform a certain transversion (A <-> C or G <-> T) on the base.
*

View File

@ -114,16 +114,10 @@ public class RecalibrationArgumentCollection {
public int MISMATCHES_CONTEXT_SIZE = 2;
/**
* The context covariate will use a context of this size to calculate it's covariate value for base insertions
* The context covariate will use a context of this size to calculate it's covariate value for base insertions and deletions
*/
@Argument(fullName = "insertions_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions", required = false)
public int INSERTIONS_CONTEXT_SIZE = 8;
/**
* The context covariate will use a context of this size to calculate it's covariate value for base deletions
*/
@Argument(fullName = "deletions_context_size", shortName = "dcs", doc = "size of the k-mer context to be used for base deletions", required = false)
public int DELETIONS_CONTEXT_SIZE = 8;
@Argument(fullName = "indels_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions and deletions", required = false)
public int INDELS_CONTEXT_SIZE = 8;
/**
* A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
@ -188,10 +182,8 @@ public class RecalibrationArgumentCollection {
argumentsTable.set("solid_nocall_strategy", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY);
argumentsTable.addRowID("mismatches_context_size", true);
argumentsTable.set("mismatches_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE);
argumentsTable.addRowID("insertions_context_size", true);
argumentsTable.set("insertions_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_CONTEXT_SIZE);
argumentsTable.addRowID("deletions_context_size", true);
argumentsTable.set("deletions_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, DELETIONS_CONTEXT_SIZE);
argumentsTable.addRowID("indels_context_size", true);
argumentsTable.set("indels_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, INDELS_CONTEXT_SIZE);
argumentsTable.addRowID("mismatches_default_quality", true);
argumentsTable.set("mismatches_default_quality", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY);
argumentsTable.addRowID("insertions_default_quality", true);

View File

@ -18,8 +18,8 @@ import java.util.*;
*/
public class RecalibrationReport {
private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done)
private final LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap; // quick access reference to the read group table and its key manager
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // list of all covariates to be used in this calculation
private final LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> keysAndTablesMap; // quick access reference to the read group table and its key manager
private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation
private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes
private final RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter
@ -36,21 +36,25 @@ public class RecalibrationReport {
Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalDataManager.initializeCovariates(RAC); // initialize the required and optional covariates
ArrayList<Covariate> requiredCovariates = covariates.getFirst();
ArrayList<Covariate> optionalCovariates = covariates.getSecond();
requestedCovariates.addAll(requiredCovariates); // add all required covariates to the list of requested covariates
requestedCovariates.addAll(optionalCovariates); // add all optional covariates to the list of requested covariates
requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()];
int covariateIndex = 0;
for (final Covariate covariate : requiredCovariates)
requestedCovariates[covariateIndex++] = covariate;
for (final Covariate covariate : optionalCovariates)
requestedCovariates[covariateIndex++] = covariate;
for (Covariate cov : requestedCovariates)
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
keysAndTablesMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>();
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size()); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
keysAndTablesMap = new LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>>();
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size()); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
for (Covariate covariate : requiredCovariates) {
requiredCovariatesToAdd.add(covariate);
final Map<BitSet, RecalDatum> table; // initializing a new recal table for each required covariate (cumulatively)
final Map<Long, RecalDatum> table; // initializing a new recal table for each required covariate (cumulatively)
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
int nRequiredCovariates = requiredCovariatesToAdd.size(); // the number of required covariates defines which table we are looking at (RG, QUAL or ALL_COVARIATES)
final int nRequiredCovariates = requiredCovariatesToAdd.size(); // the number of required covariates defines which table we are looking at (RG, QUAL or ALL_COVARIATES)
final String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check.";
if (nRequiredCovariates == 1) { // if there is only one required covariate, this is the read group table
final GATKReportTable reportTable = report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE);
@ -69,15 +73,16 @@ public class RecalibrationReport {
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
final GATKReportTable reportTable = report.getTable(RecalDataManager.ALL_COVARIATES_REPORT_TABLE_TITLE);
final Map<BitSet, RecalDatum> table = parseAllCovariatesTable(keyManager, reportTable);
final Map<Long, RecalDatum> table = parseAllCovariatesTable(keyManager, reportTable);
keysAndTablesMap.put(keyManager, table);
}
protected RecalibrationReport(QuantizationInfo quantizationInfo, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, GATKReportTable argumentTable, RecalibrationArgumentCollection RAC) {
protected RecalibrationReport(final QuantizationInfo quantizationInfo, final LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> keysAndTablesMap, final GATKReportTable argumentTable, final RecalibrationArgumentCollection RAC) {
this.quantizationInfo = quantizationInfo;
this.keysAndTablesMap = keysAndTablesMap;
this.argumentTable = argumentTable;
this.RAC = RAC;
this.requestedCovariates = null;
}
/**
@ -94,25 +99,25 @@ public class RecalibrationReport {
* @param other the recalibration report to combine with this one
*/
public void combine(RecalibrationReport other) {
Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> thisIterator = keysAndTablesMap.entrySet().iterator();
Iterator<Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>>> thisIterator = keysAndTablesMap.entrySet().iterator();
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> otherEntry : other.getKeysAndTablesMap().entrySet()) {
Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> thisEntry = thisIterator.next();
for (Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>> otherEntry : other.getKeysAndTablesMap().entrySet()) {
Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>> thisEntry = thisIterator.next();
Map<BitSet, RecalDatum> thisTable = thisEntry.getValue();
BQSRKeyManager thisKeyManager = thisEntry.getKey();
BQSRKeyManager otherKeyManager = otherEntry.getKey();
final Map<Long, RecalDatum> thisTable = thisEntry.getValue();
final BQSRKeyManager thisKeyManager = thisEntry.getKey();
final BQSRKeyManager otherKeyManager = otherEntry.getKey();
for (Map.Entry<BitSet, RecalDatum> otherTableEntry : otherEntry.getValue().entrySet()) {
RecalDatum otherDatum = otherTableEntry.getValue();
BitSet otherBitKey = otherTableEntry.getKey();
List<Object> otherObjectKey = otherKeyManager.keySetFrom(otherBitKey);
for (Map.Entry<Long, RecalDatum> otherTableEntry : otherEntry.getValue().entrySet()) {
final RecalDatum otherDatum = otherTableEntry.getValue();
final Long otherBitKey = otherTableEntry.getKey();
final List<Object> otherObjectKey = otherKeyManager.keySetFrom(otherBitKey);
BitSet thisBitKey = thisKeyManager.bitSetFromKey(otherObjectKey.toArray());
RecalDatum thisDatum = thisTable.get(thisBitKey);
final long thisKey = thisKeyManager.longFromKey(otherObjectKey.toArray());
final RecalDatum thisDatum = thisTable.get(thisKey);
if (thisDatum == null)
thisTable.put(thisBitKey, otherDatum);
thisTable.put(thisKey, otherDatum);
else
thisDatum.combine(otherDatum);
}
@ -123,11 +128,11 @@ public class RecalibrationReport {
return quantizationInfo;
}
public LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> getKeysAndTablesMap() {
public LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> getKeysAndTablesMap() {
return keysAndTablesMap;
}
public ArrayList<Covariate> getRequestedCovariates() {
public Covariate[] getRequestedCovariates() {
return requestedCovariates;
}
@ -138,7 +143,7 @@ public class RecalibrationReport {
* @param reportTable the GATKReport table containing data for this table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/
private Map<BitSet, RecalDatum> parseAllCovariatesTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
private Map<Long, RecalDatum> parseAllCovariatesTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(5);
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
@ -155,7 +160,7 @@ public class RecalibrationReport {
* @param reportTable the GATKReport table containing data for this table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/
private Map<BitSet, RecalDatum> parseQualityScoreTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
private Map<Long, RecalDatum> parseQualityScoreTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(3);
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
@ -170,7 +175,7 @@ public class RecalibrationReport {
* @param reportTable the GATKReport table containing data for this table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/
private Map<BitSet, RecalDatum> parseReadGroupTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
private Map<Long, RecalDatum> parseReadGroupTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(2);
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
@ -185,26 +190,26 @@ public class RecalibrationReport {
* @param columnNamesOrderedList a list of columns to read from the report table and build as key for this particular table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/
private Map<BitSet, RecalDatum> genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList<String> columnNamesOrderedList, boolean hasEstimatedQReportedColumn) {
Map<BitSet, RecalDatum> result = new HashMap<BitSet, RecalDatum>(reportTable.getNumRows()*2);
private Map<Long, RecalDatum> genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList<String> columnNamesOrderedList, boolean hasEstimatedQReportedColumn) {
final Map<Long, RecalDatum> result = new HashMap<Long, RecalDatum>(reportTable.getNumRows()*2);
for ( int i = 0; i < reportTable.getNumRows(); i++ ) {
int nKeys = columnNamesOrderedList.size();
Object [] keySet = new Object[nKeys];
final int nKeys = columnNamesOrderedList.size();
final Object [] keySet = new Object[nKeys];
for (int j = 0; j < nKeys; j++)
keySet[j] = reportTable.get(i, columnNamesOrderedList.get(j)); // all these objects are okay in String format, the key manager will handle them correctly (except for the event type (see below)
keySet[j] = reportTable.get(i, columnNamesOrderedList.get(j)); // all these objects are okay in String format, the key manager will handle them correctly (except for the event type (see below)
keySet[keySet.length-1] = EventType.eventFrom((String) keySet[keySet.length-1]); // the last key is always the event type. We convert the string ("M", "I" or "D") to an enum object (necessary for the key manager).
BitSet bitKey = keyManager.bitSetFromKey(keySet);
final long bitKey = keyManager.longFromKey(keySet);
long nObservations = (Long) reportTable.get(i, RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME);
long nErrors = (Long) reportTable.get(i, RecalDataManager.NUMBER_ERRORS_COLUMN_NAME);
double empiricalQuality = (Double) reportTable.get(i, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME);
final long nObservations = (Long) reportTable.get(i, RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME);
final long nErrors = (Long) reportTable.get(i, RecalDataManager.NUMBER_ERRORS_COLUMN_NAME);
final double empiricalQuality = (Double) reportTable.get(i, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME);
double estimatedQReported = hasEstimatedQReportedColumn ? // the estimatedQreported column only exists in the ReadGroup table
(Double) reportTable.get(i, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table
Byte.parseByte((String) reportTable.get(i, RecalDataManager.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table
final double estimatedQReported = hasEstimatedQReportedColumn ? // the estimatedQreported column only exists in the ReadGroup table
(Double) reportTable.get(i, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table
Byte.parseByte((String) reportTable.get(i, RecalDataManager.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table
RecalDatum recalDatum = new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality);
final RecalDatum recalDatum = new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality);
result.put(bitKey, recalDatum);
}
return result;
@ -217,14 +222,14 @@ public class RecalibrationReport {
* @return an ArrayList with the quantization mappings from 0 to MAX_QUAL_SCORE
*/
private QuantizationInfo initializeQuantizationTable(GATKReportTable table) {
Byte[] quals = new Byte[QualityUtils.MAX_QUAL_SCORE + 1];
Long[] counts = new Long[QualityUtils.MAX_QUAL_SCORE + 1];
final Byte[] quals = new Byte[QualityUtils.MAX_QUAL_SCORE + 1];
final Long[] counts = new Long[QualityUtils.MAX_QUAL_SCORE + 1];
for ( int i = 0; i < table.getNumRows(); i++ ) {
byte originalQual = (byte)i;
Object quantizedObject = table.get(i, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME);
Object countObject = table.get(i, RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME);
byte quantizedQual = Byte.parseByte(quantizedObject.toString());
long quantizedCount = Long.parseLong(countObject.toString());
final byte originalQual = (byte)i;
final Object quantizedObject = table.get(i, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME);
final Object countObject = table.get(i, RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME);
final byte quantizedQual = Byte.parseByte(quantizedObject.toString());
final long quantizedCount = Long.parseLong(countObject.toString());
quals[originalQual] = quantizedQual;
counts[originalQual] = quantizedCount;
}
@ -238,7 +243,7 @@ public class RecalibrationReport {
* @return a RAC object properly initialized with all the objects in the table
*/
private RecalibrationArgumentCollection initializeArgumentCollectionTable(GATKReportTable table) {
RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
for ( int i = 0; i < table.getNumRows(); i++ ) {
final String argument = table.get(i, "Argument").toString();
@ -261,11 +266,8 @@ public class RecalibrationReport {
else if (argument.equals("mismatches_context_size"))
RAC.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value);
else if (argument.equals("insertions_context_size"))
RAC.INSERTIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
else if (argument.equals("deletions_context_size"))
RAC.DELETIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
else if (argument.equals("indels_context_size"))
RAC.INDELS_CONTEXT_SIZE = Integer.parseInt((String) value);
else if (argument.equals("mismatches_default_quality"))
RAC.MISMATCHES_DEFAULT_QUALITY = Byte.parseByte((String) value);
@ -306,7 +308,7 @@ public class RecalibrationReport {
* and quantization of the quality scores during every call of combine(). Very useful for the BQSRGatherer.
*/
public void calculateEmpiricalAndQuantizedQualities() {
for (Map<BitSet, RecalDatum> table : keysAndTablesMap.values())
for (Map<Long, RecalDatum> table : keysAndTablesMap.values())
for (RecalDatum datum : table.values())
datum.calcCombinedEmpiricalQuality();
@ -331,26 +333,26 @@ public class RecalibrationReport {
return isEqualTable(this.keysAndTablesMap, other.keysAndTablesMap);
}
private boolean isEqualTable(LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> t1, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> t2) {
private boolean isEqualTable(LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> t1, LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> t2) {
if (t1.size() != t2.size())
return false;
Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> t1Iterator = t1.entrySet().iterator();
Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> t2Iterator = t2.entrySet().iterator();
final Iterator<Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>>> t1Iterator = t1.entrySet().iterator();
final Iterator<Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>>> t2Iterator = t2.entrySet().iterator();
while (t1Iterator.hasNext() && t2Iterator.hasNext()) {
Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> t1MapEntry = t1Iterator.next();
Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> t2MapEntry = t2Iterator.next();
Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>> t1MapEntry = t1Iterator.next();
Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>> t2MapEntry = t2Iterator.next();
if (!(t1MapEntry.getKey().equals(t2MapEntry.getKey())))
return false;
Map<BitSet, RecalDatum> table2 = t2MapEntry.getValue();
for (Map.Entry<BitSet, RecalDatum> t1TableEntry : t1MapEntry.getValue().entrySet()) {
BitSet t1Key = t1TableEntry.getKey();
final Map<Long, RecalDatum> table2 = t2MapEntry.getValue();
for (Map.Entry<Long, RecalDatum> t1TableEntry : t1MapEntry.getValue().entrySet()) {
final Long t1Key = t1TableEntry.getKey();
if (!table2.containsKey(t1Key))
return false;
RecalDatum t1Datum = t1TableEntry.getValue();
final RecalDatum t1Datum = t1TableEntry.getValue();
if (!t1Datum.equals(table2.get(t1Key)))
return false;
}

View File

@ -33,12 +33,10 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.broadinstitute.sting.utils.variantcontext.*;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import java.util.*;
@ -147,7 +145,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
intervalListIterator = new PeekableIterator<GenomeLoc>(getToolkit().getIntervals().iterator());
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header
vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); // initialize the VCF header
vcfWriter.writeHeader(new VCFHeader(ThresHolder.getHeaderInfo(), samples)); // initialize the VCF header
}
@Override
@ -249,6 +247,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
private void outputStatsToVCF(IntervalStatistics stats, Allele refAllele) {
GenomeLoc interval = stats.getInterval();
List<Allele> alleles = new ArrayList<Allele>();
Map<String, Object> attributes = new HashMap<String, Object>();
ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
@ -258,73 +257,46 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStart(), alleles);
vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF
vcb.filters(statusesToStrings(stats.callableStatuses(thresholds)));
vcb.filters(new HashSet<String>(statusesToStrings(stats.callableStatuses(thresholds))));
attributes.put(VCFConstants.END_KEY, interval.getStop());
attributes.put(VCFConstants.DEPTH_KEY, stats.averageCoverage());
vcb = vcb.attributes(attributes);
for (String sample : samples) {
Map<String, Object> infos = new HashMap<String, Object>();
SampleStatistics sampleStat = stats.getSample(sample);
infos.put(VCFConstants.DEPTH_KEY, sampleStat.averageCoverage());
infos.put("Q1", sampleStat.getQuantileDepth(0.25));
infos.put("MED", sampleStat.getQuantileDepth(0.50));
infos.put("Q3", sampleStat.getQuantileDepth(0.75));
Set<String> filters = new HashSet<String>();
filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses(thresholds)));
genotypes.add(new Genotype(sample, null, VariantContext.NO_LOG10_PERROR, filters, infos, false));
}
vcb = vcb.genotypes(genotypes);
if (debug) {
System.out.printf("Output -- Interval: %s, Coverage: %.2f%n", stats.getInterval(), stats.averageCoverage());
}
for (String sample : samples) {
final GenotypeBuilder gb = new GenotypeBuilder(sample);
SampleStatistics sampleStat = stats.getSample(sample);
gb.DP((int)sampleStat.averageCoverage());
gb.attribute("Q1", sampleStat.getQuantileDepth(0.25));
gb.attribute("MED", sampleStat.getQuantileDepth(0.50));
gb.attribute("Q3", sampleStat.getQuantileDepth(0.75));
if (debug) {
System.out.printf("Found %d bad mates out of %d reads %n", sampleStat.getnBadMates(), sampleStat.getnReads());
}
gb.filters(statusesToStrings(stats.getSample(sample).getCallableStatuses(thresholds)));
genotypes.add(gb.make());
}
vcb = vcb.genotypes(genotypes);
vcfWriter.add(vcb.make());
}
/**
* Gets the header lines for the VCF writer
*
* @return A set of VCF header lines
*/
private static Set<VCFHeaderLine> getHeaderInfo() {
Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>();
// INFO fields for overall data
headerLines.add(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval"));
headerLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size."));
headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode"));
// FORMAT fields for each genotype
// todo -- find the appropriate VCF constants
headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size."));
headerLines.add(new VCFFormatHeaderLine("Q1", 1, VCFHeaderLineType.Float, "Lower Quartile of depth distribution."));
headerLines.add(new VCFFormatHeaderLine("MED", 1, VCFHeaderLineType.Float, "Median of depth distribution."));
headerLines.add(new VCFFormatHeaderLine("Q3", 1, VCFHeaderLineType.Float, "Upper Quartile of depth Distribution."));
// FILTER fields
for (CallableStatus stat : CallableStatus.values())
headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description));
return headerLines;
}
/**
* Function that process a set of statuses into strings
*
* @param statuses the set of statuses to be converted
* @return a matching set of strings
*/
private Set<String> statusesToStrings(Set<CallableStatus> statuses) {
Set<String> output = new HashSet<String>(statuses.size());
private List<String> statusesToStrings(Set<CallableStatus> statuses) {
List<String> output = new ArrayList<String>(statuses.size());
for (CallableStatus status : statuses)
output.add(status.name());
@ -333,6 +305,6 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
}
private IntervalStatistics createIntervalStatistic(GenomeLoc interval) {
return new IntervalStatistics(samples, interval /*, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality*/);
return new IntervalStatistics(samples, interval);
}
}

View File

@ -0,0 +1,84 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.PartitionBy;
import org.broadinstitute.sting.gatk.walkers.PartitionType;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import java.io.PrintStream;
@PartitionBy(PartitionType.CONTIG)
@ActiveRegionExtension(extension = 0, maxRegion = 50000)
public class FindCoveredIntervals extends ActiveRegionWalker<GenomeLoc, Long> {
    @Output(required = true)
    private PrintStream out;

    /** Minimum filtered depth at a locus for it to count as fully covered (probability 1.0). */
    private static final int COVERAGE_THRESHOLD = 20;

    /**
     * Scores a locus by its filtered coverage on a linear probability scale.
     *
     * @param tracker the meta-data tracker (unused)
     * @param ref     the reference context (unused)
     * @param context the alignment context whose base pileup is evaluated
     * @return depth / COVERAGE_THRESHOLD, capped at 1.0
     */
    @Override
    // Look to see if the region has sufficient coverage
    public double isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
        final int depth = ThresHolder.DEFAULTS.getFilteredCoverage(context.getBasePileup());

        // note the linear probability scale, saturating at 1.0 once the threshold is met
        return Math.min((double) depth / COVERAGE_THRESHOLD, 1.0);
    }

    /**
     * Emits the location of an active (sufficiently covered) region.
     *
     * @param activeRegion the region under consideration
     * @param tracker      the meta-data tracker (unused)
     * @return the region's location if it is active, otherwise null
     */
    @Override
    public GenomeLoc map(final ActiveRegion activeRegion, final RefMetaDataTracker tracker) {
        return activeRegion.isActive ? activeRegion.getLocation() : null;
    }

    /** @return the initial interval count, zero */
    @Override
    public Long reduceInit() {
        return 0L;
    }

    /**
     * Prints each covered interval and accumulates the running count.
     *
     * @param value  the interval emitted by map(), or null for inactive regions
     * @param reduce the count of intervals seen so far
     * @return the updated count
     */
    @Override
    public Long reduce(final GenomeLoc value, final Long reduce) {
        if (value == null)
            return reduce;
        out.println(value.toString());
        // BUGFIX: was "return reduce++;" — the postfix increment on a local boxed Long
        // returns the ORIGINAL value, so the count never advanced past zero.
        return reduce + 1;
    }

    /**
     * Logs the total number of covered intervals found.
     *
     * @param reduce the final interval count
     */
    @Override
    public void onTraversalDone(final Long reduce) {
        logger.info(String.format("Found %d intervals", reduce));
    }
}

View File

@ -79,14 +79,12 @@ class SampleStatistics {
* @return the callable statuses of the entire sample
*/
public Set<CallableStatus> getCallableStatuses(ThresHolder thresholds) {
Set<CallableStatus> output = new HashSet<CallableStatus>();
// We check if reads are present ot prevent div / 0 exceptions
if (nReads == 0) {
output.add(CallableStatus.NO_READS);
return output;
return Collections.singleton(CallableStatus.NO_READS);
}
Set<CallableStatus> output = new HashSet<CallableStatus>();
Map<CallableStatus, Double> totals = new HashMap<CallableStatus, Double>(CallableStatus.values().length);
// initialize map
@ -104,19 +102,19 @@ class SampleStatistics {
double intervalSize = interval.size();
if ((nBadMates / nReads) > thresholds.getBadMateStatusThreshold())
if (((double) nBadMates / nReads) >= thresholds.getBadMateStatusThreshold())
output.add(CallableStatus.BAD_MATE);
if ((totals.get(CallableStatus.COVERAGE_GAPS) / intervalSize) > thresholds.getCoverageStatusThreshold())
if ((totals.get(CallableStatus.COVERAGE_GAPS) / intervalSize) >= thresholds.getCoverageStatusThreshold())
output.add(CallableStatus.COVERAGE_GAPS);
if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) > thresholds.getCoverageStatusThreshold())
if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) >= thresholds.getCoverageStatusThreshold())
output.add(CallableStatus.LOW_COVERAGE);
if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) > thresholds.getExcessiveCoverageThreshold())
if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) >= thresholds.getExcessiveCoverageThreshold())
output.add(CallableStatus.EXCESSIVE_COVERAGE);
if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) > thresholds.getQualityStatusThreshold())
if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) >= thresholds.getQualityStatusThreshold())
output.add(CallableStatus.POOR_QUALITY);
if (totals.get(CallableStatus.REF_N) > 0)
@ -126,6 +124,7 @@ class SampleStatistics {
if (output.isEmpty()) {
output.add(CallableStatus.PASS);
}
return output;
}
@ -146,7 +145,7 @@ class SampleStatistics {
int locusIndex = locus.getStart() - interval.getStart();
int rawCoverage = pileup.depthOfCoverage();
int coverage = pileup.getBaseAndMappingFilteredPileup(thresholds.getMinimumBaseQuality(), thresholds.getMinimumMappingQuality()).depthOfCoverage();
int coverage = thresholds.getFilteredCoverage(pileup);
LocusStatistics locusData = new LocusStatistics(coverage, rawCoverage);
@ -161,7 +160,7 @@ class SampleStatistics {
// Was this read already processed?
if (read.getTemporaryAttribute("checkedBadMate") == null) {
nReads++;
if (hasValidMate(read, thresholds))
if (!hasValidMate(read, thresholds))
nBadMates++;
read.setTemporaryAttribute("checkedBadMate", true);
}
@ -254,7 +253,7 @@ class SampleStatistics {
* reasonable insert size?
* inverted?
* same orientation?
* todo - same contig?
* same contig?
* is pair mapped?
* todo - is forced mate?
*
@ -264,6 +263,10 @@ class SampleStatistics {
if (!read.getReadPairedFlag())
return false;
// different contigs
if (read.getMateReferenceIndex() != read.getReferenceIndex())
return false;
// unmapped
if (read.getMateUnmappedFlag() || read.getReadUnmappedFlag())
return false;
@ -277,10 +280,19 @@ class SampleStatistics {
read.getAlignmentStart() < read.getMateAlignmentStart())
return false;
// TODO note: IGV uses a different alorithm for insert size, there should be a common util class that does this for you
// mates are too far apart
if (Math.abs(read.getAlignmentStart() - read.getMateAlignmentStart()) > thresholds.getMaximumInsertSize())
return false;
return true;
}
/** @return the number of reads processed for this sample (each read counted once) */
public int getnReads() {
return nReads;
}
/** @return the number of reads whose mate failed the validity checks (see hasValidMate) */
public int getnBadMates() {
return nBadMates;
}
}

View File

@ -24,6 +24,12 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.HashSet;
import java.util.Set;
class ThresHolder {
public static final ThresHolder DEFAULTS = new ThresHolder(20, 20, 5, 700, 20, 50, 0.5, 0.2, 0.5, 0.2, 0.2, 0.5);
@ -69,14 +75,6 @@ class ThresHolder {
this.qualityStatusThreshold = qualityStatusThreshold;
}
public int getMinimumBaseQuality() {
return minimumBaseQuality;
}
public int getMinimumMappingQuality() {
return minimumMappingQuality;
}
public int getMinimumCoverage() {
return minimumCoverage;
}
@ -116,4 +114,37 @@ class ThresHolder {
public double getQualityStatusThreshold() {
return qualityStatusThreshold;
}
/**
 * Computes the depth of coverage of a pileup after applying this holder's
 * minimum base-quality and minimum mapping-quality filters.
 *
 * @param pileup the read-backed pileup to filter
 * @return the depth of coverage of the filtered pileup
 */
public int getFilteredCoverage(ReadBackedPileup pileup) {
    final ReadBackedPileup filteredPileup = pileup.getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality);
    return filteredPileup.depthOfCoverage();
}
/**
 * Assembles the VCF header lines (INFO, FORMAT and FILTER fields) used by the
 * DiagnoseTargets walker's output writer.
 *
 * @return A set of VCF header lines
 */
public static Set<VCFHeaderLine> getHeaderInfo() {
    final Set<VCFHeaderLine> result = new HashSet<VCFHeaderLine>();

    // INFO fields describing the interval as a whole
    result.add(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval"));
    result.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size."));
    result.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode"));

    // FORMAT fields reported per genotype
    // todo -- find the appropriate VCF constants
    result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size."));
    result.add(new VCFFormatHeaderLine("Q1", 1, VCFHeaderLineType.Float, "Lower Quartile of depth distribution."));
    result.add(new VCFFormatHeaderLine("MED", 1, VCFHeaderLineType.Float, "Median of depth distribution."));
    result.add(new VCFFormatHeaderLine("Q3", 1, VCFHeaderLineType.Float, "Upper Quartile of depth Distribution."));

    // one FILTER line per possible callable status
    for (final CallableStatus stat : CallableStatus.values())
        result.add(new VCFFilterHeaderLine(stat.name(), stat.description));

    return result;
}
}

View File

@ -55,8 +55,6 @@ public class BAMDiffableReader implements DiffableReader {
int count = 0;
while ( iterator.hasNext() ) {
if ( count++ > maxElementsToRead && maxElementsToRead != -1)
break;
final SAMRecord record = iterator.next();
// name is the read name + first of pair
@ -88,6 +86,9 @@ public class BAMDiffableReader implements DiffableReader {
if ( ! root.hasElement(name) )
// protect ourselves from malformed files
root.add(readRoot);
count += readRoot.size();
if ( count > maxElementsToRead && maxElementsToRead != -1)
break;
}
reader.close();

View File

@ -147,7 +147,7 @@ public class DiffEngine {
* @param diffs the list of differences to summarize
*/
public void reportSummarizedDifferences(List<Difference> diffs, SummaryReportParams params ) {
printSummaryReport(summarizedDifferencesOfPaths(diffs, params.maxRawDiffsToSummarize), params );
printSummaryReport(summarizedDifferencesOfPaths(diffs, params.doPairwise, params.maxRawDiffsToSummarize), params );
}
final protected static String[] diffNameToPath(String diffName) {
@ -161,9 +161,17 @@ public class DiffEngine {
diffs.add(new Difference(diff));
}
return summarizedDifferencesOfPaths(diffs, -1);
return summarizedDifferencesOfPaths(diffs, true, -1);
}
/**
* Computes a minimum set of potential differences between all singleton differences
* in singletonDiffs. Employs an expensive pairwise O(n^2) algorithm.
*
* @param singletonDiffs
* @param maxRawDiffsToSummarize
* @return
*/
private Map<String, Difference> initialPairwiseSummaries(final List<? extends Difference> singletonDiffs,
final int maxRawDiffsToSummarize) {
Map<String, Difference> summaries = new HashMap<String, Difference>();
@ -191,9 +199,41 @@ public class DiffEngine {
return summaries;
}
/**
* Computes the possible leaf differences among the singleton diffs.
*
* The leaf differences are all of the form *.*...*.X where all internal
* differences are wildcards and the only summarized difference considered
* interesting to compute is
*
* @param singletonDiffs
* @param maxRawDiffsToSummarize
* @return
*/
private Map<String, Difference> initialLeafSummaries(final List<? extends Difference> singletonDiffs,
final int maxRawDiffsToSummarize) {
Map<String, Difference> summaries = new HashMap<String, Difference>();
// create the initial set of differences
for ( final Difference d : singletonDiffs ) {
final String path = summarizedPath(d.getParts(), 1);
Difference sumDiff = new Difference(path, d.getMaster(), d.getTest());
sumDiff.setCount(0);
addSummaryIfMissing(summaries, sumDiff);
if ( maxRawDiffsToSummarize != -1 && summaries.size() > maxRawDiffsToSummarize)
return summaries;
}
return summaries;
}
protected List<Difference> summarizedDifferencesOfPaths(final List<? extends Difference> singletonDiffs,
final boolean doPairwise,
final int maxRawDiffsToSummarize) {
Map<String, Difference> summaries = initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize);
final Map<String, Difference> summaries = doPairwise
? initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize)
: initialLeafSummaries(singletonDiffs, maxRawDiffsToSummarize);
// count differences
for ( Difference diffPath : singletonDiffs ) {
@ -372,18 +412,21 @@ public class DiffEngine {
final int maxCountOneItems;
final int minSumDiffToShow;
final int maxRawDiffsToSummarize;
final boolean doPairwise;
boolean descending = true;
public SummaryReportParams(PrintStream out,
int maxItemsToDisplay,
int maxCountOneItems,
int minSumDiffToShow,
int maxRawDiffsToSummarize) {
int maxRawDiffsToSummarize,
final boolean doPairwise) {
this.out = out;
this.maxItemsToDisplay = maxItemsToDisplay;
this.maxCountOneItems = maxCountOneItems;
this.minSumDiffToShow = minSumDiffToShow;
this.maxRawDiffsToSummarize = maxRawDiffsToSummarize;
this.doPairwise = doPairwise;
}
public void setDescending(boolean descending) {

View File

@ -111,21 +111,21 @@ import java.util.List;
* <p>
*
* <pre>
[testng] path count
[testng] *.*.*.AC 6
[testng] *.*.*.AF 6
[testng] *.*.*.AN 6
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000598.AC 1
</pre>
[testng] path count
[testng] *.*.*.AC 6
[testng] *.*.*.AF 6
[testng] *.*.*.AN 6
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000598.AC 1
</pre>
*
* @author Mark DePristo
* @since 7/4/11
@ -165,6 +165,8 @@ public class DiffObjectsWalker extends RodWalker<Integer, Integer> {
@Argument(fullName="maxRawDiffsToSummary", shortName="maxRawDiffsToSummary", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false)
int maxRawDiffsToSummary = -1;
@Argument(fullName="doPairwise", shortName="doPairwise", doc="If provided, we will compute the minimum pairwise differences to summary, which can be extremely expensive", required=false)
boolean doPairwise = false;
/**
* The max number of differences to display when summarizing. For example, if there are 10M differences, but
@ -199,11 +201,14 @@ public class DiffObjectsWalker extends RodWalker<Integer, Integer> {
@Argument(fullName="showItemizedDifferences", shortName="SID", doc="Should we enumerate all differences between the files?", required=false)
boolean showItemizedDifferences = false;
@Argument(fullName="iterations", doc="Number of iterations to perform, should be 1 unless you are doing memory testing", required=false)
int iterations = 1;
DiffEngine diffEngine;
@Override
public void initialize() {
this.diffEngine = new DiffEngine();
this.diffEngine = new DiffEngine();
}
@Override
@ -223,29 +228,39 @@ public class DiffObjectsWalker extends RodWalker<Integer, Integer> {
@Override
public void onTraversalDone(Integer sum) {
//out.printf("Reading master file %s%n", masterFile);
DiffElement master = diffEngine.createDiffableFromFile(masterFile, MAX_OBJECTS_TO_READ);
logger.info(String.format("Read %d objects", master.size()));
//out.printf("Reading test file %s%n", testFile);
DiffElement test = diffEngine.createDiffableFromFile(testFile, MAX_OBJECTS_TO_READ);
logger.info(String.format("Read %d objects", test.size()));
if ( iterations > 1 ) {
for ( int i = 0; i < iterations; i++ ) {
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, 20, 10, 0, -1, false);
boolean success = DiffEngine.simpleDiffFiles(masterFile, testFile, MAX_OBJECTS_TO_READ, params);
logger.info("Iteration " + i + " success " + success);
}
} else {
//out.printf("Reading master file %s%n", masterFile);
DiffElement master = diffEngine.createDiffableFromFile(masterFile, MAX_OBJECTS_TO_READ);
logger.info(String.format("Read %d objects", master.size()));
//out.printf("Reading test file %s%n", testFile);
DiffElement test = diffEngine.createDiffableFromFile(testFile, MAX_OBJECTS_TO_READ);
logger.info(String.format("Read %d objects", test.size()));
// out.printf("Master diff objects%n");
// out.println(master.toString());
// out.printf("Test diff objects%n");
// out.println(test.toString());
List<Difference> diffs = diffEngine.diff(master, test);
logger.info(String.format("Done computing diff with %d differences found", diffs.size()));
if ( showItemizedDifferences ) {
out.printf("Itemized results%n");
for ( Difference diff : diffs )
out.printf("DIFF: %s%n", diff.toString());
}
List<Difference> diffs = diffEngine.diff(master, test);
logger.info(String.format("Done computing diff with %d differences found", diffs.size()));
if ( showItemizedDifferences ) {
out.printf("Itemized results%n");
for ( Difference diff : diffs )
out.printf("DIFF: %s%n", diff.toString());
}
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff, maxRawDiffsToSummary);
params.setDescending(false);
diffEngine.reportSummarizedDifferences(diffs, params);
logger.info(String.format("Done summarizing differences"));
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out,
MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff,
maxRawDiffsToSummary, doPairwise);
params.setDescending(false);
diffEngine.reportSummarizedDifferences(diffs, params);
logger.info(String.format("Done summarizing differences"));
}
}
}
}

View File

@ -29,11 +29,13 @@ import org.broad.tribble.AbstractFeatureReader;
import org.broad.tribble.FeatureReader;
import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.LineReader;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.*;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
@ -79,9 +81,6 @@ public class VCFDiffableReader implements DiffableReader {
String prevName = "";
Iterator<VariantContext> it = reader.iterator();
while ( it.hasNext() ) {
if ( count++ > maxElementsToRead && maxElementsToRead != -1)
break;
VariantContext vc = it.next();
String name = vc.getChr() + ":" + vc.getStart();
if ( name.equals(prevName) ) {
@ -109,9 +108,12 @@ public class VCFDiffableReader implements DiffableReader {
for (Genotype g : vc.getGenotypes() ) {
DiffNode gRoot = DiffNode.empty(g.getSampleName(), vcRoot);
gRoot.add("GT", g.getGenotypeString());
gRoot.add("GQ", g.hasLog10PError() ? g.getLog10PError() * -10 : VCFConstants.MISSING_VALUE_v4 );
if ( g.hasGQ() ) gRoot.add("GQ", g.getGQ() );
if ( g.hasDP() ) gRoot.add("DP", g.getDP() );
if ( g.hasAD() ) gRoot.add("AD", Utils.join(",", g.getAD()));
if ( g.hasPL() ) gRoot.add("PL", Utils.join(",", g.getPL()));
for (Map.Entry<String, Object> attribute : g.getAttributes().entrySet()) {
for (Map.Entry<String, Object> attribute : g.getExtendedAttributes().entrySet()) {
if ( ! attribute.getKey().startsWith("_") )
gRoot.add(attribute.getKey(), attribute.getValue());
}
@ -120,6 +122,9 @@ public class VCFDiffableReader implements DiffableReader {
}
root.add(vcRoot);
count += vcRoot.size();
if ( count > maxElementsToRead && maxElementsToRead != -1)
break;
}
reader.close();

View File

@ -297,13 +297,14 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
// for each genotype, check filters then create a new object
for ( final Genotype g : vc.getGenotypes() ) {
if ( g.isCalled() ) {
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
List<String> filters = new ArrayList<String>(g.getFilters());
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) {
if ( VariantContextUtils.match(vc, g, exp) )
filters.add(exp.name);
}
genotypes.add(new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), filters, g.getAttributes(), g.isPhased()));
genotypes.add(new GenotypeBuilder(g).filters(filters).make());
} else {
genotypes.add(g);
}

View File

@ -98,11 +98,9 @@ public class ConsensusAlleleCounter {
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
final AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
if ( context.hasBasePileup() ) {
final ReadBackedPileup indelPileup = context.getBasePileup();
insCount += indelPileup.getNumberOfInsertionsAfterThisElement();
delCount += indelPileup.getNumberOfDeletionsAfterThisElement();
}
final ReadBackedPileup indelPileup = context.getBasePileup();
insCount += indelPileup.getNumberOfInsertionsAfterThisElement();
delCount += indelPileup.getNumberOfDeletionsAfterThisElement();
}
if ( insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping )
@ -112,9 +110,6 @@ public class ConsensusAlleleCounter {
// todo -- warning, can be duplicating expensive partition here
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
if ( !context.hasBasePileup() )
continue;
final ReadBackedPileup indelPileup = context.getBasePileup();
final int nIndelReads = indelPileup.getNumberOfInsertionsAfterThisElement() + indelPileup.getNumberOfDeletionsAfterThisElement();

View File

@ -89,7 +89,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
* @param ref reference context
* @param contexts stratified alignment contexts
* @param contextType stratified context type
* @param alternateAllelesToUse the alternate allele to use, null if not set
* @param allAllelesToUse the alternate allele to use, null if not set
* @param useBAQedPileup should we use the BAQed pileup or the raw one?
* @param locParser Genome Loc Parser
* @return variant context where genotypes are no-called but with GLs
@ -98,7 +98,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final List<Allele> alternateAllelesToUse,
final List<Allele> allAllelesToUse,
final boolean useBAQedPileup,
final GenomeLocParser locParser);

View File

@ -35,8 +35,7 @@ import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.*;
@ -44,14 +43,13 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
private final int HAPLOTYPE_SIZE;
private final boolean getAlleleListFromVCF;
private static final int HAPLOTYPE_SIZE = 80;
private boolean DEBUG = false;
private boolean ignoreSNPAllelesWhenGenotypingIndels = false;
private PairHMMIndelErrorModel pairModel;
private boolean allelesArePadded;
private static ThreadLocal<HashMap<PileupElement, LinkedHashMap<Allele, Double>>> indelLikelihoodMap =
new ThreadLocal<HashMap<PileupElement, LinkedHashMap<Allele, Double>>>() {
protected synchronized HashMap<PileupElement, LinkedHashMap<Allele, Double>> initialValue() {
@ -75,124 +73,56 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
super(UAC, logger);
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
}
protected List<Allele> computeConsensusAlleles(ReferenceContext ref,
protected static List<Allele> computeConsensusAlleles(ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType,
GenomeLocParser locParser) {
GenomeLocParser locParser, UnifiedArgumentCollection UAC) {
ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE);
return counter.computeConsensusAlleles(ref, contexts, contextType);
}
private final static EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED);
public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final List<Allele> alternateAllelesToUse,
final List<Allele> allAllelesToUse,
final boolean useBAQedPileup,
final GenomeLocParser locParser) {
if (tracker == null)
return null;
GenomeLoc loc = ref.getLocus();
Allele refAllele, altAllele;
VariantContext vc = null;
boolean allelesArePadded = true;
if (!ref.getLocus().equals(lastSiteVisited)) {
// if (!ref.getLocus().equals(lastSiteVisited)) {
if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) {
// starting a new site: clear allele list
alleleList.clear();
lastSiteVisited = ref.getLocus();
indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>());
haplotypeMap.clear();
if (getAlleleListFromVCF) {
for (final VariantContext vc_input : tracker.getValues(UAC.alleles, loc)) {
if (vc_input != null &&
allowableTypes.contains(vc_input.getType()) &&
ref.getLocus().getStart() == vc_input.getStart()) {
vc = vc_input;
break;
}
}
// ignore places where we don't have a variant
if (vc == null)
return null;
alleleList.clear();
if (ignoreSNPAllelesWhenGenotypingIndels) {
// if there's an allele that has same length as the reference (i.e. a SNP or MNP), ignore it and don't genotype it
for (Allele a : vc.getAlleles())
if (a.isNonReference() && a.getBases().length == vc.getReference().getBases().length)
continue;
else
alleleList.add(a);
} else {
for (Allele a : vc.getAlleles())
alleleList.add(a);
}
if (vc.getReference().getBases().length == vc.getEnd()-vc.getStart()+1)
allelesArePadded = false;
} else {
alleleList = computeConsensusAlleles(ref, contexts, contextType, locParser);
if (alleleList.isEmpty())
return null;
}
}
// protect against having an indel too close to the edge of a contig
if (loc.getStart() <= HAPLOTYPE_SIZE)
return null;
// check if there is enough reference window to create haplotypes (can be an issue at end of contigs)
if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE)
return null;
if (alleleList.isEmpty())
return null;
refAllele = alleleList.get(0);
altAllele = alleleList.get(1);
// look for alt allele that has biggest length distance to ref allele
int maxLenDiff = 0;
for (Allele a : alleleList) {
if (a.isNonReference()) {
int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length());
if (lenDiff > maxLenDiff) {
maxLenDiff = lenDiff;
altAllele = a;
}
}
Pair<List<Allele>,Boolean> pair = getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC, ignoreSNPAllelesWhenGenotypingIndels);
alleleList = pair.first;
allelesArePadded = pair.second;
if (alleleList.isEmpty())
return null;
}
final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
final int hsize = ref.getWindow().size() - Math.abs(eventLength) - 1;
final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1;
if (hsize <= 0) {
logger.warn(String.format("Warning: event at location %s can't be genotyped, skipping", loc.toString()));
getHaplotypeMapFromAlleles(alleleList, ref, loc, haplotypeMap); // will update haplotypeMap adding elements
if (haplotypeMap == null || haplotypeMap.isEmpty())
return null;
}
haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(),
ref, hsize, numPrefBases);
// start making the VariantContext
// For all non-snp VC types, VC end location is just startLocation + length of ref allele including padding base.
int endLoc = loc.getStart() + refAllele.length()-1;
if (allelesArePadded)
endLoc++;
final int endLoc = computeEndLocation(alleleList, loc,allelesArePadded);
final int eventLength = getEventLength(alleleList);
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleleList).referenceBaseForIndel(ref.getBase());
// create the genotypes; no-call everyone for now
@ -206,23 +136,19 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
if (context.hasBasePileup()) {
final ReadBackedPileup pileup = context.getBasePileup();
if (pileup != null) {
final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods);
final ReadBackedPileup pileup = context.getBasePileup();
if (pileup != null) {
final GenotypeBuilder b = new GenotypeBuilder(sample.getKey());
final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
b.PL(genotypeLikelihoods);
b.DP(getFilteredDepth(pileup));
genotypes.add(b.make());
HashMap<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup));
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
if (DEBUG) {
System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString());
for (int k = 0; k < genotypeLikelihoods.length; k++)
System.out.format("%1.4f ", genotypeLikelihoods[k]);
System.out.println();
}
if (DEBUG) {
System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString());
for (int k = 0; k < genotypeLikelihoods.length; k++)
System.out.format("%1.4f ", genotypeLikelihoods[k]);
System.out.println();
}
}
}
@ -234,6 +160,102 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
return indelLikelihoodMap.get();
}
/**
 * Computes the inclusive end coordinate of a variant context built from these alleles.
 *
 * @param alleles allele list whose first element is, by convention, the reference allele
 * @param loc the start location of the event
 * @param allelesArePadded whether the alleles carry a leading padding base (adds one to the end)
 * @return the end location: start + reference allele length - 1, plus 1 if padded
 */
public static int computeEndLocation(final List<Allele> alleles, final GenomeLoc loc, final boolean allelesArePadded) {
    // The reference allele is always first in the list
    final Allele reference = alleles.get(0);
    final int paddingOffset = allelesArePadded ? 1 : 0;
    return loc.getStart() + reference.length() - 1 + paddingOffset;
}
/**
 * Populates haplotypeMap with the haplotypes implied by the given alleles at this locus.
 *
 * The map is cleared (left empty) instead of populated when haplotypes cannot be built:
 * the event is too close to the start of the contig, the reference window is too short
 * (can happen at the end of contigs), or the allele list is empty.
 *
 * @param alleleList candidate alleles (reference first); may be empty
 * @param ref the reference context providing the window of reference bases
 * @param loc the location of the event
 * @param haplotypeMap output map, updated in place (cleared on failure, appended to on success)
 */
public static void getHaplotypeMapFromAlleles(final List<Allele> alleleList,
                                              final ReferenceContext ref,
                                              final GenomeLoc loc,
                                              final LinkedHashMap<Allele, Haplotype> haplotypeMap) {
    // Guard clauses: any condition that makes haplotype construction impossible
    // leaves the map empty, which callers treat as "skip this site".
    if ( loc.getStart() <= HAPLOTYPE_SIZE
            || ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE
            || alleleList.isEmpty() ) {
        haplotypeMap.clear();
        return;
    }

    final int eventLength = getEventLength(alleleList);
    // Haplotype window shrinks by the event size; one extra base reserved
    final int haplotypeWindowSize = ref.getWindow().size() - Math.abs(eventLength) - 1;
    final int numPrefixBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1;
    haplotypeMap.putAll(Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(),
            ref, haplotypeWindowSize, numPrefixBases));
}
/**
 * Computes the signed length of the indel event implied by these alleles:
 * positive for an insertion, negative for a deletion, using the alternate allele
 * whose length differs most from the reference.
 *
 * @param alleleList alleles with the reference first and at least one alternate
 * @return alt length minus ref length for the most length-divergent alternate
 */
public static int getEventLength(List<Allele> alleleList) {
    final Allele refAllele = alleleList.get(0);
    // Default to the first alternate; replaced below if a more divergent one exists
    Allele chosenAlt = alleleList.get(1);

    int maxLenDiff = 0;
    for ( final Allele allele : alleleList ) {
        if ( !allele.isNonReference() )
            continue;
        final int lenDiff = Math.abs(allele.getBaseString().length() - refAllele.getBaseString().length());
        if ( lenDiff > maxLenDiff ) {
            maxLenDiff = lenDiff;
            chosenAlt = allele;
        }
    }

    return chosenAlt.getBaseString().length() - refAllele.getBaseString().length();
}
/**
 * Builds the initial allele list to genotype at this site.
 *
 * In GENOTYPE_GIVEN_ALLELES mode the alleles come from the first suitable record
 * (INDEL or MIXED, starting exactly at this locus) in the user-supplied ROD;
 * otherwise they are derived from the read data via consensus counting.
 *
 * @param tracker ROD tracker (consulted only in GGA mode)
 * @param ref reference context at this site
 * @param contexts stratified alignment contexts (used only in discovery mode)
 * @param contextType read orientation stratification
 * @param locParser genome loc parser
 * @param UAC argument collection (genotyping mode, alleles binding, consensus thresholds)
 * @param ignoreSNPAllelesWhenGenotypingIndels drop ref-length (SNP/MNP) alternates in GGA mode
 * @return pair of (allele list — empty if no usable record, padded flag)
 */
public static Pair<List<Allele>,Boolean> getInitialAlleleList(final RefMetaDataTracker tracker,
                                                              final ReferenceContext ref,
                                                              final Map<String, AlignmentContext> contexts,
                                                              final AlignmentContextUtils.ReadOrientation contextType,
                                                              final GenomeLocParser locParser,
                                                              final UnifiedArgumentCollection UAC,
                                                              final boolean ignoreSNPAllelesWhenGenotypingIndels) {
    List<Allele> alleles = new ArrayList<Allele>();
    boolean allelesArePadded = true;

    if ( UAC.GenotypingMode != GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
        // Discovery mode: derive candidate alleles directly from the pileups
        alleles = IndelGenotypeLikelihoodsCalculationModel.computeConsensusAlleles(ref, contexts, contextType, locParser, UAC);
        return new Pair<List<Allele>,Boolean>(alleles, allelesArePadded);
    }

    // GGA mode: find the first allowable variant record starting exactly here
    VariantContext matched = null;
    for ( final VariantContext candidate : tracker.getValues(UAC.alleles, ref.getLocus()) ) {
        if ( candidate != null
                && allowableTypes.contains(candidate.getType())
                && ref.getLocus().getStart() == candidate.getStart() ) {
            matched = candidate;
            break;
        }
    }

    // No usable record at this site: return an empty allele list
    if ( matched == null )
        return new Pair<List<Allele>,Boolean>(alleles, false);

    final int refLength = matched.getReference().getBases().length;
    for ( final Allele a : matched.getAlleles() ) {
        // Optionally skip alternates with the same length as the reference
        // (i.e. SNPs/MNPs) when we only want to genotype indels
        if ( ignoreSNPAllelesWhenGenotypingIndels && a.isNonReference() && a.getBases().length == refLength )
            continue;
        alleles.add(a);
    }

    // If the reference allele spans the full [start, end] interval there is no padding base
    if ( refLength == matched.getEnd() - matched.getStart() + 1 )
        allelesArePadded = false;

    return new Pair<List<Allele>,Boolean>(alleles, allelesArePadded);
}
// Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup,
// so that per-sample DP will include deletions covering the event.
protected int getFilteredDepth(ReadBackedPileup pileup) {

View File

@ -62,7 +62,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final List<Allele> alternateAllelesToUse,
final List<Allele> allAllelesToUse,
final boolean useBAQedPileup,
final GenomeLocParser locParser) {
@ -70,11 +70,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase);
final Allele refAllele = Allele.create(refBase, true);
// start making the VariantContext
final GenomeLoc loc = ref.getLocus();
final List<Allele> alleles = new ArrayList<Allele>();
alleles.add(refAllele);
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles);
// calculate the GLs
ArrayList<SampleGenotypeData> GLs = new ArrayList<SampleGenotypeData>(contexts.size());
@ -90,9 +85,16 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
GLs.add(new SampleGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup)));
}
// start making the VariantContext
final GenomeLoc loc = ref.getLocus();
final List<Allele> alleles = new ArrayList<Allele>();
alleles.add(refAllele);
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles);
// find the alternate allele(s) that we should be using
if ( alternateAllelesToUse != null ) {
alleles.addAll(alternateAllelesToUse);
if ( allAllelesToUse != null ) {
alleles.addAll(allAllelesToUse.subList(1,allAllelesToUse.size())); // this includes ref allele
} else if ( useAlleleFromVCF ) {
final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);
@ -156,12 +158,11 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
myLikelihoods[i] = allLikelihoods[PLordering[i]];
// normalize in log space so that max element is zero.
final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true));
final HashMap<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VCFConstants.DEPTH_KEY, sampleData.depth);
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
genotypes.add(new Genotype(sampleData.name, noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
final GenotypeBuilder gb = new GenotypeBuilder(sampleData.name);
final double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(myLikelihoods, false, true);
gb.PL(genotypeLikelihoods);
gb.DP(sampleData.depth);
genotypes.add(gb.make());
}
return builder.genotypes(genotypes).make();

View File

@ -65,18 +65,15 @@ public class UnifiedArgumentCollection {
/**
* The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with
* confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this
* is the default). Note that the confidence (QUAL) values for multi-sample low-pass (e.g. 4x per sample) calling might
* be significantly smaller with the new EXACT model than with our older GRID_SEARCH model, as the latter tended to
* over-estimate the confidence; for low-pass calling we tend to use much smaller thresholds (e.g. 4).
* is the default).
*/
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false)
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false)
public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
/**
* the minimum phred-scaled Qscore threshold to emit low confidence calls. Genotypes with confidence >= this but less
* than the calling threshold are emitted but marked as filtered.
* This argument allows you to emit low quality calls as filtered records.
*/
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false)
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
/**

View File

@ -252,7 +252,7 @@ public class UnifiedGenotyperEngine {
vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles).make();
}
if ( annotationEngine != null && rawContext.hasBasePileup() ) {
if ( annotationEngine != null ) {
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
final ReadBackedPileup pileup = rawContext.getBasePileup();
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
@ -378,10 +378,10 @@ public class UnifiedGenotyperEngine {
double overallLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
List<Allele> alternateAllelesToUse = builder.make().getAlternateAlleles();
List<Allele> allAllelesToUse = builder.make().getAlleles();
// the forward lod
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, alternateAllelesToUse, false, model);
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model);
AFresult.reset();
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult);
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
@ -390,7 +390,7 @@ public class UnifiedGenotyperEngine {
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, alternateAllelesToUse, false, model);
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model);
AFresult.reset();
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult);
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
@ -422,7 +422,7 @@ public class UnifiedGenotyperEngine {
if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // TODO - this function doesn't work with mixed records or records that started as mixed and then became non-mixed
vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);
if ( annotationEngine != null && !limitedContext && rawContext.hasBasePileup() ) {
if ( annotationEngine != null && !limitedContext ) {
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
final ReadBackedPileup pileup = rawContext.getBasePileup();
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
@ -441,7 +441,7 @@ public class UnifiedGenotyperEngine {
private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) {
if ( !BaseUtils.isRegularBase(refContext.getBase()) || !rawContext.hasBasePileup() )
if ( !BaseUtils.isRegularBase(refContext.getBase()) )
return null;
Map<String, AlignmentContext> stratifiedContexts = null;
@ -507,9 +507,7 @@ public class UnifiedGenotyperEngine {
int depth = 0;
if ( isCovered ) {
AlignmentContext context = contexts.get(sample);
if ( context.hasBasePileup() )
depth = context.getBasePileup().depthOfCoverage();
depth = contexts.get(sample).getBasePileup().depthOfCoverage();
}
P_of_ref *= 1.0 - (theta / 2.0) * getRefBinomialProb(depth);
@ -571,37 +569,35 @@ public class UnifiedGenotyperEngine {
final List<GenotypeLikelihoodsCalculationModel.Model> models = new ArrayList<GenotypeLikelihoodsCalculationModel.Model>(2);
if ( rawContext.hasBasePileup() ) {
// if we're genotyping given alleles and we have a requested SNP at this position, do SNP
if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
final VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles);
if ( vcInput == null )
return models;
// if we're genotyping given alleles and we have a requested SNP at this position, do SNP
if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
final VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles);
if ( vcInput == null )
return models;
if ( vcInput.isSNP() ) {
// ignore SNPs if the user chose INDEL mode only
if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH )
models.add(GenotypeLikelihoodsCalculationModel.Model.SNP);
else if ( UAC.GLmodel.name().toUpperCase().contains("SNP") )
models.add(UAC.GLmodel);
}
else if ( vcInput.isIndel() || vcInput.isMixed() ) {
// ignore INDELs if the user chose SNP mode only
if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH )
models.add(GenotypeLikelihoodsCalculationModel.Model.INDEL);
else if (UAC.GLmodel.name().toUpperCase().contains("INDEL"))
models.add(UAC.GLmodel);
}
// No support for other types yet
if ( vcInput.isSNP() ) {
// ignore SNPs if the user chose INDEL mode only
if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH )
models.add(GenotypeLikelihoodsCalculationModel.Model.SNP);
else if ( UAC.GLmodel.name().toUpperCase().contains("SNP") )
models.add(UAC.GLmodel);
}
else if ( vcInput.isIndel() || vcInput.isMixed() ) {
// ignore INDELs if the user chose SNP mode only
if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH )
models.add(GenotypeLikelihoodsCalculationModel.Model.INDEL);
else if (UAC.GLmodel.name().toUpperCase().contains("INDEL"))
models.add(UAC.GLmodel);
}
// No support for other types yet
}
else {
if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH ) {
models.add(GenotypeLikelihoodsCalculationModel.Model.SNP);
models.add(GenotypeLikelihoodsCalculationModel.Model.INDEL);
}
else {
if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH ) {
models.add(GenotypeLikelihoodsCalculationModel.Model.SNP);
models.add(GenotypeLikelihoodsCalculationModel.Model.INDEL);
}
else {
models.add(UAC.GLmodel);
}
models.add(UAC.GLmodel);
}
}

View File

@ -117,7 +117,7 @@ public class PairHMMIndelErrorModel {
}
static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) {
static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) {
// compute forward hrun length, example:
// AGGTGACCCCCCTGAGAG
// 001000012345000000
@ -164,10 +164,24 @@ public class PairHMMIndelErrorModel {
}
}
}
public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap, ReferenceContext ref, int eventLength, HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
public synchronized double[] computeDiploidReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele, Haplotype> haplotypeMap, ReferenceContext ref, int eventLength, HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap){
final int numHaplotypes = haplotypeMap.size();
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][numHaplotypes];
final int readCounts[] = new int[pileup.getNumberOfElements()];
final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, indelLikelihoodMap, readCounts);
return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
}
public synchronized double[][] computeGeneralReadHaplotypeLikelihoods(final ReadBackedPileup pileup,
final LinkedHashMap<Allele, Haplotype> haplotypeMap,
final ReferenceContext ref,
final int eventLength,
final HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap,
final int[] readCounts) {
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()];
final PairHMM pairHMM = new PairHMM(bandedLikelihoods);
int readIdx=0;
@ -367,7 +381,7 @@ public class PairHMMIndelErrorModel {
}
return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
return readLikelihoods;
}
private boolean useSoftClippedBases(GATKSAMRecord read, long eventStartPos, int eventLength) {
@ -385,7 +399,7 @@ public class PairHMMIndelErrorModel {
return b1.length;
}
private static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
// todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix

View File

@ -185,38 +185,36 @@ public class RealignerTargetCreator extends RodWalker<RealignerTargetCreator.Eve
}
// look at the normal context to get deletions and positions with high entropy
if ( context.hasBasePileup() ) {
final ReadBackedPileup pileup = context.getBasePileup();
final ReadBackedPileup pileup = context.getBasePileup();
int mismatchQualities = 0, totalQualities = 0;
final byte refBase = ref.getBase();
for ( PileupElement p : pileup ) {
int mismatchQualities = 0, totalQualities = 0;
final byte refBase = ref.getBase();
for ( PileupElement p : pileup ) {
// check the ends of the reads to see how far they extend
furthestStopPos = Math.max(furthestStopPos, p.getRead().getAlignmentEnd());
// check the ends of the reads to see how far they extend
furthestStopPos = Math.max(furthestStopPos, p.getRead().getAlignmentEnd());
// is it a deletion or insertion?
if ( p.isDeletion() || p.isBeforeInsertion() ) {
hasIndel = true;
if ( p.isBeforeInsertion() )
hasInsertion = true;
}
// look for mismatches
else if ( lookForMismatchEntropy ) {
if ( p.getBase() != refBase )
mismatchQualities += p.getQual();
totalQualities += p.getQual();
}
// is it a deletion or insertion?
if ( p.isDeletion() || p.isBeforeInsertion() ) {
hasIndel = true;
if ( p.isBeforeInsertion() )
hasInsertion = true;
}
// make sure we're supposed to look for high entropy
if ( lookForMismatchEntropy &&
pileup.getNumberOfElements() >= minReadsAtLocus &&
(double)mismatchQualities / (double)totalQualities >= mismatchThreshold )
hasPointEvent = true;
// look for mismatches
else if ( lookForMismatchEntropy ) {
if ( p.getBase() != refBase )
mismatchQualities += p.getQual();
totalQualities += p.getQual();
}
}
// make sure we're supposed to look for high entropy
if ( lookForMismatchEntropy &&
pileup.getNumberOfElements() >= minReadsAtLocus &&
(double)mismatchQualities / (double)totalQualities >= mismatchThreshold )
hasPointEvent = true;
// return null if no event occurred
if ( !hasIndel && !hasPointEvent )
return null;

View File

@ -316,6 +316,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
// first, the basic info
headerInfo.add(new VCFHeaderLine("source", "SomaticIndelDetector"));
headerInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
// FORMAT and INFO fields
// headerInfo.addAll(VCFUtils.getSupportedHeaderStrings());
@ -616,7 +617,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
throw new StingException("Read "+read.getReadName()+" from "+getToolkit().getSourceFileForReaderID(getToolkit().getReaderIDForRead(read))+
"has no Normal/Tumor tag associated with it");
// String rg = (String)read.getAttribute("RG");
// String rg = (String)read.getExtendedAttribute("RG");
// if ( rg == null )
// throw new UserException.MalformedBam(read, "Read "+read.getReadName()+" has no read group in merged stream. RG is required for somatic calls.");
@ -1147,13 +1148,12 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
GenotypesContext genotypes = GenotypesContext.create();
for ( String sample : normalSamples ) {
Map<String,Object> attrs = call.makeStatsAttributes(null);
if ( ! discard_event ) // we made a call - put actual het genotype here:
genotypes.add(new Genotype(sample,alleles,Genotype.NO_LOG10_PERROR,null,attrs,false));
else // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all)
genotypes.add(new Genotype(sample, homref_alleles,Genotype.NO_LOG10_PERROR,null,attrs,false));
final GenotypeBuilder gb = new GenotypeBuilder(sample);
gb.attributes(call.makeStatsAttributes(null));
gb.alleles(! discard_event
? alleles // we made a call - put actual het genotype here:
: homref_alleles); // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all)
genotypes.add(gb.make());
}
Set<String> filters = null;
@ -1237,11 +1237,11 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
GenotypesContext genotypes = GenotypesContext.create();
for ( String sample : normalSamples ) {
genotypes.add(new Genotype(sample, homRefN ? homRefAlleles : alleles,Genotype.NO_LOG10_PERROR,null,attrsNormal,false));
genotypes.add(GenotypeBuilder.create(sample, homRefN ? homRefAlleles : alleles, attrsNormal));
}
for ( String sample : tumorSamples ) {
genotypes.add(new Genotype(sample, homRefT ? homRefAlleles : alleles,Genotype.NO_LOG10_PERROR,null,attrsTumor,false) );
genotypes.add(GenotypeBuilder.create(sample, homRefT ? homRefAlleles : alleles, attrsTumor));
}
Set<String> filters = null;
@ -2143,7 +2143,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
class VCFIndelAttributes {
public static String ALLELIC_DEPTH_KEY = "AD";
public static String ALLELIC_DEPTH_KEY = VCFConstants.GENOTYPE_ALLELE_DEPTHS;
public static String DEPTH_TOTAL_KEY = VCFConstants.DEPTH_KEY;
public static String MAPQ_KEY = "MQS";

View File

@ -97,10 +97,10 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
private ArrayList<Sample> trios = new ArrayList<Sample>();
//Matrix of priors for all genotype combinations
private EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>> mvCountMatrix;
private EnumMap<GenotypeType,EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>> mvCountMatrix;
//Matrix of allele transmission
private EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,TrioPhase>>> transmissionMatrix;
private EnumMap<GenotypeType,EnumMap<GenotypeType,EnumMap<GenotypeType,TrioPhase>>> transmissionMatrix;
//Metrics counters hash keys
private final Byte NUM_TRIO_GENOTYPES_CALLED = 0;
@ -138,17 +138,17 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
private EnumMap<FamilyMember,Genotype> trioPhasedGenotypes = new EnumMap<FamilyMember, Genotype>(FamilyMember.class);
private ArrayList<Allele> getAlleles(Genotype.Type genotype){
private ArrayList<Allele> getAlleles(GenotypeType genotype){
ArrayList<Allele> alleles = new ArrayList<Allele>(2);
if(genotype == Genotype.Type.HOM_REF){
if(genotype == GenotypeType.HOM_REF){
alleles.add(REF);
alleles.add(REF);
}
else if(genotype == Genotype.Type.HET){
else if(genotype == GenotypeType.HET){
alleles.add(REF);
alleles.add(VAR);
}
else if(genotype == Genotype.Type.HOM_VAR){
else if(genotype == GenotypeType.HOM_VAR){
alleles.add(VAR);
alleles.add(VAR);
}
@ -158,27 +158,34 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
return alleles;
}
private boolean isPhasable(Genotype.Type genotype){
return genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HET || genotype == Genotype.Type.HOM_VAR;
private boolean isPhasable(GenotypeType genotype){
return genotype == GenotypeType.HOM_REF || genotype == GenotypeType.HET || genotype == GenotypeType.HOM_VAR;
}
//Create a new Genotype based on information from a single individual
//Homozygous genotypes will be set as phased, heterozygous won't be
private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){
if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){
trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_LOG10_PERROR, null, null, true));
}
else
trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_LOG10_PERROR,null,null,false));
private void phaseSingleIndividualAlleles(GenotypeType genotype, FamilyMember familyMember){
boolean phase = genotype == GenotypeType.HOM_REF || genotype == GenotypeType.HOM_VAR;
trioPhasedGenotypes.put(familyMember, makeGenotype(genotype, phase));
}
private Genotype makeGenotype(final GenotypeType type, boolean phase) {
return makeGenotype(getAlleles(type), phase);
}
private Genotype makeGenotype(final List<Allele> alleles, boolean phase) {
final GenotypeBuilder gb = new GenotypeBuilder(DUMMY_NAME, alleles);
gb.phased(phase);
return gb.make();
}
//Find the phase for a parent/child pair
private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){
private void phasePairAlleles(GenotypeType parentGenotype, GenotypeType childGenotype, FamilyMember parent){
//Special case for Het/Het as it is ambiguous
if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_LOG10_PERROR, null, null, false));
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false));
if(parentGenotype == GenotypeType.HET && childGenotype == GenotypeType.HET){
trioPhasedGenotypes.put(parent, makeGenotype(parentGenotype, false));
trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childGenotype, false));
return;
}
@ -190,34 +197,34 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//If there is a possible phasing between the parent and child => phase
int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0));
if(childTransmittedAlleleIndex > -1){
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
trioPhasedGenotypes.put(parent, makeGenotype(parentAlleles, true));
childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
if(parent.equals(FamilyMember.MOTHER))
childPhasedAlleles.add(childAlleles.get(0));
else
childPhasedAlleles.add(0,childAlleles.get(0));
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAlleles, true));
}
else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){
parentPhasedAlleles.add(parentAlleles.get(1));
parentPhasedAlleles.add(parentAlleles.get(0));
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
trioPhasedGenotypes.put(parent, makeGenotype(parentPhasedAlleles, true));
childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
if(parent.equals(FamilyMember.MOTHER))
childPhasedAlleles.add(childAlleles.get(0));
else
childPhasedAlleles.add(0,childAlleles.get(0));
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAlleles, true));
}
//This is a Mendelian Violation => Do not phase
else{
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_LOG10_PERROR,null,null,false));
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false));
trioPhasedGenotypes.put(parent, makeGenotype(parentGenotype, false));
trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childGenotype, false));
}
}
//Phases a family by transmission
private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
private void phaseFamilyAlleles(GenotypeType mother, GenotypeType father, GenotypeType child){
Set<ArrayList<Allele>> possiblePhasedChildGenotypes = new HashSet<ArrayList<Allele>>();
ArrayList<Allele> motherAlleles = getAlleles(mother);
@ -246,7 +253,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
motherPhasedAlleles.add(motherAlleles.get(0));
else
motherPhasedAlleles.add(motherAlleles.get(1));
trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true));
trioPhasedGenotypes.put(FamilyMember.MOTHER, makeGenotype(motherPhasedAlleles, true));
//Create father's genotype
ArrayList<Allele> fatherPhasedAlleles = new ArrayList<Allele>(2);
@ -255,10 +262,10 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
fatherPhasedAlleles.add(fatherAlleles.get(0));
else
fatherPhasedAlleles.add(fatherAlleles.get(1));
trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true));
trioPhasedGenotypes.put(FamilyMember.FATHER, makeGenotype(fatherPhasedAlleles,true));
//Create child's genotype
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_LOG10_PERROR,null,null,true));
trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAllelesAlleles,true));
//Once a phased combination is found; exit
return;
@ -266,16 +273,16 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
}
//If this is reached then no phasing could be found
trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_LOG10_PERROR,null,null,false));
trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_LOG10_PERROR,null,null,false));
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_LOG10_PERROR,null,null,false));
trioPhasedGenotypes.put(FamilyMember.MOTHER, makeGenotype(mother,false));
trioPhasedGenotypes.put(FamilyMember.FATHER, makeGenotype(father,false));
trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(child,false));
}
/* Constructor: Creates a conceptual trio genotype combination from the given genotypes.
If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair
or single individual.
*/
public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
public TrioPhase(GenotypeType mother, GenotypeType father, GenotypeType child){
//Take care of cases where one or more family members are no call
if(!isPhasable(child)){
@ -297,7 +304,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
}
//Special case for Het/Het/Het as it is ambiguous
else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){
else if(mother == GenotypeType.HET && father == GenotypeType.HET && child == GenotypeType.HET){
phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
@ -311,7 +318,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
if(fatherFAlleleFirst && trioPhasedGenotypes.get(FamilyMember.CHILD).isPhased()){
ArrayList<Allele> childAlleles = new ArrayList<Allele>(trioPhasedGenotypes.get(FamilyMember.CHILD).getAlleles());
childAlleles.add(childAlleles.remove(0));
trioPhasedGenotypes.put(FamilyMember.CHILD,new Genotype(DUMMY_NAME,childAlleles,Genotype.NO_LOG10_PERROR,null,null,true));
trioPhasedGenotypes.put(FamilyMember.CHILD,makeGenotype(childAlleles,true));
}
}
@ -347,7 +354,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//Add the transmission probability
Map<String, Object> genotypeAttributes = new HashMap<String, Object>();
genotypeAttributes.putAll(genotype.getAttributes());
genotypeAttributes.putAll(genotype.getExtendedAttributes());
if(transmissionProb>NO_TRANSMISSION_PROB)
genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission);
@ -370,7 +377,10 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
else
log10Error = genotype.getLikelihoods().getLog10GQ(phasedGenotype.getType());
return new Genotype(genotype.getSampleName(), phasedAlleles, log10Error, null, genotypeAttributes, phasedGenotype.isPhased());
return new GenotypeBuilder(genotype).alleles(phasedAlleles)
.log10PError(log10Error)
.attributes(genotypeAttributes)
.phased(phasedGenotype.isPhased()).make();
}
@ -438,15 +448,15 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//Create the transmission matrices
private void buildMatrices(){
mvCountMatrix = new EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>>(Genotype.Type.class);
transmissionMatrix = new EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,TrioPhase>>>(Genotype.Type.class);
for(Genotype.Type mother : Genotype.Type.values()){
mvCountMatrix.put(mother,new EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>(Genotype.Type.class));
transmissionMatrix.put(mother,new EnumMap<Genotype.Type,EnumMap<Genotype.Type,TrioPhase>>(Genotype.Type.class));
for(Genotype.Type father : Genotype.Type.values()){
mvCountMatrix.get(mother).put(father,new EnumMap<Genotype.Type, Integer>(Genotype.Type.class));
transmissionMatrix.get(mother).put(father,new EnumMap<Genotype.Type,TrioPhase>(Genotype.Type.class));
for(Genotype.Type child : Genotype.Type.values()){
mvCountMatrix = new EnumMap<GenotypeType,EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>>(GenotypeType.class);
transmissionMatrix = new EnumMap<GenotypeType,EnumMap<GenotypeType,EnumMap<GenotypeType,TrioPhase>>>(GenotypeType.class);
for(GenotypeType mother : GenotypeType.values()){
mvCountMatrix.put(mother,new EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>(GenotypeType.class));
transmissionMatrix.put(mother,new EnumMap<GenotypeType,EnumMap<GenotypeType,TrioPhase>>(GenotypeType.class));
for(GenotypeType father : GenotypeType.values()){
mvCountMatrix.get(mother).put(father,new EnumMap<GenotypeType, Integer>(GenotypeType.class));
transmissionMatrix.get(mother).put(father,new EnumMap<GenotypeType,TrioPhase>(GenotypeType.class));
for(GenotypeType child : GenotypeType.values()){
mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child));
transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child));
}
@ -457,16 +467,16 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//Returns the number of Mendelian Violations for a given genotype combination.
//If one of the parents genotype is missing, it will consider it as a parent/child pair
//If the child genotype or both parents genotypes are missing, 0 is returned.
private int getCombinationMVCount(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
private int getCombinationMVCount(GenotypeType mother, GenotypeType father, GenotypeType child){
//Child is no call => No MV
if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE)
if(child == GenotypeType.NO_CALL || child == GenotypeType.UNAVAILABLE)
return 0;
//Add parents with genotypes for the evaluation
ArrayList<Genotype.Type> parents = new ArrayList<Genotype.Type>();
if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE))
ArrayList<GenotypeType> parents = new ArrayList<GenotypeType>();
if (!(mother == GenotypeType.NO_CALL || mother == GenotypeType.UNAVAILABLE))
parents.add(mother);
if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE))
if (!(father == GenotypeType.NO_CALL || father == GenotypeType.UNAVAILABLE))
parents.add(father);
//Both parents no calls => No MV
@ -477,35 +487,35 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
int parentsNumRefAlleles = 0;
int parentsNumAltAlleles = 0;
for(Genotype.Type parent : parents){
if(parent == Genotype.Type.HOM_REF){
for(GenotypeType parent : parents){
if(parent == GenotypeType.HOM_REF){
parentsNumRefAlleles++;
}
else if(parent == Genotype.Type.HET){
else if(parent == GenotypeType.HET){
parentsNumRefAlleles++;
parentsNumAltAlleles++;
}
else if(parent == Genotype.Type.HOM_VAR){
else if(parent == GenotypeType.HOM_VAR){
parentsNumAltAlleles++;
}
}
//Case Child is HomRef
if(child == Genotype.Type.HOM_REF){
if(child == GenotypeType.HOM_REF){
if(parentsNumRefAlleles == parents.size())
return 0;
else return (parents.size()-parentsNumRefAlleles);
}
//Case child is HomVar
if(child == Genotype.Type.HOM_VAR){
if(child == GenotypeType.HOM_VAR){
if(parentsNumAltAlleles == parents.size())
return 0;
else return parents.size()-parentsNumAltAlleles;
}
//Case child is Het
if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2))
if(child == GenotypeType.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2))
return 0;
//MV
@ -513,7 +523,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
}
//Given two trio genotypes combinations, returns the number of different genotypes between the two combinations.
private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){
private int countFamilyGenotypeDiff(GenotypeType motherOriginal,GenotypeType fatherOriginal,GenotypeType childOriginal,GenotypeType motherNew,GenotypeType fatherNew,GenotypeType childNew){
int count = 0;
if(motherOriginal!=motherNew)
count++;
@ -526,21 +536,21 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//Get a Map of genotype likelihoods.
//In case of null, unavailable or no call, all likelihoods are 1/3.
private EnumMap<Genotype.Type,Double> getLikelihoodsAsMapSafeNull(Genotype genotype){
private EnumMap<GenotypeType,Double> getLikelihoodsAsMapSafeNull(Genotype genotype){
if(genotype == null || !genotype.isCalled()){
EnumMap<Genotype.Type,Double> likelihoods = new EnumMap<Genotype.Type, Double>(Genotype.Type.class);
likelihoods.put(Genotype.Type.HOM_REF,1.0/3.0);
likelihoods.put(Genotype.Type.HET,1.0/3.0);
likelihoods.put(Genotype.Type.HOM_VAR,1.0/3.0);
EnumMap<GenotypeType,Double> likelihoods = new EnumMap<GenotypeType, Double>(GenotypeType.class);
likelihoods.put(GenotypeType.HOM_REF,1.0/3.0);
likelihoods.put(GenotypeType.HET,1.0/3.0);
likelihoods.put(GenotypeType.HOM_VAR,1.0/3.0);
return likelihoods;
}
return genotype.getLikelihoods().getAsMap(true);
}
//Returns the Genotype.Type; returns UNVAILABLE if given null
private Genotype.Type getTypeSafeNull(Genotype genotype){
//Returns the GenotypeType; returns UNVAILABLE if given null
private GenotypeType getTypeSafeNull(Genotype genotype){
if(genotype == null)
return Genotype.Type.UNAVAILABLE;
return GenotypeType.UNAVAILABLE;
return genotype.getType();
}
@ -561,18 +571,18 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//Always assign the first parent as the parent having genotype information in pairs
//Always assign the mother as the first parent in trios
int parentsCalled = 0;
Map<Genotype.Type,Double> firstParentLikelihoods;
Map<Genotype.Type,Double> secondParentLikelihoods;
ArrayList<Genotype.Type> bestFirstParentGenotype = new ArrayList<Genotype.Type>();
ArrayList<Genotype.Type> bestSecondParentGenotype = new ArrayList<Genotype.Type>();
ArrayList<Genotype.Type> bestChildGenotype = new ArrayList<Genotype.Type>();
Genotype.Type pairSecondParentGenotype = null;
Map<GenotypeType,Double> firstParentLikelihoods;
Map<GenotypeType,Double> secondParentLikelihoods;
ArrayList<GenotypeType> bestFirstParentGenotype = new ArrayList<GenotypeType>();
ArrayList<GenotypeType> bestSecondParentGenotype = new ArrayList<GenotypeType>();
ArrayList<GenotypeType> bestChildGenotype = new ArrayList<GenotypeType>();
GenotypeType pairSecondParentGenotype = null;
if(mother == null || !mother.isCalled()){
firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father);
secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother);
bestFirstParentGenotype.add(getTypeSafeNull(father));
bestSecondParentGenotype.add(getTypeSafeNull(mother));
pairSecondParentGenotype = mother == null ? Genotype.Type.UNAVAILABLE : mother.getType();
pairSecondParentGenotype = mother == null ? GenotypeType.UNAVAILABLE : mother.getType();
if(father != null && father.isCalled())
parentsCalled = 1;
}
@ -583,12 +593,12 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
bestSecondParentGenotype.add(getTypeSafeNull(father));
if(father == null || !father.isCalled()){
parentsCalled = 1;
pairSecondParentGenotype = father == null ? Genotype.Type.UNAVAILABLE : father.getType();
pairSecondParentGenotype = father == null ? GenotypeType.UNAVAILABLE : father.getType();
}else{
parentsCalled = 2;
}
}
Map<Genotype.Type,Double> childLikelihoods = getLikelihoodsAsMapSafeNull(child);
Map<GenotypeType,Double> childLikelihoods = getLikelihoodsAsMapSafeNull(child);
bestChildGenotype.add(getTypeSafeNull(child));
//Prior vars
@ -604,9 +614,9 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
int mvCount;
int cumulativeMVCount = 0;
double configurationLikelihood = 0;
for(Map.Entry<Genotype.Type,Double> childGenotype : childLikelihoods.entrySet()){
for(Map.Entry<Genotype.Type,Double> firstParentGenotype : firstParentLikelihoods.entrySet()){
for(Map.Entry<Genotype.Type,Double> secondParentGenotype : secondParentLikelihoods.entrySet()){
for(Map.Entry<GenotypeType,Double> childGenotype : childLikelihoods.entrySet()){
for(Map.Entry<GenotypeType,Double> firstParentGenotype : firstParentLikelihoods.entrySet()){
for(Map.Entry<GenotypeType,Double> secondParentGenotype : secondParentLikelihoods.entrySet()){
mvCount = mvCountMatrix.get(firstParentGenotype.getKey()).get(secondParentGenotype.getKey()).get(childGenotype.getKey());
//For parent/child pairs, sum over the possible genotype configurations of the missing parent
if(parentsCalled<2){
@ -797,9 +807,9 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",
vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),
phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoodsString(),
phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoodsString(),
phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoodsString());
phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),Arrays.asList(phasedMother.getAD()),
phasedMother.getLikelihoodsString(), phasedFather.getGenotypeString(),phasedFather.getDP(),Arrays.asList(phasedFather.getAD()),phasedFather.getLikelihoodsString(),
phasedChild.getGenotypeString(),Arrays.asList(phasedChild.getDP()),phasedChild.getAD(),phasedChild.getLikelihoodsString());
if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
}
@ -809,8 +819,8 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",
vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),
phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoodsString(),
phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoodsString());
phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),Arrays.asList(phasedMother.getAD()),phasedMother.getLikelihoodsString(),
phasedChild.getGenotypeString(),phasedChild.getDP(),Arrays.asList(phasedChild.getAD()),phasedChild.getLikelihoodsString());
}
}
else{
@ -820,8 +830,8 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",
vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),
phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoodsString(),
phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoodsString());
phasedFather.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getDP(),Arrays.asList(phasedFather.getAD()),phasedFather.getLikelihoodsString(),
phasedChild.getGenotypeString(),phasedChild.getDP(),Arrays.asList(phasedChild.getAD()),phasedChild.getLikelihoodsString());
}
//Report violation if set so

View File

@ -109,14 +109,13 @@ class PhasingUtils {
}
double mergedGQ = Math.max(gt1.getLog10PError(), gt2.getLog10PError());
Set<String> mergedGtFilters = new HashSet<String>(); // Since gt1 and gt2 were unfiltered, the Genotype remains unfiltered
Map<String, Object> mergedGtAttribs = new HashMap<String, Object>();
PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2);
if (phaseQual.PQ != null)
mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ);
Genotype mergedGt = new Genotype(gt1.getSampleName(), mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, phaseQual.isPhased);
Genotype mergedGt = new GenotypeBuilder(gt1.getSampleName(), mergedAllelesForSample).log10PError(mergedGQ).attributes(mergedGtAttribs).phased(phaseQual.isPhased).make();
mergedGenotypes.add(mergedGt);
}

View File

@ -269,10 +269,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
logger.debug("Unprocessed variant = " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc));
}
int numReads = 0;
if (context.hasBasePileup()) {
numReads = context.getBasePileup().getNumberOfElements();
}
int numReads = context.getBasePileup().getNumberOfElements();
PhasingStats addInPhaseStats = new PhasingStats(numReads, 1);
phaseStats.addIn(addInPhaseStats);
}
@ -288,7 +285,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
private VariantContext reduceVCToSamples(VariantContext vc, Set<String> samplesToPhase) {
// for ( String sample : samplesToPhase )
// logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() ));
VariantContext subvc = vc.subContextFromSamples(samplesToPhase);
VariantContext subvc = vc.subContextFromSamples(samplesToPhase, true);
// logger.debug("original VC = " + vc);
// logger.debug("sub VC = " + subvc);
return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF);
@ -374,7 +371,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
if (isUnfilteredCalledDiploidGenotype(gt)) {
if (gt.isHom()) { // Note that this Genotype may be replaced later to contain the PQ of a downstream het site that was phased relative to a het site lying upstream of this hom site:
// true <-> can trivially phase a hom site relative to ANY previous site:
Genotype phasedGt = new Genotype(gt.getSampleName(), gt.getAlleles(), gt.getLog10PError(), gt.getFilters(), gt.getAttributes(), true);
Genotype phasedGt = new GenotypeBuilder(gt).phased(true).make();
uvc.setGenotype(samp, phasedGt);
}
else if (gt.isHet()) { // Attempt to phase this het genotype relative to the previous het genotype
@ -408,9 +405,10 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
if (DEBUG) logger.debug("THE PHASE CHOSEN HERE:\n" + allelePair + "\n\n");
ensurePhasing(allelePair, prevAllelePair, pr.haplotype);
Map<String, Object> gtAttribs = new HashMap<String, Object>(gt.getAttributes());
gtAttribs.put(PQ_KEY, pr.phaseQuality);
Genotype phasedGt = new Genotype(gt.getSampleName(), allelePair.getAllelesAsList(), gt.getLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased);
Genotype phasedGt = new GenotypeBuilder(gt)
.alleles(allelePair.getAllelesAsList())
.attribute(PQ_KEY, pr.phaseQuality)
.phased(genotypesArePhased).make();
uvc.setGenotype(samp, phasedGt);
}
@ -428,9 +426,9 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
interiorUvc.setPhasingInconsistent();
if (genotypesArePhased) {
Map<String, Object> handledGtAttribs = new HashMap<String, Object>(handledGt.getAttributes());
handledGtAttribs.put(PQ_KEY, pr.phaseQuality);
Genotype phasedHomGt = new Genotype(handledGt.getSampleName(), handledGt.getAlleles(), handledGt.getLog10PError(), handledGt.getFilters(), handledGtAttribs, genotypesArePhased);
Genotype phasedHomGt = new GenotypeBuilder(handledGt)
.attribute(PQ_KEY, pr.phaseQuality)
.phased(genotypesArePhased).make();
interiorUvc.setGenotype(samp, phasedHomGt);
}
}
@ -1106,10 +1104,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
this.sampleReadBases = new HashMap<String, ReadBasesAtPosition>();
if (alignment != null) {
ReadBackedPileup pileup = null;
if (alignment.hasBasePileup()) {
pileup = alignment.getBasePileup();
}
ReadBackedPileup pileup = alignment.getBasePileup();
if (pileup != null) {
// filter the read-base pileup based on min base and mapping qualities:
pileup = pileup.getBaseAndMappingFilteredPileup(MIN_BASE_QUALITY_SCORE, MIN_MAPPING_QUALITY_SCORE);
@ -1439,7 +1434,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
}
public static boolean isUnfilteredCalledDiploidGenotype(Genotype gt) {
return (gt.isNotFiltered() && gt.isCalled() && gt.getPloidy() == 2);
return (! gt.isFiltered() && gt.isCalled() && gt.getPloidy() == 2);
}
private class MultipleBaseCountsWriter {

View File

@ -365,7 +365,7 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
return counter;
// Do not operate on variants that are not covered to the optional minimum depth
if (!context.hasReads() || !context.hasBasePileup() || (minDepth > 0 && context.getBasePileup().getBases().length < minDepth)) {
if (!context.hasReads() || (minDepth > 0 && context.getBasePileup().getBases().length < minDepth)) {
counter.nUncovered = 1L;
if (vcComp.getAttribute("GV").equals("T"))
counter.nAltNotCalled = 1L;
@ -423,7 +423,7 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
}
}
else {
// if (!vcComp.hasAttribute("GV"))
// if (!vcComp.hasExtendedAttribute("GV"))
// throw new UserException.BadInput("Variant has no GV annotation in the INFO field. " + vcComp.getChr() + ":" + vcComp.getStart());
if (call.isCalledAlt(callConf)) {

View File

@ -43,7 +43,7 @@ public class GLBasedSampleSelector extends SampleSelector {
return true;
// want to include a site in the given samples if it is *likely* to be variant (via the EXACT model)
// first subset to the samples
VariantContext subContext = vc.subContextFromSamples(samples);
VariantContext subContext = vc.subContextFromSamples(samples, true);
// now check to see (using EXACT model) whether this should be variant
// do we want to apply a prior? maybe user-spec?

View File

@ -45,7 +45,7 @@ public class GTBasedSampleSelector extends SampleSelector{
if ( samples == null || samples.isEmpty() )
return true;
VariantContext subContext = vc.subContextFromSamples(samples, vc.getAlleles());
VariantContext subContext = vc.subContextFromSamples(samples, false);
if ( subContext.isPolymorphicInSamples() ) {
return true;
}

View File

@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.Molten;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeType;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
@ -54,7 +55,7 @@ public class GenotypeConcordance extends VariantEvaluator {
* Initialize this object
*/
public GenotypeConcordance() {
final int nGenotypeTypes = Genotype.Type.values().length;
final int nGenotypeTypes = GenotypeType.values().length;
truthByCalledGenotypeCounts = new long[nGenotypeTypes][nGenotypeTypes];
}
@ -75,11 +76,11 @@ public class GenotypeConcordance extends VariantEvaluator {
if (eval != null) {
for (final Genotype g : eval.getGenotypes() ) {
final String sample = g.getSampleName();
final Genotype.Type called = g.getType();
final Genotype.Type truth;
final GenotypeType called = g.getType();
final GenotypeType truth;
if (!validationIsValidVC || !validation.hasGenotype(sample)) {
truth = Genotype.Type.NO_CALL;
truth = GenotypeType.NO_CALL;
} else {
truth = validation.getGenotype(sample).getType();
}
@ -90,19 +91,19 @@ public class GenotypeConcordance extends VariantEvaluator {
// otherwise, mark no-calls for all samples
else {
final Genotype.Type called = Genotype.Type.NO_CALL;
final GenotypeType called = GenotypeType.NO_CALL;
for (final Genotype g : validation.getGenotypes()) {
final Genotype.Type truth = g.getType();
final GenotypeType truth = g.getType();
incrValue(truth, called);
// print out interesting sites
/*
if ( PRINT_INTERESTING_SITES && super.getVEWalker().gcLog != null ) {
if ( (truth == Genotype.Type.HOM_VAR || truth == Genotype.Type.HET) && called == Genotype.Type.NO_CALL ) {
if ( (truth == GenotypeType.HOM_VAR || truth == GenotypeType.HET) && called == GenotypeType.NO_CALL ) {
super.getVEWalker().gcLog.printf("%s FN %s%n", group, validation);
}
if ( (called == Genotype.Type.HOM_VAR || called == Genotype.Type.HET) && truth == Genotype.Type.HOM_REF ) {
if ( (called == GenotypeType.HOM_VAR || called == GenotypeType.HET) && truth == GenotypeType.HOM_REF ) {
super.getVEWalker().gcLog.printf("%s FP %s%n", group, validation);
}
}
@ -121,36 +122,36 @@ public class GenotypeConcordance extends VariantEvaluator {
* @param truth the truth type
* @param called the called type
*/
private void incrValue(final Genotype.Type truth, final Genotype.Type called) {
private void incrValue(final GenotypeType truth, final GenotypeType called) {
truthByCalledGenotypeCounts[truth.ordinal()][called.ordinal()]++;
}
private long count(final Genotype.Type truth, final Genotype.Type called) {
private long count(final GenotypeType truth, final GenotypeType called) {
return truthByCalledGenotypeCounts[truth.ordinal()][called.ordinal()];
}
private long count(final EnumSet<Genotype.Type> truth, final Genotype.Type called) {
private long count(final EnumSet<GenotypeType> truth, final GenotypeType called) {
return count(truth, EnumSet.of(called));
}
private long count(final Genotype.Type truth, final EnumSet<Genotype.Type> called) {
private long count(final GenotypeType truth, final EnumSet<GenotypeType> called) {
return count(EnumSet.of(truth), called);
}
private long count(final EnumSet<Genotype.Type> truth, final EnumSet<Genotype.Type> called) {
private long count(final EnumSet<GenotypeType> truth, final EnumSet<GenotypeType> called) {
long sum = 0;
for ( final Genotype.Type truth1 : truth ) {
for ( final Genotype.Type called1 : called ) {
for ( final GenotypeType truth1 : truth ) {
for ( final GenotypeType called1 : called ) {
sum += count(truth1, called1);
}
}
return sum;
}
private long countDiag( final EnumSet<Genotype.Type> d1 ) {
private long countDiag( final EnumSet<GenotypeType> d1 ) {
long sum = 0;
for(final Genotype.Type e1 : d1 ) {
for(final GenotypeType e1 : d1 ) {
sum += truthByCalledGenotypeCounts[e1.ordinal()][e1.ordinal()];
}
@ -159,13 +160,13 @@ public class GenotypeConcordance extends VariantEvaluator {
@Override
public void finalizeEvaluation() {
final EnumSet<Genotype.Type> allVariantGenotypes = EnumSet.of(Genotype.Type.HOM_VAR, Genotype.Type.HET);
final EnumSet<Genotype.Type> allCalledGenotypes = EnumSet.of(Genotype.Type.HOM_VAR, Genotype.Type.HET, Genotype.Type.HOM_REF);
final EnumSet<Genotype.Type> allGenotypes = EnumSet.allOf(Genotype.Type.class);
final EnumSet<GenotypeType> allVariantGenotypes = EnumSet.of(GenotypeType.HOM_VAR, GenotypeType.HET);
final EnumSet<GenotypeType> allCalledGenotypes = EnumSet.of(GenotypeType.HOM_VAR, GenotypeType.HET, GenotypeType.HOM_REF);
final EnumSet<GenotypeType> allGenotypes = EnumSet.allOf(GenotypeType.class);
// exact values of the table
for ( final Genotype.Type truth : Genotype.Type.values() ) {
for ( final Genotype.Type called : Genotype.Type.values() ) {
for ( final GenotypeType truth : GenotypeType.values() ) {
for ( final GenotypeType called : GenotypeType.values() ) {
final String field = String.format("n_true_%s_called_%s", truth, called);
final Long value = count(truth, called);
map.put(field, value.toString());
@ -173,20 +174,20 @@ public class GenotypeConcordance extends VariantEvaluator {
}
// counts of called genotypes
for ( final Genotype.Type called : Genotype.Type.values() ) {
for ( final GenotypeType called : GenotypeType.values() ) {
final String field = String.format("total_called_%s", called);
final Long value = count(allGenotypes, called);
map.put(field, value.toString());
}
// counts of true genotypes
for ( final Genotype.Type truth : Genotype.Type.values() ) {
for ( final GenotypeType truth : GenotypeType.values() ) {
final String field = String.format("total_true_%s", truth);
final Long value = count(truth, allGenotypes);
map.put(field, value.toString());
}
for ( final Genotype.Type genotype : Genotype.Type.values() ) {
for ( final GenotypeType genotype : GenotypeType.values() ) {
final String field = String.format("percent_%s_called_%s", genotype, genotype);
long numer = count(genotype, genotype);
long denom = count(EnumSet.of(genotype), allGenotypes);
@ -215,7 +216,7 @@ public class GenotypeConcordance extends VariantEvaluator {
// overall genotype concordance of sites called non-ref in eval track
// MAD: this is the non-reference discrepancy rate
final String field = "percent_non_reference_discrepancy_rate";
long homrefConcords = count(Genotype.Type.HOM_REF, Genotype.Type.HOM_REF);
long homrefConcords = count(GenotypeType.HOM_REF, GenotypeType.HOM_REF);
long allNoHomRef = count(allCalledGenotypes, allCalledGenotypes) - homrefConcords;
long numer = allNoHomRef - countDiag(allVariantGenotypes);
long denom = count(allCalledGenotypes, allCalledGenotypes) - homrefConcords;

View File

@ -121,9 +121,9 @@ public class ValidationReport extends VariantEvaluator implements StandardEval {
int ac = 0;
if ( vc.getNAlleles() > 2 ) {
return SiteStatus.POLY;
//// System.out.printf("multiple alleles %s = %s%n", vc.getAlleles(), vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY));
//// System.out.printf("multiple alleles %s = %s%n", vc.getAlleles(), vc.getExtendedAttribute(VCFConstants.ALLELE_COUNT_KEY));
// // todo -- omg this is painful. We need a better approach to dealing with multi-valued attributes
// for ( String v : (List<String>)vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY) )
// for ( String v : (List<String>)vc.getExtendedAttribute(VCFConstants.ALLELE_COUNT_KEY) )
// ac += Integer.valueOf(v);
//// System.out.printf(" ac = %d%n", ac);
}

View File

@ -241,7 +241,7 @@ public class VariantSummary extends VariantEvaluator implements StandardEval {
// update transition / transversion ratio
if ( titvTable != null ) titvTable.inc(type, g.getSampleName());
if ( g.hasAttribute(VCFConstants.DEPTH_KEY) )
if ( g.hasDP() )
depthPerSample.inc(type, g.getSampleName());
}
}

View File

@ -199,7 +199,7 @@ public class VariantEvalUtils {
* @return a new VariantContext with just the requested samples
*/
public VariantContext getSubsetOfVariantContext(VariantContext vc, Set<String> sampleNames) {
VariantContext vcsub = vc.subContextFromSamples(sampleNames, vc.getAlleles());
VariantContext vcsub = vc.subContextFromSamples(sampleNames, false);
VariantContextBuilder builder = new VariantContextBuilder(vcsub);
final int originalAlleleCount = vc.getHetCount() + 2 * vc.getHomVarCount();

View File

@ -223,7 +223,7 @@ public class LeftAlignVariants extends RodWalker<Integer, Integer> {
newA = Allele.NO_CALL;
newAlleles.add(newA);
}
newGenotypes.add(Genotype.modifyAlleles(genotype, newAlleles));
newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make());
}
return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).referenceBaseForIndel(refBaseForIndel).make();

View File

@ -315,6 +315,15 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Argument(fullName="fullyDecode", doc="If true, the incoming VariantContext will be fully decoded", required=false)
private boolean fullyDecode = false;
@Hidden
@Argument(fullName="forceGenotypesDecode", doc="If true, the incoming VariantContext will have its genotypes forcibly decoded by computing AC across all genotypes. For efficiency testing only", required=false)
private boolean forceGenotypesDecode = false;
@Hidden
@Argument(fullName="justRead", doc="If true, we won't actually write the output file. For efficiency testing only", required=false)
private boolean justRead = false;
/* Private class used to store the intermediate variants in the integer random selection process */
private class RandomVariantStructure {
private VariantContext vc;
@ -392,11 +401,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
Collection<String> XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles);
samples.removeAll(XLsamplesFromFile);
samples.removeAll(XLsampleNames);
NO_SAMPLES_SPECIFIED = NO_SAMPLES_SPECIFIED && XLsampleNames.isEmpty();
if ( samples.size() == 0 && !NO_SAMPLES_SPECIFIED )
throw new UserException("All samples requested to be included were also requested to be excluded.");
for ( String sample : samples )
if ( ! NO_SAMPLES_SPECIFIED )
for ( String sample : samples )
logger.info("Including sample '" + sample + "'");
// if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include
@ -494,7 +505,16 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
}
for (VariantContext vc : vcs) {
if ( fullyDecode ) vc = vc.fullyDecode(vcfRods.get(vc.getSource()));
// an option for performance testing only
if ( fullyDecode )
vc = vc.fullyDecode(vcfRods.get(vc.getSource()));
// an option for performance testing only
if ( forceGenotypesDecode ) {
final int x = vc.getCalledChrCount();
//logger.info("forceGenotypesDecode with getCalledChrCount() = " + );
}
if ( IDsToKeep != null && ! IDsToKeep.contains(vc.getID()) )
continue;
@ -538,7 +558,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if (!selectedTypes.contains(vc.getType()))
continue;
VariantContext sub = subsetRecord(vc, samples, EXCLUDE_NON_VARIANTS);
VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS);
if ( REGENOTYPE && sub.isPolymorphicInSamples() && hasPLs(sub) ) {
final VariantContextBuilder builder = new VariantContextBuilder(UG_engine.calculateGenotypes(tracker, ref, context, sub)).filters(sub.getFiltersMaybeNull());
@ -559,7 +579,8 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
randomlyAddVariant(++variantNumber, sub);
}
else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
vcfWriter.add(sub);
if ( ! justRead )
vcfWriter.add(sub);
}
}
}
@ -687,18 +708,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, AC, AF).
*
* @param vc the VariantContext record to subset
* @param samples the samples to extract
* @return the subsetted VariantContext
*/
private VariantContext subsetRecord(final VariantContext vc, final Set<String> samples, final boolean excludeNonVariants) {
if ( samples == null || samples.isEmpty() )
private VariantContext subsetRecord(final VariantContext vc, final boolean excludeNonVariants) {
if ( NO_SAMPLES_SPECIFIED || samples.isEmpty() )
return vc;
final VariantContext sub;
if ( excludeNonVariants )
sub = vc.subContextFromSamples(samples); // strip out the alternate alleles that aren't being used
else
sub = vc.subContextFromSamples(samples, vc.getAlleles());
final VariantContext sub = vc.subContextFromSamples(samples, excludeNonVariants); // strip out the alternate alleles that aren't being used
VariantContextBuilder builder = new VariantContextBuilder(sub);
GenotypesContext newGC = sub.getGenotypes();
@ -708,15 +725,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
newGC = VariantContextUtils.stripPLs(sub.getGenotypes());
//Remove a fraction of the genotypes if needed
if(fractionGenotypes>0){
if ( fractionGenotypes > 0 ){
ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
for ( Genotype genotype : newGC ) {
//Set genotype to no call if it falls in the fraction.
if(fractionGenotypes>0 && randomGenotypes.nextDouble()<fractionGenotypes){
ArrayList<Allele> alleles = new ArrayList<Allele>(2);
alleles.add(Allele.create((byte)'.'));
alleles.add(Allele.create((byte)'.'));
genotypes.add(new Genotype(genotype.getSampleName(),alleles, Genotype.NO_LOG10_PERROR,genotype.getFilters(),new HashMap<String, Object>(),false));
List<Allele> alleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
genotypes.add(new GenotypeBuilder(genotype).alleles(alleles).noGQ().make());
}
else{
genotypes.add(genotype);
@ -750,14 +765,12 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
for (String sample : originalVC.getSampleNames()) {
Genotype g = originalVC.getGenotype(sample);
if ( g.isNotFiltered() ) {
String dp = (String) g.getAttribute("DP");
if (dp != null && ! dp.equals(VCFConstants.MISSING_DEPTH_v3) && ! dp.equals(VCFConstants.MISSING_VALUE_v4) ) {
depth += Integer.valueOf(dp);
}
if ( ! g.isFiltered() ) {
if ( g.hasDP() )
depth += g.getDP();
}
}
builder.attribute("DP", depth);
}

View File

@ -288,8 +288,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private byte getStandardEncoding(Genotype g, int offset) {
byte b;
if ( g.hasAttribute(VCFConstants.GENOTYPE_QUALITY_KEY) && ((Integer) g.getAttribute(VCFConstants.GENOTYPE_QUALITY_KEY)) < minGenotypeQuality ) {
b = NO_CALL;
if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) {
b = NO_CALL;
} else if ( g.isHomRef() ) {
b = HOM_REF;
} else if ( g.isHomVar() ) {
@ -305,7 +305,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private byte getFlippedEncoding(Genotype g, int offset) {
byte b;
if ( g.hasAttribute(VCFConstants.GENOTYPE_QUALITY_KEY) && ((Integer) g.getAttribute(VCFConstants.GENOTYPE_QUALITY_KEY)) < minGenotypeQuality ) {
if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) {
b = NO_CALL;
} else if ( g.isHomRef() ) {
b = HOM_VAR;

View File

@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
@ -314,8 +315,12 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
if ( addGenotypeFields ) {
for ( final String sample : samples ) {
for ( final String gf : genotypeFields ) {
if ( vc.hasGenotype(sample) && vc.getGenotype(sample).hasAttribute(gf) )
addFieldValue(vc.getGenotype(sample).getAttribute(gf), records);
if ( vc.hasGenotype(sample) && vc.getGenotype(sample).hasAnyAttribute(gf) ) {
if ( gf.equals(VCFConstants.GENOTYPE_KEY) )
addFieldValue(vc.getGenotype(sample).getGenotypeString(true), records);
else
addFieldValue(vc.getGenotype(sample).getAnyAttribute(gf), records);
}
else
addFieldValue(MISSING_DATA, records);
}

View File

@ -132,7 +132,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
// set the appropriate sample name if necessary
if ( sampleName != null && vc.hasGenotypes() && vc.hasGenotype(variants.getName()) ) {
Genotype g = Genotype.modifyName(vc.getGenotype(variants.getName()), sampleName);
Genotype g = new GenotypeBuilder(vc.getGenotype(variants.getName())).name(sampleName).make();
builder.genotypes(g);
}

View File

@ -1,9 +1,5 @@
package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.ByteArrayOutputStream;
import java.io.ObjectOutputStream;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;
@ -16,10 +12,8 @@ import java.util.Map;
*/
public class BitSetUtils {
static final private int MAX_DNA_CONTEXT = 31; // the maximum context size (number of bases) permitted in the "long bitset" implementation of the DNA <=> BitSet conversion.
static final private byte NBITS_LONG_REPRESENTATION = 64; // the number of bits used in the long version to represent the bit set (necessary for the two's complement representation of negative numbers)
static final private byte NBITS_SHORT_REPRESENTATION = 16; // the number of bits used in the short version to represent the bit set (necessary for the two's complement representation of negative numbers)
static final long[] combinationsPerLength = new long[MAX_DNA_CONTEXT + 1]; // keeps the memoized table with the number of combinations for each given DNA context length
/**
* Creates an long out of a bitset
@ -112,173 +106,4 @@ public class BitSetUtils {
}
return bitSet;
}
/**
 * Converts a BitSet into its DNA string representation.
 *
 * Warning: limited to long precision, so the encoded dna sequence cannot be
 * longer than 31 bases. To increase this limit, use BigNumbers instead of
 * long and create a bitSetFrom(BigNumber) method.
 *
 * The sequence length is recovered first, by finding which block of
 * cumulative combination counts the base-10 value falls into. The remainder
 * is then decoded as a quasi-canonical base-4 number (A=0, C=1, G=2, T=3)
 * and padded with leading A's, which base-4 arithmetic alone would drop
 * (quasi-canonical because A is 0, so instead of 0,1,2,3,10,11,... we have
 * 0,1,2,3,00,01,... — decodable only because the final length is known).
 *
 * @param bitSet the bitset representation of the dna sequence
 * @return the dna sequence represented by the bitset
 */
public static String dnaFrom(final BitSet bitSet) {
    long value = longFrom(bitSet);                           // base-10 value of the bit set
    if (value < 0)
        throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?");

    final int length = contextLengthFor(value);              // combination counts are memoized, so splitting these calls costs nothing
    value -= combinationsFor(length - 1);                    // shift down to the quasi-canonical base-4 representation

    final char[] baseByIndex = {'A', 'C', 'G', 'T'};
    final StringBuilder sequence = new StringBuilder(length);
    for (long rest = value; rest > 0; rest /= 4)             // plain base-10 -> base-4 conversion, least significant base first
        sequence.append(baseByIndex[(int) (rest % 4)]);
    while (sequence.length() < length)                       // restore leading A's (encoded as 0, hence otherwise lost)
        sequence.append('A');

    return sequence.reverse().toString();                    // bases were appended in reverse order
}
/**
 * Creates a BitSet representation of a given dna string.
 *
 * Warning: this conversion is limited to long precision, therefore the dna
 * sequence cannot be longer than 31 bases. To increase this limit, use
 * BigNumbers instead of long and create a bitSetFrom(BigNumber) method.
 *
 * The encoding is a simple enumeration of all sequences, shortest first:
 *   0 A        4 AA       8 CA
 *   1 C        5 AC       ...
 *   2 G        6 AG       1343 TTGGT
 *   3 T        7 AT       1364 TTTTT
 *
 * To convert from dna to number, the dna string is converted to base 10 and
 * all combinations of shorter sequences are added in.
 *
 * @param dna the dna sequence
 * @return the bitset representing the dna sequence
 */
public static BitSet bitSetFrom(String dna) {
    // Delegates to the byte[] overload. Uses the platform default charset;
    // safe here because DNA bases (A/C/G/T) are plain ASCII characters.
    return bitSetFrom(dna.getBytes());
}
/**
 * Creates a BitSet representation of a given dna sequence.
 *
 * Warning: limited to long precision, so the sequence can be at most 31
 * bases long.
 *
 * The number representing a DNA string is its base-4 value (A=0, C=1, G=2,
 * T=3) plus the count of all combinations of every shorter sequence length,
 * so that e.g. "AA" does not collide with "A".
 *
 * @param dna the dna sequence (bases as ASCII bytes)
 * @return the bitset representing the dna sequence
 */
public static BitSet bitSetFrom(final byte[] dna) {
    if (dna.length > MAX_DNA_CONTEXT)
        // new String(dna) so the message shows the actual sequence rather than the byte[]'s identity hash (e.g. "[B@1f2e...")
        throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, new String(dna), dna.length));

    final long preContext = combinationsFor(dna.length - 1); // the sum of all combinations of shorter sequence lengths
    long baseTen = 0;                                        // the base-10 value of the base-4 encoded sequence
    for (final byte base : dna) {
        baseTen *= 4;
        baseTen += BaseUtils.simpleBaseToBaseIndex(base);
    }
    return bitSetFrom(baseTen + preContext);                 // base-10 value plus all shorter-length combinations
}
/**
 * Calculates the number of bits necessary to represent a given number of elements.
 *
 * @param numberOfElements the number of elements to represent (must be positive)
 * @return the number of bits necessary to represent this many elements
 */
public static int numberOfBitsToRepresent(long numberOfElements) {
    if (numberOfElements < 0)
        throw new ReviewedStingException("Number of elements must be positive: " + numberOfElements);
    if (numberOfElements == 1L)
        return 1;                                            // special case: one element still needs a bit

    // n elements only need enough bits to count up to n-1
    int bits = 0;
    for (long remaining = numberOfElements - 1; remaining > 0; remaining >>= 1)
        bits++;
    return bits;
}
/**
 * Calculates the length of the DNA context encoded by a given base-10 number.
 *
 * Knowing the length is what lets the decoder subtract the right number of
 * shorter-sequence combinations and disambiguate the "quasi-canonical"
 * base-4 encoding. As a by-product the combination counts get memoized, so
 * a subsequent call to combinationsFor(length) is O(1).
 *
 * @param number the base-10 representation of the bitset
 * @return the length of the DNA context represented by this number
 */
private static int contextLengthFor(long number) {
    // grow the candidate length until its cumulative combination count exceeds the number
    int length = 1;
    while (combinationsFor(length) <= number)
        length++;
    return length;
}
/**
 * The sum of all combinations of a context of a given length, from length 0 up to length.
 *
 * Memoized implementation of sum(4^i) for i in [0, length]; results are cached
 * in combinationsPerLength so repeated calls are O(1).
 *
 * @param length the length of the DNA context
 * @return the sum of all combinations leading up to this context length
 * @throws ReviewedStingException if length exceeds MAX_DNA_CONTEXT
 */
private static long combinationsFor(int length) {
    if (length > MAX_DNA_CONTEXT)
        throw new ReviewedStingException(String.format("Context cannot be longer than %d bases but requested %d.", MAX_DNA_CONTEXT, length));
    // lazily fill the memoization table; slot 0 is always 0 (no combinations for an empty context)
    if (length > 0 && combinationsPerLength[length] == 0) {
        long total = 0L;
        long term = 1L; // running power of 4
        for (int i = 1; i <= length; i++) {
            term <<= 2;   // term is now 4^i (shifting by 2 multiplies by 4)
            total += term;
        }
        combinationsPerLength[length] = total;
    }
    return combinationsPerLength[length];
}
/**
 * Serializes an object with Java serialization and returns the resulting bytes.
 *
 * NOTE: despite the name, this returns the serialized byte array itself, not a
 * size -- callers typically take .length of the result to measure object size.
 *
 * @param obj the object to serialize (must be Serializable)
 * @return the Java-serialized representation of obj
 * @throws java.io.IOException if serialization fails
 */
public static byte[] sizeOf(Object obj) throws java.io.IOException
{
    final ByteArrayOutputStream byteObject = new ByteArrayOutputStream();
    ObjectOutputStream objectOutputStream = null;
    try {
        objectOutputStream = new ObjectOutputStream(byteObject);
        objectOutputStream.writeObject(obj);
        objectOutputStream.flush();
    } finally {
        // bug fix: the original leaked the stream if writeObject threw; always close.
        // Closing the ObjectOutputStream also flushes/closes the underlying byte buffer.
        if (objectOutputStream != null)
            objectOutputStream.close();
    }
    return byteObject.toByteArray();
}
}

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeType;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
@ -30,7 +31,7 @@ public class MendelianViolation {
private boolean allCalledOnly = true;
//Stores occurrences of inheritance
private EnumMap<Genotype.Type, EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>> inheritance;
private EnumMap<GenotypeType, EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>> inheritance;
private int violations_total=0;
@ -74,119 +75,119 @@ public class MendelianViolation {
//Count of HomRef/HomRef/HomRef trios
public int getRefRefRef(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF);
}
//Count of HomVar/HomVar/HomVar trios
public int getVarVarVar(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR);
return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR);
}
//Count of HomRef/HomVar/Het trios
public int getRefVarHet(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET) +
inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HET) +
inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HET);
}
//Count of Het/Het/Het trios
public int getHetHetHet(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET);
return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET);
}
//Count of Het/Het/HomRef trios
public int getHetHetHomRef(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF);
return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_REF);
}
//Count of Het/Het/HomVar trios
public int getHetHetHomVar(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR);
return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_VAR);
}
//Count of ref alleles inherited from Het/Het parents (no violation)
public int getParentsHetHetInheritedRef(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET)
+ 2*inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF);
return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET)
+ 2*inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_REF);
//return parentsHetHet_childRef;
}
//Count of var alleles inherited from Het/Het parents (no violation)
public int getParentsHetHetInheritedVar(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET)
+ 2*inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR);
return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET)
+ 2*inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_VAR);
//return parentsHetHet_childVar;
}
//Count of ref alleles inherited from HomRef/Het parents (no violation)
public int getParentsRefHetInheritedRef(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HOM_REF)
+ inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF);
//return parentsHomRefHet_childRef;
}
//Count of var alleles inherited from HomRef/Het parents (no violation)
public int getParentsRefHetInheritedVar(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HET)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HET)
+ inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HET);
//return parentsHomRefHet_childVar;
}
//Count of ref alleles inherited from HomVar/Het parents (no violation)
public int getParentsVarHetInheritedRef(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HET)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET);
return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HET)
+ inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HET);
//return parentsHomVarHet_childRef;
}
//Count of var alleles inherited from HomVar/Het parents (no violation)
public int getParentsVarHetInheritedVar(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR);
return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HOM_VAR)
+ inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR);
//return parentsHomVarHet_childVar;
}
//Count of violations of the type HOM_REF/HOM_REF -> HOM_VAR
public int getParentsRefRefChildVar(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR);
}
//Count of violations of the type HOM_REF/HOM_REF -> HET
public int getParentsRefRefChildHet(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HET);
}
//Count of violations of the type HOM_REF/HET -> HOM_VAR
public int getParentsRefHetChildVar(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HOM_VAR)
+ inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR);
}
//Count of violations of the type HOM_REF/HOM_VAR -> HOM_VAR
public int getParentsRefVarChildVar(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR)
+ inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR)
+ inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR);
}
//Count of violations of the type HOM_REF/HOM_VAR -> HOM_REF
public int getParentsRefVarChildRef(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF)
+ inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF)
+ inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF);
}
//Count of violations of the type HOM_VAR/HET -> HOM_REF
public int getParentsVarHetChildRef(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF)
+ inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF);
return inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF)
+ inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HOM_REF);
}
//Count of violations of the type HOM_VAR/HOM_VAR -> HOM_REF
public int getParentsVarVarChildRef(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF);
return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF);
}
//Count of violations of the type HOM_VAR/HOM_VAR -> HET
public int getParentsVarVarChildHet(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET);
return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HET);
}
@ -362,12 +363,12 @@ public class MendelianViolation {
private void createInheritanceMap(){
inheritance = new EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>>(Genotype.Type.class);
for(Genotype.Type mType : Genotype.Type.values()){
inheritance.put(mType, new EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>(Genotype.Type.class));
for(Genotype.Type dType : Genotype.Type.values()){
inheritance.get(mType).put(dType, new EnumMap<Genotype.Type,Integer>(Genotype.Type.class));
for(Genotype.Type cType : Genotype.Type.values()){
inheritance = new EnumMap<GenotypeType,EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>>(GenotypeType.class);
for(GenotypeType mType : GenotypeType.values()){
inheritance.put(mType, new EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>(GenotypeType.class));
for(GenotypeType dType : GenotypeType.values()){
inheritance.get(mType).put(dType, new EnumMap<GenotypeType,Integer>(GenotypeType.class));
for(GenotypeType cType : GenotypeType.values()){
inheritance.get(mType).get(dType).put(cType, 0);
}
}
@ -376,9 +377,9 @@ public class MendelianViolation {
}
private void clearInheritanceMap(){
for(Genotype.Type mType : Genotype.Type.values()){
for(Genotype.Type dType : Genotype.Type.values()){
for(Genotype.Type cType : Genotype.Type.values()){
for(GenotypeType mType : GenotypeType.values()){
for(GenotypeType dType : GenotypeType.values()){
for(GenotypeType cType : GenotypeType.values()){
inheritance.get(mType).get(dType).put(cType, 0);
}
}

View File

@ -225,9 +225,9 @@ public class SequenceDictionaryUtils {
return false;
// todo -- reenable if we want to be really strict here
// if (me.getAttribute(SAMSequenceRecord.MD5_TAG) != null && that.getAttribute(SAMSequenceRecord.MD5_TAG) != null) {
// final BigInteger thisMd5 = new BigInteger((String)me.getAttribute(SAMSequenceRecord.MD5_TAG), 16);
// final BigInteger thatMd5 = new BigInteger((String)that.getAttribute(SAMSequenceRecord.MD5_TAG), 16);
// if (me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null && that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null) {
// final BigInteger thisMd5 = new BigInteger((String)me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16);
// final BigInteger thatMd5 = new BigInteger((String)that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16);
// if (!thisMd5.equals(thatMd5)) {
// return false;
// }

View File

@ -223,6 +223,20 @@ public class Utils {
return ret.toString();
}
public static String join(String separator, int[] ints) {
if ( ints == null || ints.length == 0)
return "";
else {
StringBuilder ret = new StringBuilder();
ret.append(ints[0]);
for (int i = 1; i < ints.length; ++i) {
ret.append(separator);
ret.append(ints[i]);
}
return ret.toString();
}
}
/**
* Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of
* elti objects (note there's no actual space between sep and the elti elements). Returns

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
@ -33,9 +35,7 @@ import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
@ -45,15 +45,45 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDependentFeatureCodec {
/**
* Decode BCF2 files
*/
public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDependentFeatureCodec {
final protected static Logger logger = Logger.getLogger(BCF2Codec.class);
private VCFHeader header = null;
/**
* Maps offsets (encoded in BCF) into contig names (from header) for the CHROM field
*/
private final ArrayList<String> contigNames = new ArrayList<String>();
/**
* Maps header string names (encoded in VCF) into strings found in the BCF header
*
* Initialized when processing the header
*/
private ArrayList<String> dictionary;
/**
* Our decoder that reads low-level objects from the BCF2 records
*/
private final BCF2Decoder decoder = new BCF2Decoder();
private boolean skipGenotypes = false;
/**
* Provides some sanity checking on the header
*/
private final static int MAX_HEADER_SIZE = 0x08000000;
/**
* Genotype field decoders that are initialized when the header is read
*/
private BCF2GenotypeFieldDecoders gtFieldDecoders = null;
// for error handling
private int recordNo = 0;
private int pos = 0;
// ----------------------------------------------------------------------
//
// Feature codec interface functions
@ -62,28 +92,30 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
@Override
public Feature decodeLoc( final PositionalBufferedStream inputStream ) {
return decode(inputStream);
// TODO: a less expensive version of decodeLoc() that doesn't use VariantContext
// TODO: very easy -- just decodeSitesBlock, and then skip to end of end of sites block
// TODO: and then skip genotypes block
recordNo++;
final VariantContextBuilder builder = new VariantContextBuilder();
final int sitesBlockSize = decoder.readBlockSize(inputStream);
final int genotypeBlockSize = decoder.readBlockSize(inputStream); // necessary because it's in the stream
decoder.readNextBlock(sitesBlockSize, inputStream);
decodeSiteLoc(builder);
return builder.fullyDecoded(true).make();
}
@Override
public VariantContext decode( final PositionalBufferedStream inputStream ) {
recordNo++;
final VariantContextBuilder builder = new VariantContextBuilder();
final int sitesBlockSize = decoder.readBlockSize(inputStream);
final int genotypeBlockSize = decoder.readBlockSize(inputStream);
decoder.readNextBlock(sitesBlockSize, inputStream);
final SitesInfoForDecoding info = decodeSitesBlock(builder);
if ( isSkippingGenotypes() ) {
decoder.skipNextBlock(genotypeBlockSize, inputStream);
} else {
decoder.readNextBlock(genotypeBlockSize, inputStream);
decodeGenotypes(info, builder);
}
decodeSiteLoc(builder);
final SitesInfoForDecoding info = decodeSitesExtendedInfo(builder);
decoder.readNextBlock(genotypeBlockSize, inputStream);
createLazyGenotypesDecoder(info, builder);
return builder.fullyDecoded(true).make();
}
@ -97,16 +129,16 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
try {
// note that this reads the magic as well, and so does double duty
if ( ! BCF2Utils.startsWithBCF2Magic(inputStream) )
throw new UserException.MalformedBCF2("Input stream does not begin with BCF2 magic");
error("Input stream does not begin with BCF2 magic");
final int headerSizeInBytes = BCF2Utils.readInt(BCF2Type.INT32.getSizeInBytes(), inputStream);
if ( headerSizeInBytes <= 0 || headerSizeInBytes > MAX_HEADER_SIZE) // no bigger than 8 MB
throw new UserException.MalformedBCF2("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE);
error("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE);
final byte[] headerBytes = new byte[headerSizeInBytes];
if ( inputStream.read(headerBytes) != headerSizeInBytes )
throw new UserException.MalformedBCF2("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes);
error("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes);
final PositionalBufferedStream bps = new PositionalBufferedStream(new ByteArrayInputStream(headerBytes));
final AsciiLineReader headerReader = new AsciiLineReader(bps);
@ -118,12 +150,24 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
}
// create the config offsets
for ( final VCFContigHeaderLine contig : header.getContigLines())
contigNames.add(contig.getID());
if ( ! header.getContigLines().isEmpty() ) {
logger.info("Found contig lines in BCF2 file, using those");
contigNames.clear();
for ( final VCFContigHeaderLine contig : header.getContigLines()) {
if ( contig.getID() == null || contig.getID().equals("") )
error("found a contig with an invalid ID " + contig);
contigNames.add(contig.getID());
}
} else {
logger.info("Didn't find any contig lines in BCF2 file, falling back (dangerously) to GATK reference dictionary");
}
// create the string dictionary
dictionary = parseDictionary(header);
// prepare the genotype field decoders
gtFieldDecoders = new BCF2GenotypeFieldDecoders(header);
// position right before next line (would be right before first real record byte at end of header)
return new FeatureCodecHeader(header, inputStream.getPosition());
}
@ -153,7 +197,6 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
//
// --------------------------------------------------------------------------------
@Override
public void setGenomeLocParser(final GenomeLocParser genomeLocParser) {
// initialize contigNames to standard ones in reference
@ -161,14 +204,6 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
contigNames.add(contig.getSequenceName());
}
public boolean isSkippingGenotypes() {
return skipGenotypes;
}
public void setSkipGenotypes(final boolean skipGenotypes) {
this.skipGenotypes = skipGenotypes;
}
// --------------------------------------------------------------------------------
//
// implicit block
@ -182,50 +217,83 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
//
// --------------------------------------------------------------------------------
private final SitesInfoForDecoding decodeSitesBlock(final VariantContextBuilder builder) {
final int contigOffset = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes());
/**
* Decode the sites level data from this classes decoder
*
* @param builder
* @return
*/
@Requires({"builder != null"})
private final void decodeSiteLoc(final VariantContextBuilder builder) {
final int contigOffset = decoder.decodeInt(BCF2Type.INT32);
final String contig = lookupContigName(contigOffset);
builder.chr(contig);
final int pos = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes());
final int refLength = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes());
this.pos = decoder.decodeInt(BCF2Type.INT32);
final int refLength = decoder.decodeInt(BCF2Type.INT32);
builder.start((long)pos);
builder.stop((long)(pos + refLength - 1)); // minus one because of our open intervals
}
/**
* Decode the sites level data from this classes decoder
*
* @param builder
* @return
*/
@Requires({"builder != null", "decoder != null"})
@Ensures({"result != null", "result.isValid()"})
private final SitesInfoForDecoding decodeSitesExtendedInfo(final VariantContextBuilder builder) {
final Object qual = decoder.decodeSingleValue(BCF2Type.FLOAT);
if ( qual != null ) {
builder.log10PError(((Double)qual) / -10.0);
}
final int nAlleleInfo = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes());
final int nFormatSamples = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes());
final int nAlleleInfo = decoder.decodeInt(BCF2Type.INT32);
final int nFormatSamples = decoder.decodeInt(BCF2Type.INT32);
final int nAlleles = nAlleleInfo >> 16;
final int nInfo = nAlleleInfo & 0x00FF;
final int nFormatFields = nFormatSamples >> 24;
final int nSamples = nFormatSamples & 0x0FFF;
final int nInfo = nAlleleInfo & 0x0000FFFF;
final int nFormatFields = nFormatSamples >> 24;
final int nSamples = nFormatSamples & 0x00FFFFF;
decodeID(builder);
final ArrayList<Allele> alleles = decodeAlleles(builder, pos, nAlleles);
decodeFilter(builder);
decodeInfo(builder, nInfo);
return new SitesInfoForDecoding(pos, nFormatFields, nSamples, alleles);
final SitesInfoForDecoding info = new SitesInfoForDecoding(nFormatFields, nSamples, alleles);
if ( ! info.isValid() )
error("Sites info is malformed: " + info);
return info;
}
private final static class SitesInfoForDecoding {
final int pos;
protected final static class SitesInfoForDecoding {
final int nFormatFields;
final int nSamples;
final ArrayList<Allele> alleles;
private SitesInfoForDecoding(final int pos, final int nFormatFields, final int nSamples, final ArrayList<Allele> alleles) {
this.pos = pos;
private SitesInfoForDecoding(final int nFormatFields, final int nSamples, final ArrayList<Allele> alleles) {
this.nFormatFields = nFormatFields;
this.nSamples = nSamples;
this.alleles = alleles;
}
public boolean isValid() {
return nFormatFields >= 0 &&
nSamples >= 0 &&
alleles != null && ! alleles.isEmpty() && alleles.get(0).isReference();
}
@Override
public String toString() {
return String.format("nFormatFields = %d, nSamples = %d, alleles = %s", nFormatFields, nSamples, alleles);
}
}
/**
* Decode the id field in this BCF2 file and store it in the builder
* @param builder
*/
private void decodeID( final VariantContextBuilder builder ) {
final String id = (String)decoder.decodeTypedValue();
@ -235,6 +303,15 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
builder.id(id);
}
/**
* Annoying routine that deals with allele clipping from the BCF2 encoding to the standard
* GATK encoding.
*
* @param position
* @param ref
* @param unclippedAlleles
* @return
*/
protected static ArrayList<Allele> clipAllelesIfNecessary(int position, String ref, ArrayList<Allele> unclippedAlleles) {
if ( ! AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) {
ArrayList<Allele> clippedAlleles = new ArrayList<Allele>(unclippedAlleles.size());
@ -244,6 +321,14 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
return unclippedAlleles;
}
/**
* Decode the alleles from this BCF2 file and put the results in builder
* @param builder
* @param pos
* @param nAlleles
* @return the alleles
*/
@Requires("nAlleles > 0")
private ArrayList<Allele> decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) {
// TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes
ArrayList<Allele> alleles = new ArrayList<Allele>(nAlleles);
@ -259,15 +344,21 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
alleles.add(Allele.create(allele, false));
}
}
assert ref != null;
alleles = clipAllelesIfNecessary(pos, ref, alleles);
builder.alleles(alleles);
assert ref.length() > 0;
builder.referenceBaseForIndel(ref.getBytes()[0]);
return alleles;
}
/**
* Decode the filter field of this BCF2 file and store the result in the builder
* @param builder
*/
private void decodeFilter( final VariantContextBuilder builder ) {
final Object value = decoder.decodeTypedValue();
@ -275,17 +366,28 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
builder.unfiltered();
else {
if ( value instanceof Integer )
// fast path for single integer result
builder.filter(getDictionaryString((Integer)value));
else {
for ( int offset : (List<Integer>)value )
for ( final int offset : (List<Integer>)value )
builder.filter(getDictionaryString(offset));
}
}
}
/**
* Loop over the info field key / value pairs in this BCF2 file and decode them into the builder
*
* @param builder
* @param numInfoFields
*/
@Requires("numInfoFields >= 0")
private void decodeInfo( final VariantContextBuilder builder, final int numInfoFields ) {
final Map<String, Object> infoFieldEntries = new HashMap<String, Object>(numInfoFields);
if ( numInfoFields == 0 )
// fast path, don't bother doing any work if there are no fields
return;
final Map<String, Object> infoFieldEntries = new HashMap<String, Object>(numInfoFields);
for ( int i = 0; i < numInfoFields; i++ ) {
final String key = getDictionaryString();
Object value = decoder.decodeTypedValue();
@ -297,143 +399,98 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
builder.attributes(infoFieldEntries);
}
private void decodeGenotypes( final SitesInfoForDecoding siteInfo, final VariantContextBuilder builder ) {
final List<String> samples = new ArrayList<String>(header.getGenotypeSamples());
final int nSamples = siteInfo.nSamples;
final int nFields = siteInfo.nFormatFields;
// --------------------------------------------------------------------------------
//
// Decoding Genotypes
//
// --------------------------------------------------------------------------------
if ( samples.size() != nSamples )
throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " +
"different numbers of samples per record. Saw " + samples.size() +
" samples in header but have a record with " + nSamples + " samples");
/**
* Create the lazy loader for the genotypes data, and store it in the builder
* so that the VC will be able to decode on demand the genotypes data
*
* @param siteInfo
* @param builder
*/
private void createLazyGenotypesDecoder( final SitesInfoForDecoding siteInfo,
final VariantContextBuilder builder ) {
if (siteInfo.nSamples > 0) {
final LazyGenotypesContext.LazyParser lazyParser =
new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields);
final int nGenotypes = header.getGenotypeSamples().size();
LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser,
new LazyData(siteInfo.nFormatFields, decoder.getRecordBytes()),
nGenotypes);
final Map<String, List<Object>> fieldValues = decodeGenotypeFieldValues(nFields, nSamples);
final List<Genotype> genotypes = new ArrayList<Genotype>(nSamples);
for ( int i = 0; i < nSamples; i++ ) {
// all of the information we need for each genotype, with default values
final String sampleName = samples.get(i);
List<Allele> alleles = null;
boolean isPhased = false;
double log10PError = VariantContext.NO_LOG10_PERROR;
Set<String> filters = null;
Map<String, Object> attributes = null;
double[] log10Likelihoods = null;
// did we resort the sample names? If so, we need to load the genotype data
if ( !header.samplesWereAlreadySorted() )
lazy.decode();
for ( final Map.Entry<String, List<Object>> entry : fieldValues.entrySet() ) {
final String field = entry.getKey();
Object value = entry.getValue().get(i);
try {
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
alleles = decodeGenotypeAlleles(siteInfo.alleles, (List<Integer>)value);
} else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
if ( value != BCF2Type.INT8.getMissingJavaValue() )
log10PError = ((Integer)value) / -10.0;
} else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) {
final List<Integer> pls = (List<Integer>)value;
if ( pls != null ) { // we have a PL field
log10Likelihoods = new double[pls.size()];
for ( int j = 0; j < log10Likelihoods.length; j++ ) {
final double d = pls.get(j);
log10Likelihoods[j] = d == -0.0 ? 0.0 : d / -10.0;
}
}
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2");
//filters = new HashSet<String>(values.get(i));
} else { // add to attributes
if ( value != null ) { // don't add missing values
if ( attributes == null ) attributes = new HashMap<String, Object>(nFields);
if ( value instanceof List && ((List)value).size() == 1)
value = ((List)value).get(0);
attributes.put(field, value);
}
}
} catch ( ClassCastException e ) {
throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field
+ " inconsistent with the value observed in the decoded value in the "
+ " BCF file. Value was " + value);
}
}
if ( alleles == null ) throw new UserException.MalformedBCF2("BUG: no alleles found");
final Genotype g = new Genotype(sampleName, alleles, log10PError, filters, attributes, isPhased, log10Likelihoods);
genotypes.add(g);
}
builder.genotypes(genotypes);
}
private final List<Allele> decodeGenotypeAlleles(final ArrayList<Allele> siteAlleles, final List<Integer> encoded) {
if ( encoded == null )
// no called sample GT = .
return Collections.emptyList();
else {
// we have at least some alleles to decode
final List<Allele> gt = new ArrayList<Allele>(encoded.size());
for ( final Integer encode : encoded ) {
if ( encode == null ) // absent, as are all following by definition
return gt;
else {
final int offset = encode >> 1;
if ( offset == 0 )
gt.add(Allele.NO_CALL);
else
gt.add(siteAlleles.get(offset - 1));
}
}
return gt;
builder.genotypesNoValidation(lazy);
}
}
private final Map<String, List<Object>> decodeGenotypeFieldValues(final int nFields, final int nSamples) {
assert (nFields > 0 && nSamples > 0) || (nFields == 0 && nSamples == 0);
public static class LazyData {
final public int nGenotypeFields;
final public byte[] bytes;
if ( nFields == 0 ) // fast path exit for sites only file
return Collections.emptyMap();
else {
final Map<String, List<Object>> map = new LinkedHashMap<String, List<Object>>(nFields);
for ( int i = 0; i < nFields; i++ ) {
final String field = getDictionaryString();
final byte typeDescriptor = decoder.readTypeDescriptor();
final List<Object> values = new ArrayList<Object>(nSamples);
for ( int j = 0; j < nSamples; j++ )
values.add(decoder.decodeTypedValue(typeDescriptor));
map.put(field, values);
}
return map;
@Requires({"nGenotypeFields > 0", "bytes != null"})
public LazyData(final int nGenotypeFields, final byte[] bytes) {
this.nGenotypeFields = nGenotypeFields;
this.bytes = bytes;
}
}
@Ensures("result != null")
private final String getDictionaryString() {
return getDictionaryString((Integer) decoder.decodeTypedValue());
}
private final String getDictionaryString(final int offset) {
if ( offset >= dictionary.size() ) throw new UserException.MalformedBCF2("BUG: no dictionary field found at offset " + offset);
final String field = dictionary.get(offset);
return field;
@Requires("offset < dictionary.size()")
@Ensures("result != null")
protected final String getDictionaryString(final int offset) {
return dictionary.get(offset);
}
/**
 * Translate the config offset as encoded in the BCF file into the actual string
 * name of the contig from the dictionary
 *
 * @param contigOffset 0-based index into the contig names read from the BCF2 header
 * @return the contig name at that offset, never null
 * @throws UserException.MalformedBCF2 if the offset is out of range
 */
@Requires({"contigOffset >= 0", "contigOffset < contigNames.size()"})
@Ensures("result != null")
private final String lookupContigName( final int contigOffset ) {
    if ( contigOffset < contigNames.size() ) {
        return contigNames.get(contigOffset);
    }
    else {
        throw new UserException.MalformedBCF2(String.format("No contig at index %d present in the sequence dictionary from the BCF2 header (%s)", contigOffset, contigNames));
    }
    // NOTE(review): unreachable duplicate of the return above (both branches
    // of the if/else exit) -- appears to be a merge/diff artifact; remove.
    return contigNames.get(contigOffset);
}
/**
 * Build the BCF2 string dictionary (FILTER/INFO/FORMAT identifiers) from the
 * VCF header.
 *
 * @param header the header to extract the dictionary from
 * @return a non-null, non-empty list of dictionary strings
 */
@Requires("header != null")
@Ensures({"result != null", "! result.isEmpty()"})
private final ArrayList<String> parseDictionary(final VCFHeader header) {
    final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
    // if we got here we never found a dictionary, or there are no elements in the dictionary
    if ( dict.size() == 0 )
        throw new UserException.MalformedBCF2("Dictionary header element was absent or empty");
    // NOTE(review): this second check is redundant with the one above and can
    // never fire -- looks like both sides of a merge were kept; keep only one.
    if ( dict.isEmpty() )
        error("Dictionary header element was absent or empty");
    return dict;
}
/**
 * Accessor for the header parsed out of this BCF2 stream.
 *
 * @return the VCFHeader we found in this BCF2 file
 */
protected VCFHeader getHeader() {
    return header;
}
/**
 * Look up the genotype field decoder bound to a FORMAT field name,
 * delegating to the per-codec decoder registry.
 *
 * @param field the FORMAT field name (e.g. GT, PL)
 * @return a non-null decoder (the generic one if no specialization exists)
 */
@Requires("field != null")
@Ensures("result != null")
protected BCF2GenotypeFieldDecoders.Decoder getGenotypeFieldDecoder(final String field) {
    return gtFieldDecoders.getDecoder(field);
}
/**
 * Throw a MalformedBCF2 exception describing a failure while decoding the
 * current record, including the record number and position for context.
 *
 * @param message description of what went wrong at this record
 * @throws UserException.MalformedBCF2 always
 */
private final void error(final String message) throws RuntimeException {
    // Bug fix: the format string previously had only two conversions
    // ("%d ... %d:") for three arguments, so Java's Formatter silently
    // dropped the message from the exception text. Add the missing %s.
    throw new UserException.MalformedBCF2(String.format("At record %d with position %d: %s", recordNo, pos, message));
}
}

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broad.tribble.FeatureCodec;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -33,12 +35,13 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
public class BCF2Decoder {
public final class BCF2Decoder {
final protected static Logger logger = Logger.getLogger(FeatureCodec.class);
byte[] recordBytes;
ByteArrayInputStream recordStream;
byte[] recordBytes = null;
ByteArrayInputStream recordStream = null;
public BCF2Decoder() {
// nothing to do
@ -66,6 +69,7 @@ public class BCF2Decoder {
* @return
*/
public void readNextBlock(final int blockSizeInBytes, final InputStream stream) {
if ( blockSizeInBytes < 0 ) throw new UserException.MalformedBCF2("Invalid block size " + blockSizeInBytes);
setRecordBytes(readRecordBytes(blockSizeInBytes, stream));
}
@ -112,9 +116,9 @@ public class BCF2Decoder {
*
* @param recordBytes
*/
@Requires("recordBytes != null")
@Ensures({"this.recordBytes == recordBytes", "recordStream != null"})
public void setRecordBytes(final byte[] recordBytes) {
assert recordBytes != null;
this.recordBytes = recordBytes;
this.recordStream = new ByteArrayInputStream(recordBytes);
}
@ -131,7 +135,7 @@ public class BCF2Decoder {
}
public final Object decodeTypedValue(final byte typeDescriptor) {
final int size = BCF2Utils.sizeIsOverflow(typeDescriptor) ? decodeVectorSize() : BCF2Utils.decodeSize(typeDescriptor);
final int size = decodeNumberOfElements(typeDescriptor);
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
assert size >= 0;
@ -155,7 +159,7 @@ public class BCF2Decoder {
public final Object decodeSingleValue(final BCF2Type type) {
// TODO -- decodeTypedValue should integrate this routine
final int value = BCF2Utils.readInt(type.getSizeInBytes(), recordStream);
final int value = decodeInt(type);
if ( value == type.getMissingBytes() )
return null;
@ -184,26 +188,107 @@ public class BCF2Decoder {
final byte[] bytes = new byte[size]; // TODO -- in principle should just grab bytes from underlying array
try {
recordStream.read(bytes);
final String s = new String(bytes);
return BCF2Utils.isCollapsedString(s) ? BCF2Utils.exploreStringList(s) : s;
int goodLength = 0;
for ( ; goodLength < bytes.length ; goodLength++ )
if ( bytes[goodLength] == 0 ) break;
if ( goodLength == 0 )
return null;
else {
final String s = new String(bytes, 0, goodLength);
return BCF2Utils.isCollapsedString(s) ? BCF2Utils.exploreStringList(s) : s;
}
} catch ( IOException e ) {
throw new ReviewedStingException("readByte failure", e);
}
}
private final int decodeVectorSize() {
final byte typeDescriptor = readTypeDescriptor();
final int size = BCF2Utils.decodeSize(typeDescriptor);
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
assert size == 1;
assert type == BCF2Type.INT8 || type == BCF2Type.INT16 || type == BCF2Type.INT32;
return decodeInt(type.getSizeInBytes());
/**
 * Determine how many elements are encoded for a typed value.
 *
 * Small counts live inline in the descriptor's size nibble; when that
 * nibble holds the overflow marker, the true count follows in the stream
 * as an explicit typed integer.
 *
 * @param typeDescriptor the typed-value descriptor byte
 * @return the non-negative number of encoded elements
 */
@Ensures("result >= 0")
public final int decodeNumberOfElements(final byte typeDescriptor) {
    if ( ! BCF2Utils.sizeIsOverflow(typeDescriptor) )
        // the size is inline, so just decode it
        return BCF2Utils.decodeSize(typeDescriptor);
    // -1 ensures we explode immediately with a bad size if the result is missing
    return decodeInt(readTypeDescriptor(), -1);
}
public final int decodeInt(int bytesForEachInt) {
return BCF2Utils.readInt(bytesForEachInt, recordStream);
/**
 * Decode a single int from the stream, mapping the BCF2 MISSING sentinel of
 * its type onto the caller-supplied missingValue.  The typeDescriptor must
 * describe an inline single-element value.
 *
 * @param typeDescriptor descriptor of the encoded value
 * @param missingValue what to return when the encoded value is MISSING
 * @return the decoded int, or missingValue
 */
@Requires("BCF2Utils.decodeSize(typeDescriptor) == 1")
public final int decodeInt(final byte typeDescriptor, final int missingValue) {
    final BCF2Type valueType = BCF2Utils.decodeType(typeDescriptor);
    final int raw = decodeInt(valueType);
    if ( raw == valueType.getMissingBytes() )
        return missingValue;
    return raw;
}
// Read type.getSizeInBytes() bytes from the record stream and return them as
// an int.  No MISSING handling here -- callers must compare against
// type.getMissingBytes() themselves.
@Requires("type != null")
public final int decodeInt(final BCF2Type type) {
    return BCF2Utils.readInt(type.getSizeInBytes(), recordStream);
}
/**
 * Low-level reader for int[]
 *
 * Requires a typeDescriptor so the function knows how many elements to read,
 * and how they are encoded.
 *
 * If size == 0 => result is null
 * If size > 0 => result depends on the actual values in the stream
 * -- If the first element read is MISSING, result is null (all values are missing)
 * -- Else result = int[N] where N is the first N non-missing values decoded
 *
 * Note that even when the result is null or truncated, ALL size elements are
 * consumed from the stream so the decoder stays aligned for the next field.
 *
 * @param maybeDest if not null we'll not allocate space for the vector, but instead use
 *                  the externally allocated array of ints to store values.  If the
 *                  size of this vector is < the actual size of the elements, we'll be
 *                  forced to use freshly allocated arrays.  Also note that padded
 *                  int elements are still forced to do a fresh allocation as well.
 * @return see description
 */
@Requires({"BCF2Type.INTEGERS.contains(type)", "size >= 0", "type != null"})
public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) {
    if ( size == 0 ) {
        return null;
    } else {
        if ( maybeDest != null && maybeDest.length < size )
            maybeDest = null; // by nulling this out we ensure that we do fresh allocations as maybeDest is too small
        final int val1 = decodeInt(type);
        if ( val1 == type.getMissingBytes() ) {
            // fast path for first element being missing: drain the remaining
            // (by definition also missing) elements and report "all missing"
            for ( int i = 1; i < size; i++ ) decodeInt(type);
            return null;
        } else {
            // we know we will have at least 1 element, so making the int[] is worth it
            final int[] ints = maybeDest == null ? new int[size] : maybeDest;
            ints[0] = val1; // we already read the first one
            for ( int i = 1; i < size; i++ ) {
                ints[i] = decodeInt(type);
                if ( ints[i] == type.getMissingBytes() ) {
                    // read the rest of the missing values, dropping them
                    for ( int j = i + 1; j < size; j++ ) decodeInt(type);
                    // deal with auto-pruning by returning an int[] containing
                    // only the non-MISSING values. We do this by copying the first
                    // i elements, as i itself is missing
                    return Arrays.copyOf(ints, i);
                }
            }
            return ints; // all of the elements were non-MISSING
        }
    }
}
/**
 * Convenience overload: recover the element count and element type from the
 * descriptor byte, then decode with a freshly allocated destination array.
 *
 * @param typeDescriptor the typed-value descriptor for the vector
 * @return the decoded ints, or null (see the three-arg overload's contract)
 */
public final int[] decodeIntArray(final byte typeDescriptor) {
    // the count may itself consume stream bytes, so decode it first
    final int nElements = decodeNumberOfElements(typeDescriptor);
    final BCF2Type elementType = BCF2Utils.decodeType(typeDescriptor);
    return decodeIntArray(nElements, elementType, null);
}
public final double rawFloatToFloat(final int rawFloat) {

View File

@ -0,0 +1,282 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import java.util.*;
/**
 * An efficient scheme for building and obtaining specialized
 * genotype field decoders. Used by the BCFCodec to parse
 * with little overhead the fields from BCF2 encoded genotype
 * records
 *
 * @author Mark DePristo
 * @since 6/12
 */
public class BCF2GenotypeFieldDecoders {
    final protected static Logger logger = Logger.getLogger(BCF2GenotypeFieldDecoders.class);

    // master switch for the specialized biallelic/diploid GT decode below
    private final static boolean ENABLE_FASTPATH_GT = true;
    private final static int MIN_SAMPLES_FOR_FASTPATH_GENOTYPES = 0; // TODO -- update to reasonable number

    // initialized once per writer to allow parallel writers to work
    private final HashMap<String, Decoder> genotypeFieldDecoder = new HashMap<String, Decoder>();
    // fallback for any FORMAT field without a specialized decoder registered
    private final Decoder defaultDecoder = new GenericDecoder();

    /**
     * Bind the specialized decoders for the standard FORMAT fields
     * (GT, FT, DP, AD, PL, GQ); all other fields fall back to the generic
     * typed-value decoder.
     *
     * @param header the VCF header (currently unused -- see TODO below)
     */
    public BCF2GenotypeFieldDecoders(final VCFHeader header) {
        // TODO -- fill in appropriate decoders for each FORMAT field in the header
        genotypeFieldDecoder.put(VCFConstants.GENOTYPE_KEY, new GTDecoder());
        // currently the generic decoder handles FILTER values properly, in so far as we don't tolerate multiple filter field values per genotype
        genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new FTDecoder());
        genotypeFieldDecoder.put(VCFConstants.DEPTH_KEY, new DPDecoder());
        genotypeFieldDecoder.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new ADDecoder());
        genotypeFieldDecoder.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, new PLDecoder());
        genotypeFieldDecoder.put(VCFConstants.GENOTYPE_QUALITY_KEY, new GQDecoder());
    }

    // -----------------------------------------------------------------
    //
    // Genotype field decoder
    //
    // -----------------------------------------------------------------

    /**
     * Return decoder appropriate for field, or the generic decoder if no
     * specialized one is bound
     * @param field the GT field to decode
     * @return a non-null decoder
     */
    @Requires("field != null")
    @Ensures("result != null")
    public Decoder getDecoder(final String field) {
        final Decoder d = genotypeFieldDecoder.get(field);
        return d == null ? defaultDecoder : d;
    }

    /**
     * Decoder a field (implicit from creation) encoded as
     * typeDescriptor in the decoder object in the GenotypeBuilders
     * one for each sample in order.
     *
     * The way this works is that this decode method
     * iterates over the builders, decoding a genotype field
     * in BCF2 for each sample from decoder.
     *
     * This system allows us to easily use specialized
     * decoders for specific genotype field values. For example,
     * we use a special decoder to directly read the BCF2 data for
     * the PL field into a int[] rather than the generic List of Integer
     */
    public interface Decoder {
        @Requires({"siteAlleles != null", "! siteAlleles.isEmpty()",
                "field != null", "decoder != null", "gbs != null", "! gbs.isEmpty()"})
        public void decode(final List<Allele> siteAlleles,
                           final String field,
                           final BCF2Decoder decoder,
                           final byte typeDescriptor,
                           final List<GenotypeBuilder> gbs);
    }

    // Decoder for the GT (genotype call) field.
    private class GTDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            // we have to do a bit of low-level processing here as we want to know the size upfront
            final int ploidy = decoder.decodeNumberOfElements(typeDescriptor);
            if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && ploidy == 2 && gbs.size() >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES )
                fastBiallelicDiploidDecode(siteAlleles, decoder, typeDescriptor, gbs);
            else {
                generalDecode(siteAlleles, ploidy, decoder, typeDescriptor, gbs);
            }
        }

        /**
         * fast path for many samples with diploid genotypes
         *
         * The way this would work is simple. Create a List<Allele> diploidGenotypes[] object
         * After decoding the offset, if that sample is diploid compute the
         * offset into the alleles vector which is simply offset = allele0 * nAlleles + allele1
         * if there's a value at diploidGenotypes[offset], use it, otherwise create the genotype
         * cache it and use that
         *
         * Some notes. If there are nAlleles at the site, there are implicitly actually
         * n + 1 options including
         */
        @Requires("siteAlleles.size() == 2")
        @SuppressWarnings({"unchecked"})
        private final void fastBiallelicDiploidDecode(final List<Allele> siteAlleles,
                                                      final BCF2Decoder decoder,
                                                      final byte typeDescriptor,
                                                      final List<GenotypeBuilder> gbs) {
            final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);

            // 3 allele codes per slot (NO_CALL, ref, alt) -> 9 possible diploid pairs
            final int nPossibleGenotypes = 3 * 3;
            // cache of shared allele lists, one per possible diploid genotype
            final Object allGenotypes[] = new Object[nPossibleGenotypes];
            for ( final GenotypeBuilder gb : gbs ) {
                final int a1 = decoder.decodeInt(type);
                final int a2 = decoder.decodeInt(type);
                if ( a1 == type.getMissingBytes() ) {
                    assert a2 == type.getMissingBytes();
                    // no called sample GT = .
                    gb.alleles(null);
                } else if ( a2 == type.getMissingBytes() ) {
                    // haploid-style call: only the first allele is present
                    gb.alleles(Arrays.asList(getAlleleFromEncoded(siteAlleles, a1)));
                } else {
                    // downshift to remove phase
                    final int offset = (a1 >> 1) * 3 + (a2 >> 1);
                    assert offset < allGenotypes.length;
                    // TODO -- how can I get rid of this cast?
                    List<Allele> gt = (List<Allele>)allGenotypes[offset];
                    if ( gt == null ) {
                        // first time we see this genotype -- build and cache it
                        final Allele allele1 = getAlleleFromEncoded(siteAlleles, a1);
                        final Allele allele2 = getAlleleFromEncoded(siteAlleles, a2);
                        gt = Arrays.asList(allele1, allele2);
                        allGenotypes[offset] = gt;
                    }
                    gb.alleles(gt);
                }
            }
        }

        // General-purpose GT decode: handles any ploidy and allele count by
        // reading each sample's allele vector through decodeIntArray.
        private final void generalDecode(final List<Allele> siteAlleles,
                                         final int ploidy,
                                         final BCF2Decoder decoder,
                                         final byte typeDescriptor,
                                         final List<GenotypeBuilder> gbs) {
            final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);

            // a single cache for the encoded genotypes, since we don't actually need this vector
            final int[] tmp = new int[ploidy];
            for ( final GenotypeBuilder gb : gbs ) {
                final int[] encoded = decoder.decodeIntArray(ploidy, type, tmp);
                if ( encoded == null )
                    // no called sample GT = .
                    gb.alleles(null);
                else {
                    assert encoded.length > 0;
                    // we have at least some alleles to decode
                    final List<Allele> gt = new ArrayList<Allele>(encoded.length);
                    // note that the auto-pruning of fields magically handles different
                    // ploidy per sample at a site
                    for ( final int encode : encoded )
                        gt.add(getAlleleFromEncoded(siteAlleles, encode));
                    gb.alleles(gt);
                }
            }
        }

        // Strip the phase bit from an encoded allele and translate the
        // remaining 1-based offset into a site Allele (0 -> NO_CALL).
        @Requires({"siteAlleles != null && ! siteAlleles.isEmpty()", "encode >= 0"})
        @Ensures("result != null")
        private final Allele getAlleleFromEncoded(final List<Allele> siteAlleles, final int encode) {
            final int offset = encode >> 1;
            return offset == 0 ? Allele.NO_CALL : siteAlleles.get(offset - 1);
        }
    }

    // Decoder for the DP (read depth) field.
    private class DPDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                // the -1 is for missing
                gb.DP(decoder.decodeInt(typeDescriptor, -1));
            }
        }
    }

    // Decoder for the GQ (genotype quality) field.
    private class GQDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                // the -1 is for missing
                gb.GQ(decoder.decodeInt(typeDescriptor, -1));
            }
        }
    }

    // Decoder for the AD (allele depths) field -- reads directly into int[].
    private class ADDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                gb.AD(decoder.decodeIntArray(typeDescriptor));
            }
        }
    }

    // Decoder for the PL (phred-scaled likelihoods) field -- reads directly into int[].
    private class PLDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                gb.PL(decoder.decodeIntArray(typeDescriptor));
            }
        }
    }

    // Fallback decoder: stores the typed value as a generic genotype attribute.
    private class GenericDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                Object value = decoder.decodeTypedValue(typeDescriptor);
                if ( value != null ) { // don't add missing values
                    if ( value instanceof List && ((List)value).size() == 1) {
                        // todo -- I really hate this, and it suggests that the code isn't completely right
                        // the reason it's here is that it's possible to prune down a vector to a singleton
                        // value and there we have the contract that the value comes back as an atomic value
                        // not a vector of size 1
                        value = ((List)value).get(0);
                    }
                    gb.attribute(field, value);
                }
            }
        }
    }

    // Decoder for the FT (genotype filter) field.
    private class FTDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                Object value = decoder.decodeTypedValue(typeDescriptor);
                if ( value != null ) { // don't add missing values
                    // NOTE(review): unchecked cast -- assumes a non-String value
                    // decodes as List<String>; confirm against the encoder's contract
                    gb.filters(value instanceof String ? Collections.singletonList((String)value) : (List<String>)value);
                }
            }
        }
    }
}

View File

@ -0,0 +1,103 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
/**
 * Lazy version of genotypes decoder for BCF2 genotypes
 *
 * Holds just enough per-record information (site alleles plus sample and
 * field counts) to decode a record's raw genotype bytes on first access.
 *
 * @author Mark DePristo
 * @since 5/12
 */
class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
    final protected static Logger logger = Logger.getLogger(BCF2LazyGenotypesDecoder.class);

    // the essential information for us to use to decode the genotypes data
    // initialized when this lazy decoder is created, as we know all of this from the BCF2Codec
    // and its stored here again for code cleanliness
    private final BCF2Codec codec;
    private final ArrayList<Allele> siteAlleles;
    private final int nSamples;
    private final int nFields;

    BCF2LazyGenotypesDecoder(final BCF2Codec codec, final ArrayList<Allele> alleles, final int nSamples, final int nFields) {
        this.codec = codec;
        this.siteAlleles = alleles;
        this.nSamples = nSamples;
        this.nFields = nFields;
    }

    @Override
    public LazyGenotypesContext.LazyData parse(final Object data) {
        if ( logger.isDebugEnabled() )
            logger.debug("Decoding BCF genotypes for " + nSamples + " samples with " + nFields + " fields each");

        // point a fresh low-level decoder at this record's raw genotype bytes
        final BCF2Decoder blockDecoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes);

        // TODO -- fast path for sites only

        // decode everyone, verifying the record agrees with the header's sample count
        final List<String> sampleNames = new ArrayList<String>(codec.getHeader().getGenotypeSamples());
        if ( sampleNames.size() != nSamples )
            throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " +
                    "different numbers of samples per record. Saw " + sampleNames.size() +
                    " samples in header but have a record with " + nSamples + " samples");

        // one genotype builder per sample, in header order
        final ArrayList<GenotypeBuilder> builders = new ArrayList<GenotypeBuilder>(nSamples);
        for ( final String sampleName : sampleNames )
            builders.add(new GenotypeBuilder(sampleName));

        // decode field-by-field; each field decoder fills in every builder in turn
        for ( int fieldIndex = 0; fieldIndex < nFields; fieldIndex++ ) {
            // the field name arrives as an offset into the shared dictionary
            final int dictionaryOffset = (Integer) blockDecoder.decodeTypedValue();
            final String fieldName = codec.getDictionaryString(dictionaryOffset);

            // the type of each element for this field
            final byte typeDescriptor = blockDecoder.readTypeDescriptor();

            final BCF2GenotypeFieldDecoders.Decoder fieldDecoder = codec.getGenotypeFieldDecoder(fieldName);
            try {
                fieldDecoder.decode(siteAlleles, fieldName, blockDecoder, typeDescriptor, builders);
            } catch ( ClassCastException e ) {
                throw new UserException.MalformedBCF2("BUG: expected encoding of field " + fieldName
                        + " inconsistent with the value observed in the decoded value");
            }
        }

        // materialize the genotypes from the completed builders
        final ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nSamples);
        for ( final GenotypeBuilder gb : builders )
            genotypes.add(gb.make());

        return new LazyGenotypesContext.LazyData(genotypes, codec.getHeader().getSampleNamesInOrder(), codec.getHeader().getSampleNameToOffset());
    }
}

View File

@ -1,143 +0,0 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import org.broad.tribble.FeatureCodecHeader;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.broadinstitute.sting.utils.variantcontext.writer.Options;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
import java.io.*;
import java.util.*;
/**
 * Testing BCF2: writes the input variants to a BCF2 file and, when the
 * traversal finishes, reads the file back with the BCF2Codec, optionally
 * printing each decoded record next to the original for comparison.
 *
 * @author Mark DePristo
 * @since 2012
 */
public class BCF2TestWalker extends RodWalker<Integer, Integer> {
    /**
     * Variants from this VCF file are used by this tool as input.
     * The file must at least contain the standard VCF header lines, but
     * can be empty (i.e., no variants are contained in the file).
     */
    @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
    public RodBinding<VariantContext> variants;

    // when true, each written VariantContext is also kept in memory so the
    // read-back loop can print the expected record next to the decoded one
    @Argument(doc="keep variants", required=false)
    public boolean keepVariants = false;

    // suppress per-record printing during the read-back pass
    @Argument(doc="quiet", required=false)
    public boolean quiet = false;

    // disable on-the-fly index creation for the output BCF
    @Argument(doc="dontIndexOnTheFly", required=false)
    public boolean dontIndexOnTheFly = false;

    @Output(doc="File to which results should be written",required=true)
    protected File bcfFile;

    // in-memory copies of the written variants (populated only when keepVariants)
    private final List<VariantContext> vcs = new ArrayList<VariantContext>();
    protected VariantContextWriter writer;

    @Override
    public void initialize() {
        final Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), Collections.singletonList(variants));
        final VCFHeader header = VCFUtils.withUpdatedContigs(vcfRods.values().iterator().next(), getToolkit());
        try {
            EnumSet<Options> options = EnumSet.of(Options.FORCE_BCF);
            if ( !dontIndexOnTheFly ) options.add(Options.INDEX_ON_THE_FLY);
            writer = VariantContextWriterFactory.create(bcfFile, new FileOutputStream(bcfFile), getToolkit().getMasterSequenceDictionary(), options);
            writer.writeHeader(header);
        } catch ( FileNotFoundException e ) {
            throw new UserException.CouldNotCreateOutputFile(bcfFile, e);
        }
    }

    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( tracker == null ) // RodWalkers can make funky map calls
            return 0;
        for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) {
            writer.add(vc);
            if ( keepVariants ) vcs.add(vc);
        }
        return 1;
    }

    //
    // default reduce -- doesn't do anything at all
    //
    public Integer reduceInit() { return 0; }
    public Integer reduce(Integer counter, Integer sum) { return counter + sum; }

    public void onTraversalDone(Integer sum) {
        try {
            writer.close();
            logger.info("Closed writer");

            // read in the BCF records
            BCF2Codec codec = new BCF2Codec();
            PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(bcfFile));
            FeatureCodecHeader header = codec.readHeader(pbs);
            pbs.close();

            pbs = new PositionalBufferedStream(new FileInputStream(bcfFile));
            try {
                pbs.skip(header.getHeaderEnd());
                Iterator<VariantContext> it = vcs.iterator();
                while ( ! pbs.isDone() ) {
                    if ( keepVariants ) {
                        VariantContext expected = it.next();
                        if ( ! quiet )
                            System.out.printf("vcf = %s %d %s%n", expected.getChr(), expected.getStart(), expected);
                    }
                    VariantContext bcfRaw = codec.decode(pbs);
                    VariantContext bcf = new VariantContextBuilder(bcfRaw).source("variant").make();
                    if ( ! quiet ) {
                        System.out.printf("bcf = %s %d %s%n", bcf.getChr(), bcf.getStart(), bcf.toString());
                        System.out.printf("--------------------------------------------------%n");
                    }
                }
            } finally {
                // bug fix: the read-back stream was previously never closed
                pbs.close();
            }
        } catch ( IOException e ) {
            // bug fix: preserve the underlying cause instead of throwing with
            // the uninformative literal message "bad user!"
            throw new UserException.CouldNotCreateOutputFile(bcfFile, e);
        }
    }
}

View File

@ -24,18 +24,22 @@
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Requires;
import java.util.EnumSet;
/**
* BCF2 types and information
* BCF2 types and associated information
*
* @author depristo
* @since 05/12
*/
public enum BCF2Type {
INT8(1, 1, BCF2Utils.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range
INT16(2, 2, BCF2Utils.INT16_MISSING_VALUE, -32767, 32767),
INT32(3, 4, BCF2Utils.INT32_MISSING_VALUE, -2147483647, 2147483647),
FLOAT(5, 4, BCF2Utils.FLOAT_MISSING_VALUE),
CHAR(7);
INT8 (1, 1, 0xFFFFFF80, -127, 127), // todo -- confirm range
INT16(2, 2, 0xFFFF8000, -32767, 32767),
INT32(3, 4, 0x80000000, -2147483647, 2147483647),
FLOAT(5, 4, 0x7F800001),
CHAR (7, 1, 0x00000000);
private final int id;
private final Object missingJavaValue;
@ -60,11 +64,53 @@ public enum BCF2Type {
this.maxValue = maxValue;
}
/**
* How many bytes are used to represent this type on disk?
* @return
*/
public int getSizeInBytes() {
return sizeInBytes;
}
/**
* The ID according to the BCF2 specification
* @return
*/
public int getID() { return id; }
/**
* Can we encode value v in this type, according to its declared range.
*
* Only makes sense for integer values
*
* @param v
* @return
*/
@Requires("INTEGERS.contains(this)")
public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; }
/**
* Return the java object (aka null) that is used to represent a missing value for this
* type in Java
*
* @return
*/
public Object getMissingJavaValue() { return missingJavaValue; }
/**
* The bytes (encoded as an int) that are used to represent a missing value
* for this type in BCF2
*
* @return
*/
public int getMissingBytes() { return missingBytes; }
/**
* An enum set of the types that might represent Integer values
*/
public final static EnumSet<BCF2Type> INTEGERS = EnumSet.of(INT8, INT16, INT32);
// Convenience membership test: true for INT8/INT16/INT32.
public boolean isIntegerType() {
    return INTEGERS.contains(this);
}
}

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
@ -33,9 +35,8 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.io.OutputStream;
import java.util.*;
/**
* Common utilities for working with BCF2 files
@ -45,7 +46,7 @@ import java.util.List;
* @author depristo
* @since 5/12
*/
public class BCF2Utils {
public final class BCF2Utils {
public static final byte[] MAGIC_HEADER_LINE = "BCF\2".getBytes();
public static final int MAX_ALLELES_IN_GENOTYPES = 127;
@ -53,12 +54,6 @@ public class BCF2Utils {
public static final int OVERFLOW_ELEMENT_MARKER = 15;
public static final int MAX_INLINE_ELEMENTS = 14;
// Note that these values are prefixed by FFFFFF for convenience
public static final int INT8_MISSING_VALUE = 0xFFFFFF80;
public static final int INT16_MISSING_VALUE = 0xFFFF8000;
public static final int INT32_MISSING_VALUE = 0x80000000;
public static final int FLOAT_MISSING_VALUE = 0x7F800001;
public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[]{BCF2Type.INT8, BCF2Type.INT16, BCF2Type.INT32};
public final static BCF2Type[] ID_TO_ENUM;
@ -77,11 +72,17 @@ public class BCF2Utils {
* The dictionary is an ordered list of common VCF identifers (FILTER, INFO, and FORMAT)
* fields.
*
* Note that its critical that the list be dedupped and sorted in a consistent manner each time,
* as the BCF2 offsets are encoded relative to this dictionary, and if it isn't determined exactly
* the same way as in the header each time it's very bad
*
* @param header the VCFHeader from which to build the dictionary
* @return a non-null dictionary of elements, may be empty
*/
@Requires("header != null")
@Ensures({"result != null", "new HashSet(result).size() == result.size()"})
public final static ArrayList<String> makeDictionary(final VCFHeader header) {
final ArrayList<String> dict = new ArrayList<String>();
final Set<String> dict = new TreeSet<String>();
// set up the strings dictionary
dict.add(VCFConstants.PASSES_FILTERS_v4); // special case the special PASS field
@ -92,23 +93,27 @@ public class BCF2Utils {
}
}
return dict;
return new ArrayList<String>(dict);
}
@Requires({"nElements >= 0", "type != null"})
public final static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) {
    // Pack a descriptor byte: high nibble holds the element count (capped at
    // OVERFLOW_ELEMENT_MARKER, in which case the true count is written
    // separately), low nibble holds the BCF2 type ID.
    final int sizeNibble = Math.min(nElements, OVERFLOW_ELEMENT_MARKER);
    return (byte)(((sizeNibble & 0x0F) << 4) | (type.getID() & 0x0F));
}
@Ensures("result >= 0")
public final static int decodeSize(final byte typeDescriptor) {
    // The element count lives in the high nibble of the descriptor byte.
    return (typeDescriptor >> 4) & 0x0F;
}
@Ensures("result >= 0")
public final static int decodeTypeID(final byte typeDescriptor) {
    // The BCF2 type ID occupies the low nibble of the descriptor byte.
    return 0x0F & typeDescriptor;
}
@Ensures("result != null")
public final static BCF2Type decodeType(final byte typeDescriptor) {
    // Translate the low-nibble type ID straight through the lookup table.
    final int typeID = decodeTypeID(typeDescriptor);
    return ID_TO_ENUM[typeID];
}
@ -117,6 +122,7 @@ public class BCF2Utils {
return decodeSize(typeDescriptor) == OVERFLOW_ELEMENT_MARKER;
}
@Requires("nElements >= 0")
public final static boolean willOverflow(final long nElements) {
    // An inline descriptor nibble can express at most MAX_INLINE_ELEMENTS;
    // any larger count must use the overflow encoding.
    final boolean needsOverflowEncoding = nElements > MAX_INLINE_ELEMENTS;
    return needsOverflowEncoding;
}
@ -128,6 +134,7 @@ public class BCF2Utils {
}
public final static byte readByte(final InputStream stream) {
// TODO -- shouldn't be capturing error here
try {
return (byte)(stream.read() & 0xFF);
} catch ( IOException e ) {
@ -135,6 +142,7 @@ public class BCF2Utils {
}
}
@Requires({"stream != null", "bytesForEachInt > 0"})
public final static int readInt(int bytesForEachInt, final InputStream stream) {
switch ( bytesForEachInt ) {
case 1: {
@ -161,10 +169,10 @@ public class BCF2Utils {
* @param strings size > 1 list of strings
* @return
*/
@Requires({"strings != null", "strings.size() > 1"})
@Ensures("result != null")
public static final String collapseStringList(final List<String> strings) {
assert strings.size() > 1;
StringBuilder b = new StringBuilder();
final StringBuilder b = new StringBuilder();
for ( final String s : strings ) {
assert s.indexOf(",") == -1; // no commas in individual strings
b.append(",").append(s);
@ -181,12 +189,15 @@ public class BCF2Utils {
* @param collapsed
* @return
*/
/**
 * Inverse of collapseStringList: splits a leading-comma collapsed string
 * (",A,B,C") back into its component strings [A, B, C].
 *
 * NOTE(review): String.split(",") drops trailing empty strings, so a
 * collapsed value ending in "," would lose its final empty component --
 * confirm empty components cannot occur in collapsed lists.
 */
@Requires({"collapsed != null", "isCollapsedString(collapsed)"})
@Ensures("result != null")
public static final List<String> exploreStringList(final String collapsed) {
assert isCollapsedString(collapsed);
// skip the leading comma, then split on the remaining separators
final String[] exploded = collapsed.substring(1).split(",");
return Arrays.asList(exploded);
}
/**
 * Is s a collapsed string list, i.e. does it carry the leading-comma
 * marker written by collapseStringList?
 *
 * @param s a non-null string to inspect
 * @return true iff s begins with ','
 */
@Requires("s != null")
public static final boolean isCollapsedString(final String s) {
    // Guard the length: the previous charAt(0) threw
    // StringIndexOutOfBoundsException on the empty string.
    return s.length() > 0 && s.charAt(0) == ',';
}
@ -200,6 +211,8 @@ public class BCF2Utils {
* @param vcfFile
* @return
*/
@Requires("vcfFile != null")
@Ensures("result != null")
public static final File shadowBCF(final File vcfFile) {
final String path = vcfFile.getAbsolutePath();
if ( path.contains(".vcf") )
@ -207,4 +220,109 @@ public class BCF2Utils {
else
return new File( path + ".bcf" );
}
@Ensures("BCF2Type.INTEGERS.contains(result)")
public final static BCF2Type determineIntegerType(final int value) {
    // Try the integer encodings from narrowest to widest and take the first
    // whose range can hold the value.
    for ( int i = 0; i < INTEGER_TYPES_BY_SIZE.length; i++ ) {
        final BCF2Type candidate = INTEGER_TYPES_BY_SIZE[i];
        if ( candidate.withinRange(value) )
            return candidate;
    }
    throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value);
}
@Ensures("BCF2Type.INTEGERS.contains(result)")
public final static BCF2Type determineIntegerType(final int[] values) {
    // Widen from INT8 as values demand; stop early once we reach INT32
    // since no integer encoding is wider.
    BCF2Type widest = BCF2Type.INT8;
    for ( final int value : values ) {
        widest = maxIntegerType(widest, determineIntegerType(value));
        if ( widest == BCF2Type.INT32 )
            return widest;
    }
    return widest;
}
/**
 * Returns the wider of the two BCF2 integer types t1 and t2.
 *
 * For example, if t1 == INT8 and t2 == INT16 returns INT16.
 *
 * @param t1 a BCF2 integer type
 * @param t2 a BCF2 integer type
 * @return the integer type capable of holding values of both t1 and t2
 */
@Requires({"BCF2Type.INTEGERS.contains(t1)","BCF2Type.INTEGERS.contains(t2)"})
@Ensures("BCF2Type.INTEGERS.contains(result)")
public final static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) {
    switch ( t1 ) {
        case INT32: return t1;                                          // nothing is wider than INT32
        case INT16: return t2 == BCF2Type.INT32 ? BCF2Type.INT32 : t1;  // only INT32 beats INT16
        case INT8:  return t2;                                          // t2 is at least as wide as INT8
        default:    throw new ReviewedStingException("BUG: unexpected BCF2Type " + t1);
    }
}
@Ensures("BCF2Type.INTEGERS.contains(result)")
public final static BCF2Type determineIntegerType(final List<Integer> values) {
    // Merge the per-value requirements via maxIntegerType, bailing out as
    // soon as INT32 is required since nothing can be wider.
    BCF2Type widest = BCF2Type.INT8;
    for ( final int value : values ) {
        widest = maxIntegerType(widest, determineIntegerType(value));
        if ( widest == BCF2Type.INT32 )
            return widest;
    }
    return widest;
}
/**
 * Helper function that views an arbitrary object as a List:
 *
 *   null      => empty list
 *   a List    => the list itself
 *   any other => a singleton list containing o
 *
 * @param o the object to wrap, may be null
 * @return a non-null List view of o
 */
public final static List<Object> toList(final Object o) {
    if ( o == null )
        return Collections.emptyList();
    if ( o instanceof List )
        return (List<Object>)o;
    return Collections.singletonList(o);
}
/**
 * Write the low-order type.getSizeInBytes() bytes of value to encodeStream
 * in big-endian order.
 *
 * The switch is deliberately unrolled per size (rather than a generic
 * byte loop) as this sits on the encoding hot path.
 *
 * @param value the integer whose bytes should be emitted
 * @param type the BCF2 type whose size determines how many bytes to write
 * @param encodeStream the destination stream
 * @throws IOException if the underlying stream write fails
 */
public final static void encodeRawBytes(final int value, final BCF2Type type, final OutputStream encodeStream) throws IOException {
    switch ( type.getSizeInBytes() ) {
        case 1:
            encodeStream.write(value & 0xFF);
            break;
        case 2:
            encodeStream.write((value >> 8) & 0xFF);
            encodeStream.write(value & 0xFF);
            break;
        case 4:
            encodeStream.write((value >> 24) & 0xFF);
            encodeStream.write((value >> 16) & 0xFF);
            encodeStream.write((value >> 8) & 0xFF);
            encodeStream.write(value & 0xFF);
            break;
        default:
            throw new ReviewedStingException("BUG: unexpected type size " + type);
    }
}
}

View File

@ -28,6 +28,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
// we have to store the list of strings that make up the header until they're needed
protected VCFHeader header = null;
protected VCFHeaderVersion version = null;
// a mapping of the allele
protected Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
@ -48,7 +49,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
protected final String[] locParts = new String[6];
// for performance we cache the hashmap of filter encodings for quick lookup
protected HashMap<String,LinkedHashSet<String>> filterHash = new HashMap<String,LinkedHashSet<String>>();
protected HashMap<String,List<String>> filterHash = new HashMap<String,List<String>>();
// we store a name to give to each of the variant contexts we emit
protected String name = "Unknown";
@ -91,24 +92,12 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
*/
public abstract Object readHeader(LineReader reader);
/**
* create a genotype map
*
* @param str the string
* @param alleles the list of alleles
* @param chr chrom
* @param pos position
* @return a mapping of sample name to genotype object
*/
public abstract LazyGenotypesContext.LazyData createGenotypeMap(String str, List<Allele> alleles, String chr, int pos);
/**
* parse the filter string, first checking to see if we already have parsed it in a previous attempt
* @param filterString the string to parse
* @return a set of the filters applied
*/
protected abstract Set<String> parseFilters(String filterString);
protected abstract List<String> parseFilters(String filterString);
/**
* create a VCF header from a set of header record lines
@ -117,6 +106,8 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
* @return a VCFHeader object
*/
protected VCFHeader parseHeaderFromLines( final List<String> headerStrings, final VCFHeaderVersion version ) {
this.version = version;
Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
Set<String> sampleNames = new LinkedHashSet<String>();
int contigCounter = 0;
@ -320,7 +311,9 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
String ref = getCachedString(parts[3].toUpperCase());
String alts = getCachedString(parts[4].toUpperCase());
builder.log10PError(parseQual(parts[5]));
builder.filters(parseFilters(getCachedString(parts[6])));
final List<String> filters = parseFilters(getCachedString(parts[6]));
if ( filters != null ) builder.filters(new HashSet<String>(filters));
final Map<String, Object> attrs = parseInfo(parts[7]);
builder.attributes(attrs);
@ -719,4 +712,115 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
try { stream.close(); } catch ( IOException e ) {}
}
}
/**
 * create a genotype map
 *
 * Parses the FORMAT column plus all per-sample genotype columns of a VCF
 * record into lazily-materialized genotype data.
 *
 * @param str the tab-separated genotype portion of the line, beginning with the FORMAT column
 * @param alleles the list of alleles parsed from this record's REF/ALT columns
 * @param chr the record's contig, used only to annotate error messages
 * @param pos the record's position, used only to annotate error messages
 * @return a mapping of sample name to genotype object
 */
public LazyGenotypesContext.LazyData createGenotypeMap(final String str,
final List<Allele> alleles,
final String chr,
final int pos) {
// lazily size the scratch array from the header's column count
if (genotypeParts == null)
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
if ( nParts != genotypeParts.length )
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo);
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nParts);
// get the format keys
int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
// cycle through the sample names
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
// clear out our allele mapping
alleleMap.clear();
// cycle through the genotype strings
for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) {
int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
final String sampleName = sampleNameIterator.next();
final GenotypeBuilder gb = new GenotypeBuilder(sampleName);
// check to see if the value list is longer than the key list, which is a problem
if (nGTKeys < GTValueSplitSize)
generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]);
int genotypeAlleleLocation = -1;
if (nGTKeys >= 1) {
gb.maxAttributes(nGTKeys - 1);
// walk the FORMAT keys in parallel with this sample's values
for (int i = 0; i < nGTKeys; i++) {
final String gtKey = genotypeKeyArray[i];
// keys past the end of this sample's value list count as trailing missing values
boolean missing = i >= GTValueSplitSize;
// todo -- all of these on the fly parsing of the missing value should be static constants
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
genotypeAlleleLocation = i;
} else if ( missing ) {
// if its truly missing (there no provided value) skip adding it to the attributes
} else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) {
final List<String> filters = parseFilters(getCachedString(GTValueArray[i]));
if ( filters != null ) gb.filters(filters);
} else if ( GTValueArray[i].equals(VCFConstants.MISSING_VALUE_v4) ) {
// don't add missing values to the map
} else {
// dispatch the well-known FORMAT keys to typed builder slots;
// anything else is kept as a generic string attribute
if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
if ( GTValueArray[i].equals(VCFConstants.MISSING_GENOTYPE_QUALITY_v3) )
gb.noGQ();
else
gb.GQ((int)Math.round(Double.valueOf(GTValueArray[i])));
} else if (gtKey.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) {
gb.AD(decodeInts(GTValueArray[i]));
} else if (gtKey.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY)) {
gb.PL(decodeInts(GTValueArray[i]));
} else if (gtKey.equals(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
// convert log10 GL values into the equivalent integer PLs
gb.PL(GenotypeLikelihoods.fromGLField(GTValueArray[i]).getAsPLs());
} else if (gtKey.equals(VCFConstants.DEPTH_KEY)) {
gb.DP(Integer.valueOf(GTValueArray[i]));
} else {
gb.attribute(gtKey, GTValueArray[i]);
}
}
}
}
// check to make sure we found a genotype field if our version is less than 4.1 file
if ( version != VCFHeaderVersion.VCF4_1 && genotypeAlleleLocation == -1 )
generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
final List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? new ArrayList<Allele>(0) : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap));
gb.alleles(GTalleles);
gb.phased(genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1);
// add it to the list
try {
genotypes.add(gb.make());
} catch (TribbleException e) {
throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
}
}
return new LazyGenotypesContext.LazyData(genotypes, header.getSampleNamesInOrder(), header.getSampleNameToOffset());
}
// Shared scratch buffer for comma-splitting, avoiding a per-call allocation.
// NOTE(review): this static buffer makes decodeInts non-thread-safe, and
// fields with more than 10000 values exceed its capacity -- confirm codecs
// are only used single-threaded and that this bound is safe.
private final static String[] INT_DECODE_ARRAY = new String[10000];
// Parse a comma-separated integer field (e.g. AD or PL values) into an int[].
private final static int[] decodeInts(final String string) {
final int nValues = ParsingUtils.split(string, INT_DECODE_ARRAY, ',');
final int[] values = new int[nValues];
for ( int i = 0; i < nValues; i++ )
values[i] = Integer.valueOf(INT_DECODE_ARRAY[i]);
return values;
}
}

View File

@ -1,3 +1,27 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.vcf;
import org.broad.tribble.TribbleException;
@ -78,24 +102,24 @@ public class VCF3Codec extends AbstractVCFCodec {
* @param filterString the string to parse
* @return a set of the filters applied
*/
protected Set<String> parseFilters(String filterString) {
protected List<String> parseFilters(String filterString) {
// null for unfiltered
if ( filterString.equals(VCFConstants.UNFILTERED) )
return null;
// empty set for passes filters
LinkedHashSet<String> fFields = new LinkedHashSet<String>();
List<String> fFields = new ArrayList<String>();
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) )
return fFields;
return new ArrayList<String>(fFields);
if ( filterString.length() == 0 )
generateException("The VCF specification requires a valid filter status");
// do we have the filter string cached?
if ( filterHash.containsKey(filterString) )
return filterHash.get(filterString);
return new ArrayList<String>(filterHash.get(filterString));
// otherwise we have to parse and cache the value
if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
@ -108,93 +132,6 @@ public class VCF3Codec extends AbstractVCFCodec {
return fFields;
}
/**
 * create a genotype map
 *
 * Parses the FORMAT column and per-sample genotype columns of a VCF3
 * record into lazily-materialized genotype data.
 *
 * @param str the tab-separated genotype fields, beginning with the FORMAT column
 * @param alleles the list of alleles parsed from this record's REF/ALT columns
 * @param chr chrom, used only to annotate error messages
 * @param pos position, used only to annotate error messages
 * @return a mapping of sample name to genotype object
 */
public LazyGenotypesContext.LazyData createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
// lazily size the scratch array from the header's column count
if (genotypeParts == null)
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
if ( nParts != genotypeParts.length )
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo);
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nParts);
// get the format keys
int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
// cycle through the sample names
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
// clear out our allele mapping
alleleMap.clear();
// cycle through the genotype strings
for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) {
int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
// per-sample accumulators for quality, filters, and generic attributes
double GTQual = VariantContext.NO_LOG10_PERROR;
Set<String> genotypeFilters = null;
Map<String, Object> gtAttributes = null;
String sampleName = sampleNameIterator.next();
// check to see if the value list is longer than the key list, which is a problem
if (nGTKeys < GTValueSplitSize)
generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]);
int genotypeAlleleLocation = -1;
if (nGTKeys >= 1) {
gtAttributes = new HashMap<String, Object>(nGTKeys - 1);
// walk the FORMAT keys in parallel with this sample's values
for (int i = 0; i < nGTKeys; i++) {
// NOTE(review): new String(...) looks like a defensive copy to detach the
// key from a larger shared backing array -- confirm it is still needed
final String gtKey = new String(genotypeKeyArray[i]);
boolean missing = i >= GTValueSplitSize;
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
genotypeAlleleLocation = i;
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
} else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) {
genotypeFilters = missing ? parseFilters(VCFConstants.MISSING_VALUE_v4) : parseFilters(getCachedString(GTValueArray[i]));
} else if ( missing || GTValueArray[i].equals(VCFConstants.MISSING_GENOTYPE_QUALITY_v3) ) {
// normalize trailing/VCF3-style missing markers to the v4 missing value
gtAttributes.put(gtKey, VCFConstants.MISSING_VALUE_v4);
} else {
gtAttributes.put(gtKey, new String(GTValueArray[i]));
}
}
}
// check to make sure we found a genotype field
if ( genotypeAlleleLocation < 0 )
generateException("Unable to find the GT field for the record; the GT field is required");
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes");
boolean phased = GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
// add it to the list
try {
genotypes.add(new Genotype(sampleName,
parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap),
GTQual,
genotypeFilters,
gtAttributes,
phased));
} catch (TribbleException e) {
throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
}
}
return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset);
}
@Override
public boolean canDecode(final String potentialInput) {
return canDecodeFile(potentialInput, VCF3_MAGIC_HEADER);

View File

@ -48,7 +48,6 @@ import java.util.*;
public class VCFCodec extends AbstractVCFCodec {
// Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4";
private VCFHeaderVersion version = null;
/**
* A VCF header the contains master info/filter/format records that we use to 'fill in'
@ -127,121 +126,33 @@ public class VCFCodec extends AbstractVCFCodec {
* @param filterString the string to parse
* @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF)
*/
protected Set<String> parseFilters(String filterString) {
return parseFilters(filterHash, lineNo, filterString);
}
public static Set<String> parseFilters(final Map<String, LinkedHashSet<String>> cache, final int lineNo, final String filterString) {
protected List<String> parseFilters(String filterString) {
// null for unfiltered
if ( filterString.equals(VCFConstants.UNFILTERED) )
return null;
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) )
return Collections.emptySet();
return Collections.emptyList();
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) )
generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo);
if ( filterString.length() == 0 )
generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo);
// do we have the filter string cached?
if ( cache != null && cache.containsKey(filterString) )
return Collections.unmodifiableSet(cache.get(filterString));
if ( filterHash.containsKey(filterString) )
return filterHash.get(filterString);
// empty set for passes filters
LinkedHashSet<String> fFields = new LinkedHashSet<String>();
List<String> fFields = new LinkedList<String>();
// otherwise we have to parse and cache the value
if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
fFields.add(filterString);
else
fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR)));
fFields = fFields;
if ( cache != null ) cache.put(filterString, fFields);
filterHash.put(filterString, Collections.unmodifiableList(fFields));
return Collections.unmodifiableSet(fFields);
}
/**
* create a genotype map
*
* @param str the string
* @param alleles the list of alleles
* @return a mapping of sample name to genotype object
*/
public LazyGenotypesContext.LazyData createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
if (genotypeParts == null)
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
if ( nParts != genotypeParts.length )
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo);
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nParts);
// get the format keys
int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
// cycle through the sample names
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
// clear out our allele mapping
alleleMap.clear();
// cycle through the genotype strings
for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) {
int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
double GTQual = VariantContext.NO_LOG10_PERROR;
Set<String> genotypeFilters = null;
Map<String, Object> gtAttributes = null;
String sampleName = sampleNameIterator.next();
// check to see if the value list is longer than the key list, which is a problem
if (nGTKeys < GTValueSplitSize)
generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]);
int genotypeAlleleLocation = -1;
if (nGTKeys >= 1) {
gtAttributes = new HashMap<String, Object>(nGTKeys - 1);
for (int i = 0; i < nGTKeys; i++) {
final String gtKey = new String(genotypeKeyArray[i]);
boolean missing = i >= GTValueSplitSize;
// todo -- all of these on the fly parsing of the missing value should be static constants
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
genotypeAlleleLocation = i;
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
} else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) {
genotypeFilters = missing ? parseFilters(VCFConstants.MISSING_VALUE_v4) : parseFilters(getCachedString(GTValueArray[i]));
} else if ( missing ) {
// if its truly missing (there no provided value) skip adding it to the attributes
} else {
gtAttributes.put(gtKey, GTValueArray[i]);
}
}
}
// check to make sure we found a genotype field if we are a VCF4.0 file
if ( version == VCFHeaderVersion.VCF4_0 && genotypeAlleleLocation == -1 )
generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? new ArrayList<Allele>(0) : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap));
boolean phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
// add it to the list
try {
genotypes.add(new Genotype(sampleName, GTalleles, GTQual, genotypeFilters, gtAttributes, phased));
} catch (TribbleException e) {
throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
}
}
return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset);
return fFields;
}
@Override

View File

@ -56,8 +56,9 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
public String getDescription() { return description; }
public VCFHeaderLineType getType() { return type; }
public VCFHeaderLineCount getCountType() { return countType; }
public boolean isFixedCount() { return countType == VCFHeaderLineCount.INTEGER; }
public int getCount() {
if ( countType != VCFHeaderLineCount.INTEGER )
if ( ! isFixedCount() )
throw new ReviewedStingException("Asking for header line count when type is not an integer");
return count;
}

View File

@ -48,6 +48,7 @@ public final class VCFConstants {
public static final String GENOTYPE_LIKELIHOODS_KEY = "GL"; // log10 scaled genotype likelihoods
public static final String GENOTYPE_POSTERIORS_KEY = "GP";
public static final String GENOTYPE_QUALITY_KEY = "GQ";
public static final String GENOTYPE_ALLELE_DEPTHS = "AD";
public static final String HAPMAP2_KEY = "H2";
public static final String HAPMAP3_KEY = "H3";
public static final String HAPLOTYPE_QUALITY_KEY = "HQ";
@ -113,7 +114,5 @@ public final class VCFConstants {
public static final String EMPTY_GENOTYPE = "./.";
public static final int MAX_GENOTYPE_QUAL = 99;
public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f";
public static final String DOUBLE_PRECISION_INT_SUFFIX = ".00";
public static final Double VCF_ENCODING_EPSILON = 0.00005; // when we consider fields equal(), used in the Qual compare
}

Some files were not shown because too many files have changed in this diff Show More