Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Ryan Poplin 2011-12-13 15:59:52 -05:00
commit 7a386b45a5
42 changed files with 2126 additions and 1103 deletions

View File

@ -711,6 +711,8 @@ public class SAMDataSource {
* @param validationStringency validation stringency.
*/
public SAMReaders(Collection<SAMReaderID> readerIDs, SAMFileReader.ValidationStringency validationStringency) {
int totalNumberOfFiles = readerIDs.size();
int readerNumber = 1;
for(SAMReaderID readerID: readerIDs) {
File indexFile = findIndexFile(readerID.samFile);
@ -728,8 +730,7 @@ public class SAMDataSource {
reader.enableFileSource(true);
reader.setValidationStringency(validationStringency);
final SAMFileHeader header = reader.getFileHeader();
logger.debug(String.format("Sort order is: " + header.getSortOrder()));
logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, readerID.samFile));
readers.put(readerID,reader);
}

View File

@ -142,20 +142,75 @@ public class SampleDB {
* @return
*/
public final Map<String, Set<Sample>> getFamilies() {
return getFamilies(null);
}
/**
* Returns a map from family ID -> set of family members for all samples in sampleIds with
* non-null family ids
*
* @param sampleIds - all samples to include. If null is passed then all samples are returned.
* @return
*/
public final Map<String, Set<Sample>> getFamilies(Collection<String> sampleIds) {
final Map<String, Set<Sample>> families = new TreeMap<String, Set<Sample>>();
for ( final Sample sample : samples.values() ) {
final String famID = sample.getFamilyID();
if ( famID != null ) {
if ( ! families.containsKey(famID) )
families.put(famID, new TreeSet<Sample>());
families.get(famID).add(sample);
if(sampleIds == null || sampleIds.contains(sample.getID())){
final String famID = sample.getFamilyID();
if ( famID != null ) {
if ( ! families.containsKey(famID) )
families.put(famID, new TreeSet<Sample>());
families.get(famID).add(sample);
}
}
}
return families;
}
/**
* Returns the set of all children that have both of their parents.
* Note that if a family is composed of more than 1 child, each child is
* returned.
* @return - all the children that have both of their parents
*/
public final Set<Sample> getChildrenWithParents(){
return getChildrenWithParents(false);
}
/**
* Returns the set of all children that have both of their parents.
* Note that if triosOnly = false, a family is composed of more than 1 child, each child is
* returned.
*
* This method can be used wherever trios are needed
*
* @param triosOnly - if set to true, only strict trios are returned
* @return - all the children that have both of their parents
*/
public final Set<Sample> getChildrenWithParents(boolean triosOnly) {
Map<String, Set<Sample>> families = getFamilies();
final Set<Sample> childrenWithParents = new HashSet<Sample>();
Iterator<Sample> sampleIterator;
for ( Set<Sample> familyMembers: families.values() ) {
if(triosOnly && familyMembers.size() != 3)
continue;
sampleIterator = familyMembers.iterator();
Sample sample;
while(sampleIterator.hasNext()){
sample = sampleIterator.next();
if(sample.getParents().size() == 2 && familyMembers.containsAll(sample.getParents()))
childrenWithParents.add(sample);
}
}
return childrenWithParents;
}
/**
* Return all samples with a given family ID
* @param familyId

View File

@ -88,7 +88,7 @@ public abstract class Walker<MapType, ReduceType> {
return getToolkit().getMasterSequenceDictionary();
}
protected SampleDB getSampleDB() {
public SampleDB getSampleDB() {
return getToolkit().getSampleDB();
}

View File

@ -3,22 +3,18 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MendelianViolation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFilterHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
/**
* Created by IntelliJ IDEA.
@ -30,23 +26,26 @@ import java.util.Map;
public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation {
private MendelianViolation mendelianViolation = null;
private String motherId;
private String fatherId;
private String childId;
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if ( mendelianViolation == null ) {
if ( walker instanceof VariantAnnotator && ((VariantAnnotator) walker).familyStr != null) {
mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).familyStr, ((VariantAnnotator)walker).minGenotypeQualityP );
if (checkAndSetSamples(((VariantAnnotator) walker).getSampleDB())) {
mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP );
}
else {
throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid Family String file (-family) on the command line.");
throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line containing only 1 trio.");
}
}
Map<String,Object> toRet = new HashMap<String,Object>(1);
boolean hasAppropriateGenotypes = vc.hasGenotype(mendelianViolation.getSampleChild()) && vc.getGenotype(mendelianViolation.getSampleChild()).hasLikelihoods() &&
vc.hasGenotype(mendelianViolation.getSampleDad()) && vc.getGenotype(mendelianViolation.getSampleDad()).hasLikelihoods() &&
vc.hasGenotype(mendelianViolation.getSampleMom()) && vc.getGenotype(mendelianViolation.getSampleMom()).hasLikelihoods();
boolean hasAppropriateGenotypes = vc.hasGenotype(motherId) && vc.getGenotype(motherId).hasLikelihoods() &&
vc.hasGenotype(fatherId) && vc.getGenotype(fatherId).hasLikelihoods() &&
vc.hasGenotype(childId) && vc.getGenotype(childId).hasLikelihoods();
if ( hasAppropriateGenotypes )
toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc));
toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc,motherId,fatherId,childId));
return toRet;
}
@ -55,4 +54,27 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment
public List<String> getKeyNames() { return Arrays.asList("MVLR"); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MVLR", 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); }
private boolean checkAndSetSamples(SampleDB db){
Set<String> families = db.getFamilyIDs();
if(families.size() != 1)
return false;
Set<Sample> family = db.getFamily(families.iterator().next());
if(family.size() != 3)
return false;
Iterator<Sample> sampleIter = family.iterator();
Sample sample;
for(sample = sampleIter.next();sampleIter.hasNext();sample=sampleIter.next()){
if(sample.getParents().size()==2){
motherId = sample.getMaternalID();
fatherId = sample.getPaternalID();
childId = sample.getID();
return true;
}
}
return false;
}
}

View File

@ -38,7 +38,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
for ( final Genotype genotype : genotypes ) {
// we care only about variant calls with likelihoods
if ( genotype.isHomRef() )
if ( !genotype.isHet() && !genotype.isHomVar() )
continue;
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());

View File

@ -12,10 +12,8 @@ import org.broadinstitute.sting.utils.MendelianViolation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.FileNotFoundException;
import java.util.*;
/**
@ -26,42 +24,33 @@ import java.util.*;
public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation {
private Set<MendelianViolation> fullMVSet = null;
private Set<Sample> trios = null;
private final static int REF = 0;
private final static int HET = 1;
private final static int HOM = 2;
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if ( fullMVSet == null ) {
fullMVSet = new HashSet<MendelianViolation>();
if ( trios == null ) {
if ( walker instanceof VariantAnnotator ) {
final Map<String,Set<Sample>> families = ((VariantAnnotator) walker).getSampleDB().getFamilies();
for( final Set<Sample> family : families.values() ) {
for( final Sample sample : family ) {
if( sample.getParents().size() == 2 && family.containsAll(sample.getParents()) ) { // only works with trios for now
fullMVSet.add( new MendelianViolation(sample, 0.0) );
}
}
}
trios = ((VariantAnnotator) walker).getSampleDB().getChildrenWithParents();
} else {
throw new UserException("Transmission disequilibrium test annotation can only be used from the Variant Annotator and requires a valid ped file be passed in.");
}
}
final Map<String,Object> toRet = new HashMap<String,Object>(1);
final HashSet<MendelianViolation> mvsToTest = new HashSet<MendelianViolation>();
final HashSet<Sample> triosToTest = new HashSet<Sample>();
for( final MendelianViolation mv : fullMVSet ) {
final boolean hasAppropriateGenotypes = vc.hasGenotype(mv.getSampleChild()) && vc.getGenotype(mv.getSampleChild()).hasLikelihoods() &&
vc.hasGenotype(mv.getSampleDad()) && vc.getGenotype(mv.getSampleDad()).hasLikelihoods() &&
vc.hasGenotype(mv.getSampleMom()) && vc.getGenotype(mv.getSampleMom()).hasLikelihoods();
for( final Sample child : trios) {
final boolean hasAppropriateGenotypes = vc.hasGenotype(child.getID()) && vc.getGenotype(child.getID()).hasLikelihoods() &&
vc.hasGenotype(child.getPaternalID()) && vc.getGenotype(child.getPaternalID()).hasLikelihoods() &&
vc.hasGenotype(child.getMaternalID()) && vc.getGenotype(child.getMaternalID()).hasLikelihoods();
if ( hasAppropriateGenotypes ) {
mvsToTest.add(mv);
triosToTest.add(child);
}
}
toRet.put("TDT", calculateTDT( vc, mvsToTest ));
toRet.put("TDT", calculateTDT( vc, triosToTest ));
return toRet;
}
@ -72,27 +61,27 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("TDT", 1, VCFHeaderLineType.Float, "Test statistic from Wittkowski transmission disequilibrium test.")); }
// Following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT
private double calculateTDT( final VariantContext vc, final Set<MendelianViolation> mvsToTest ) {
private double calculateTDT( final VariantContext vc, final Set<Sample> triosToTest ) {
final double nABGivenABandBB = calculateNChildren(vc, mvsToTest, HET, HET, HOM);
final double nBBGivenABandBB = calculateNChildren(vc, mvsToTest, HOM, HET, HOM);
final double nAAGivenABandAB = calculateNChildren(vc, mvsToTest, REF, HET, HET);
final double nBBGivenABandAB = calculateNChildren(vc, mvsToTest, HOM, HET, HET);
final double nAAGivenAAandAB = calculateNChildren(vc, mvsToTest, REF, REF, HET);
final double nABGivenAAandAB = calculateNChildren(vc, mvsToTest, HET, REF, HET);
final double nABGivenABandBB = calculateNChildren(vc, triosToTest, HET, HET, HOM);
final double nBBGivenABandBB = calculateNChildren(vc, triosToTest, HOM, HET, HOM);
final double nAAGivenABandAB = calculateNChildren(vc, triosToTest, REF, HET, HET);
final double nBBGivenABandAB = calculateNChildren(vc, triosToTest, HOM, HET, HET);
final double nAAGivenAAandAB = calculateNChildren(vc, triosToTest, REF, REF, HET);
final double nABGivenAAandAB = calculateNChildren(vc, triosToTest, HET, REF, HET);
final double numer = (nABGivenABandBB - nBBGivenABandBB) + 2.0 * (nAAGivenABandAB - nBBGivenABandAB) + (nAAGivenAAandAB - nABGivenAAandAB);
final double denom = (nABGivenABandBB + nBBGivenABandBB) + 4.0 * (nAAGivenABandAB + nBBGivenABandAB) + (nAAGivenAAandAB + nABGivenAAandAB);
return (numer * numer) / denom;
}
private double calculateNChildren( final VariantContext vc, final Set<MendelianViolation> mvsToTest, final int childIdx, final int momIdx, final int dadIdx ) {
final double likelihoodVector[] = new double[mvsToTest.size() * 2];
private double calculateNChildren( final VariantContext vc, final Set<Sample> triosToTest, final int childIdx, final int momIdx, final int dadIdx ) {
final double likelihoodVector[] = new double[triosToTest.size() * 2];
int iii = 0;
for( final MendelianViolation mv : mvsToTest ) {
final double[] momGL = vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsVector();
final double[] dadGL = vc.getGenotype(mv.getSampleDad()).getLikelihoods().getAsVector();
final double[] childGL = vc.getGenotype(mv.getSampleChild()).getLikelihoods().getAsVector();
for( final Sample child : triosToTest ) {
final double[] momGL = vc.getGenotype(child.getMaternalID()).getLikelihoods().getAsVector();
final double[] dadGL = vc.getGenotype(child.getPaternalID()).getLikelihoods().getAsVector();
final double[] childGL = vc.getGenotype(child.getID()).getLikelihoods().getAsVector();
likelihoodVector[iii++] = momGL[momIdx] + dadGL[dadIdx] + childGL[childIdx];
likelihoodVector[iii++] = momGL[dadIdx] + dadGL[momIdx] + childGL[childIdx];
}

View File

@ -167,9 +167,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
@Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
protected boolean indelsOnly = false;
@Argument(fullName="family_string",shortName="family",required=false,doc="A family string of the form mom+dad=child for use with the mendelian violation ratio annotation")
public String familyStr = null;
@Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="The genotype quality treshold in order to annotate mendelian violation ratio")
public double minGenotypeQualityP = 0.0;

View File

@ -52,7 +52,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
protected enum GenotypeType { AA, AB, BB }
protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE;
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
protected AlleleFrequencyCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
this.N = N;
@ -62,24 +62,14 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
/**
* Must be overridden by concrete subclasses
* @param GLs genotype likelihoods
* @param Alleles Alleles corresponding to GLs
* @param log10AlleleFrequencyPriors priors
* @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results
* @param GLs genotype likelihoods
* @param Alleles Alleles corresponding to GLs
* @param log10AlleleFrequencyPriors priors
* @param log10AlleleFrequencyLikelihoods array (pre-allocated) to store likelihoods results
* @param log10AlleleFrequencyPosteriors array (pre-allocated) to store posteriors results
*/
protected abstract void getLog10PNonRef(GenotypesContext GLs, List<Allele> Alleles,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors);
/**
* Can be overridden by concrete subclasses
* @param vc variant context with genotype likelihoods
* @param log10AlleleFrequencyPosteriors allele frequency results
* @param AFofMaxLikelihood allele frequency of max likelihood
*
* @return calls
*/
protected abstract GenotypesContext assignGenotypes(VariantContext vc,
double[] log10AlleleFrequencyPosteriors,
int AFofMaxLikelihood);
double[][] log10AlleleFrequencyPriors,
double[][] log10AlleleFrequencyLikelihoods,
double[][] log10AlleleFrequencyPosteriors);
}

View File

@ -1,94 +0,0 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.variantcontext.Allele;
public class BiallelicGenotypeLikelihoods {
private String sample;
private double[] GLs;
private Allele A, B;
private int depth;
/**
* Create a new object for sample with given alleles and genotype likelihoods
*
* @param sample sample name
* @param A allele A
* @param B allele B
* @param log10AALikelihoods AA likelihoods
* @param log10ABLikelihoods AB likelihoods
* @param log10BBLikelihoods BB likelihoods
* @param depth the read depth used in creating the likelihoods
*/
public BiallelicGenotypeLikelihoods(String sample,
Allele A,
Allele B,
double log10AALikelihoods,
double log10ABLikelihoods,
double log10BBLikelihoods,
int depth) {
this.sample = sample;
this.A = A;
this.B = B;
this.GLs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods};
this.depth = depth;
}
public String getSample() {
return sample;
}
public double getAALikelihoods() {
return GLs[0];
}
public double getABLikelihoods() {
return GLs[1];
}
public double getBBLikelihoods() {
return GLs[2];
}
public double[] getLikelihoods() {
return GLs;
}
public Allele getAlleleA() {
return A;
}
public Allele getAlleleB() {
return B;
}
public int getDepth() {
return depth;
}
}

View File

@ -27,13 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.BaseUtils;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Aug 4, 2009
* Time: 6:46:09 PM
* To change this template use File | Settings | File Templates.
*/
public enum DiploidGenotype {
AA ('A', 'A'),
AC ('A', 'C'),
@ -110,6 +103,20 @@ public enum DiploidGenotype {
return conversionMatrix[index1][index2];
}
/**
* create a diploid genotype, given 2 base indexes which may not necessarily be ordered correctly
* @param baseIndex1 base1
* @param baseIndex2 base2
* @return the diploid genotype
*/
public static DiploidGenotype createDiploidGenotype(int baseIndex1, int baseIndex2) {
if ( baseIndex1 == -1 )
throw new IllegalArgumentException(baseIndex1 + " does not represent a valid base character");
if ( baseIndex2 == -1 )
throw new IllegalArgumentException(baseIndex2 + " does not represent a valid base character");
return conversionMatrix[baseIndex1][baseIndex2];
}
private static final DiploidGenotype[][] conversionMatrix = {
{ DiploidGenotype.AA, DiploidGenotype.AC, DiploidGenotype.AG, DiploidGenotype.AT },
{ DiploidGenotype.AC, DiploidGenotype.CC, DiploidGenotype.CG, DiploidGenotype.CT },

View File

@ -27,80 +27,43 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
import java.util.*;
public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
//
// code for testing purposes
//
private final static boolean DEBUG = false;
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
private final boolean SIMPLE_GREEDY_GENOTYPER = false;
private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
private final List<Allele> NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
super(UAC, N, logger, verboseWriter);
}
public void getLog10PNonRef(GenotypesContext GLs, List<Allele> alleles,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors) {
public void getLog10PNonRef(final GenotypesContext GLs,
final List<Allele> alleles,
final double[][] log10AlleleFrequencyPriors,
final double[][] log10AlleleFrequencyLikelihoods,
final double[][] log10AlleleFrequencyPosteriors) {
final int numAlleles = alleles.size();
final double[][] posteriorCache = numAlleles > 2 ? new double[numAlleles-1][] : null;
final double[] bestAFguess = numAlleles > 2 ? new double[numAlleles-1] : null;
int idxDiag = numAlleles;
int incr = numAlleles - 1;
for (int k=1; k < numAlleles; k++) {
// multi-allelic approximation, part 1: Ideally
// for each alt allele compute marginal (suboptimal) posteriors -
// compute indices for AA,AB,BB for current allele - genotype likelihoods are a linear vector that can be thought of
// as a row-wise upper triangular matrix of likelihoods.
// So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC.
// 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD
final int idxAA = 0;
final int idxAB = k;
// yy is always element on the diagonal.
// 2 alleles: BBelement 2
// 3 alleles: BB element 3. CC element 5
// 4 alleles:
final int idxBB = idxDiag;
idxDiag += incr--;
final int lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
if (numAlleles > 2) {
posteriorCache[k-1] = log10AlleleFrequencyPosteriors.clone();
bestAFguess[k-1] = (double)MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors);
}
}
if (numAlleles > 2) {
// multiallelic approximation, part 2:
// report posteriors for allele that has highest estimated AC
int mostLikelyAlleleIdx = MathUtils.maxElementIndex(bestAFguess);
for (int k=0; k < log10AlleleFrequencyPosteriors.length-1; k++)
log10AlleleFrequencyPosteriors[k] = (posteriorCache[mostLikelyAlleleIdx][k]);
}
//linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors, false);
}
private static final ArrayList<double[]> getGLs(GenotypesContext GLs) {
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>();
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>(); // TODO -- initialize with size of GLs
genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
for ( Genotype sample : GLs.iterateInSampleNameOrder() ) {
if ( sample.hasLikelihoods() ) {
double[] gls = sample.getLikelihoods().getAsVector();
if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL)
if ( MathUtils.sum(gls) < UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL )
genotypeLikelihoods.add(gls);
}
}
@ -109,9 +72,401 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
final static double approximateLog10SumLog10(double[] vals) {
if ( vals.length < 2 )
throw new ReviewedStingException("Passing array with fewer than 2 values when computing approximateLog10SumLog10");
double approx = approximateLog10SumLog10(vals[0], vals[1]);
for ( int i = 2; i < vals.length; i++ )
approx = approximateLog10SumLog10(approx, vals[i]);
return approx;
}
final static double approximateLog10SumLog10(double small, double big) {
// make sure small is really the smaller value
if ( small > big ) {
final double t = big;
big = small;
small = t;
}
if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY )
return big;
if (big >= small + MathUtils.MAX_JACOBIAN_TOLERANCE)
return big;
// OK, so |y-x| < tol: we use the following identity then:
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup
// with integer quantization
// we have pre-stored correction for 0,0.1,0.2,... 10.0
//final int ind = (int)(((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding
int ind = (int)(Math.round((big-small)/MathUtils.JACOBIAN_LOG_TABLE_STEP)); // hard rounding
//double z =Math.log10(1+Math.pow(10.0,-diff));
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
return big + MathUtils.jacobianLogTable[ind];
}
// -------------------------------------------------------------------------------------
//
// Linearized, ~O(N), implementation.
// Multi-allelic implementation.
//
// -------------------------------------------------------------------------------------
private static final int HOM_REF_INDEX = 0; // AA likelihoods are always first
// a wrapper around the int array so that we can make it hashable
private static final class ExactACcounts {
private final int[] counts;
private int hashcode = -1;
public ExactACcounts(final int[] counts) {
this.counts = counts;
}
public int[] getCounts() {
return counts;
}
@Override
public boolean equals(Object obj) {
return (obj instanceof ExactACcounts) ? Arrays.equals(counts, ((ExactACcounts)obj).counts) : false;
}
@Override
public int hashCode() {
if ( hashcode == -1 )
hashcode = Arrays.hashCode(counts);
return hashcode;
}
@Override
public String toString() {
StringBuffer sb = new StringBuffer();
sb.append(counts[0]);
for ( int i = 1; i < counts.length; i++ ) {
sb.append("/");
sb.append(counts[i]);
}
return sb.toString();
}
}
// This class represents a column in the Exact AC calculation matrix
private static final class ExactACset {
// the counts of the various alternate alleles which this column represents
final ExactACcounts ACcounts;
// the column of the matrix
final double[] log10Likelihoods;
// mapping of column index for those columns upon which this one depends to the index into the PLs which is used as the transition to this column;
// for example, in the biallelic case, the transition from k=0 to k=1 would be AB while the transition to k=2 would be BB.
final HashMap<ExactACcounts, Integer> ACsetIndexToPLIndex = new HashMap<ExactACcounts, Integer>();
// to minimize memory consumption, we know we can delete any sets in this list because no further sets will depend on them
final ArrayList<ExactACcounts> dependentACsetsToDelete = new ArrayList<ExactACcounts>();
public ExactACset(final int size, final ExactACcounts ACcounts) {
this.ACcounts = ACcounts;
log10Likelihoods = new double[size];
}
// sum of all the non-reference alleles
public int getACsum() {
int sum = 0;
for ( int count : ACcounts.getCounts() )
sum += count;
return sum;
}
public boolean equals(Object obj) {
return (obj instanceof ExactACset) ? ACcounts.equals(((ExactACset)obj).ACcounts) : false;
}
}
public static void linearExactMultiAllelic(final GenotypesContext GLs,
final int numAlternateAlleles,
final double[][] log10AlleleFrequencyPriors,
final double[][] log10AlleleFrequencyLikelihoods,
final double[][] log10AlleleFrequencyPosteriors,
final boolean preserveData) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
final int numSamples = genotypeLikelihoods.size()-1;
final int numChr = 2*numSamples;
// queue of AC conformations to process
final Queue<ExactACset> ACqueue = new LinkedList<ExactACset>();
// mapping of ExactACset indexes to the objects
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(numChr+1);
// add AC=0 to the queue
int[] zeroCounts = new int[numAlternateAlleles];
ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts));
ACqueue.add(zeroSet);
indexesToACset.put(zeroSet.ACcounts, zeroSet);
// keep processing while we have AC conformations that need to be calculated
double maxLog10L = Double.NEGATIVE_INFINITY;
while ( !ACqueue.isEmpty() ) {
// compute log10Likelihoods
final ExactACset set = ACqueue.remove();
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
// adjust max likelihood seen if needed
maxLog10L = Math.max(maxLog10L, log10LofKs);
}
}
private static double calculateAlleleCountConformation(final ExactACset set,
final ArrayList<double[]> genotypeLikelihoods,
final double maxLog10L,
final int numChr,
final boolean preserveData,
final Queue<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset,
final double[][] log10AlleleFrequencyPriors,
final double[][] log10AlleleFrequencyLikelihoods,
final double[][] log10AlleleFrequencyPosteriors) {
if ( DEBUG )
System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
// compute the log10Likelihoods
computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
// clean up memory
if ( !preserveData ) {
for ( ExactACcounts index : set.dependentACsetsToDelete ) {
indexesToACset.put(index, null);
if ( DEBUG )
System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts);
}
}
final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
// can we abort early because the log10Likelihoods are so small?
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
if ( DEBUG )
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
// no reason to keep this data around because nothing depends on it
if ( !preserveData )
indexesToACset.put(set.ACcounts, null);
return log10LofK;
}
// iterate over higher frequencies if possible
final int ACwiggle = numChr - set.getACsum();
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
return log10LofK;
ExactACset lastSet = null; // keep track of the last set placed in the queue so that we can tell it to clean us up when done processing
final int numAltAlleles = set.ACcounts.getCounts().length;
// genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods.
// so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD.
// add conformations for the k+1 case
int PLindex = 0;
for ( int allele = 0; allele < numAltAlleles; allele++ ) {
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
ACcountsClone[allele]++;
lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset);
}
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
if ( ACwiggle > 1 ) {
for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
ACcountsClone[allele_i]++;
ACcountsClone[allele_j]++;
lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex , ACqueue, indexesToACset);
}
}
}
// if the last dependent set was not at the back of the queue (i.e. not just added), then we need to iterate
// over all the dependent sets to find the last one in the queue (otherwise it will be cleaned up too early)
if ( !preserveData && lastSet == null ) {
if ( DEBUG )
System.out.printf(" *** iterating over dependent sets for set=%s%n", set.ACcounts);
lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue);
}
if ( lastSet != null )
lastSet.dependentACsetsToDelete.add(set.ACcounts);
return log10LofK;
}
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
// also adds it as a dependency to the given callingSetIndex.
// returns the ExactACset if that set was not already in the queue and null otherwise.
private static ExactACset updateACset(final int[] ACcounts,
final int numChr,
final ExactACset callingSet,
final int PLsetIndex,
final Queue<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
final ExactACcounts index = new ExactACcounts(ACcounts);
boolean wasInQueue = true;
if ( !indexesToACset.containsKey(index) ) {
ExactACset set = new ExactACset(numChr/2 +1, index);
indexesToACset.put(index, set);
ACqueue.add(set);
wasInQueue = false;
}
// add the given dependency to the set
final ExactACset set = indexesToACset.get(index);
set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex);
return wasInQueue ? null : set;
}
private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final Queue<ExactACset> ACqueue) {
ExactACset set = null;
for ( ExactACset queued : ACqueue ) {
if ( queued.dependentACsetsToDelete.contains(callingSetIndex) )
set = queued;
}
return set;
}
private static void computeLofK(final ExactACset set,
final ArrayList<double[]> genotypeLikelihoods,
final HashMap<ExactACcounts, ExactACset> indexesToACset,
final double[][] log10AlleleFrequencyPriors,
final double[][] log10AlleleFrequencyLikelihoods,
final double[][] log10AlleleFrequencyPosteriors) {
set.log10Likelihoods[0] = 0.0; // the zero case
final int totalK = set.getACsum();
// special case for k = 0 over all k
if ( totalK == 0 ) {
for ( int j = 1; j < set.log10Likelihoods.length; j++ )
set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX];
}
// k > 0 for at least one k
else {
// all possible likelihoods for a given cell from which to choose the max
final int numPaths = set.ACsetIndexToPLIndex.size() + 1;
final double[] log10ConformationLikelihoods = new double[numPaths]; // TODO can be created just once, since you initialize it
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
final double[] gl = genotypeLikelihoods.get(j);
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
// initialize
for ( int i = 0; i < numPaths; i++ )
// TODO -- Arrays.fill?
// todo -- is this even necessary? Why not have as else below?
log10ConformationLikelihoods[i] = Double.NEGATIVE_INFINITY;
// deal with the AA case first
if ( totalK < 2*j-1 )
log10ConformationLikelihoods[0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
// deal with the other possible conformations now
if ( totalK <= 2*j ) { // skip impossible conformations
int conformationIndex = 1;
for ( Map.Entry<ExactACcounts, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() ) {
if ( DEBUG )
System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey());
log10ConformationLikelihoods[conformationIndex++] =
determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + indexesToACset.get(mapping.getKey()).log10Likelihoods[j-1] + gl[mapping.getValue()];
}
}
final double log10Max = approximateLog10SumLog10(log10ConformationLikelihoods);
// finally, update the L(j,k) value
set.log10Likelihoods[j] = log10Max - logDenominator;
}
}
final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
// determine the power of theta to use
int nonRefAlleles = 0;
for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) {
if ( set.ACcounts.getCounts()[i] > 0 )
nonRefAlleles++;
}
// update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs
for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) {
int AC = set.ACcounts.getCounts()[i];
log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(log10AlleleFrequencyLikelihoods[i][AC], log10LofK);
// for k=0 we still want to use theta
final double prior = (nonRefAlleles == 0) ? log10AlleleFrequencyPriors[0][0] : log10AlleleFrequencyPriors[nonRefAlleles-1][AC];
log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior);
}
}
private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
// todo -- arent' there a small number of fixed values that this function can adopt?
// todo -- at a minimum it'd be good to partially compute some of these in ACCounts for performance
// todo -- need to cache PLIndex -> two alleles, compute looping over each PLIndex. Note all other operations are efficient
// todo -- this can be computed once at the start of the all operations
// the closed form representation generalized for multiple alleles is as follows:
// AA: (2j - totalK) * (2j - totalK - 1)
// AB: 2k_b * (2j - totalK)
// AC: 2k_c * (2j - totalK)
// BB: k_b * (k_b - 1)
// BC: 2 * k_b * k_c
// CC: k_c * (k_c - 1)
final int numAltAlleles = ACcounts.length;
// the AX het case
if ( PLindex <= numAltAlleles )
return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK];
int subtractor = numAltAlleles+1;
int subtractions = 0;
do {
PLindex -= subtractor;
subtractor--;
subtractions++;
}
while ( PLindex >= subtractor );
final int k_i = ACcounts[subtractions-1];
// the hom var case (e.g. BB, CC, DD)
final double coeff;
if ( PLindex == 0 ) {
coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
}
// the het non-ref case (e.g. BC, BD, CD)
else {
final int k_j = ACcounts[subtractions+PLindex-1];
coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
}
return coeff;
}
// -------------------------------------------------------------------------------------
//
// Deprecated bi-allelic ~O(N) implementation. Kept here for posterity.
//
// -------------------------------------------------------------------------------------
@ -119,6 +474,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
* A simple data structure that holds the current, prev, and prev->prev likelihoods vectors
* for the exact model calculation
*/
/*
private final static class ExactACCache {
double[] kMinus2, kMinus1, kMinus0;
@ -154,7 +510,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
public int linearExact(GenotypesContext GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
double[][] log10AlleleFrequencyLikelihoods,
double[][] log10AlleleFrequencyPosteriors) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
final int numSamples = genotypeLikelihoods.size()-1;
final int numChr = 2*numSamples;
@ -171,7 +528,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( k == 0 ) { // special case for k = 0
for ( int j=1; j <= numSamples; j++ ) {
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[idxAA];
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0];
}
} else { // k > 0
final double[] kMinus1 = logY.getkMinus1();
@ -184,14 +541,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
double aa = Double.NEGATIVE_INFINITY;
double ab = Double.NEGATIVE_INFINITY;
if (k < 2*j-1)
aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[idxAA];
aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0];
if (k < 2*j)
ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[idxAB];
ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1];
double log10Max;
if (k > 1) {
final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[idxBB];
final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2];
log10Max = approximateLog10SumLog10(aa, ab, bb);
} else {
// we know we aren't considering the BB case, so we can use an optimized log10 function
@ -205,7 +562,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// update the posteriors vector
final double log10LofK = kMinus0[numSamples];
log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k];
log10AlleleFrequencyLikelihoods[0][k] = log10LofK;
log10AlleleFrequencyPosteriors[0][k] = log10LofK + log10AlleleFrequencyPriors[k];
// can we abort early?
lastK = k;
@ -222,202 +580,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
final static double approximateLog10SumLog10(double a, double b, double c) {
//return softMax(new double[]{a, b, c});
return approximateLog10SumLog10(approximateLog10SumLog10(a, b), c);
}
*/
final static double approximateLog10SumLog10(double small, double big) {
// make sure small is really the smaller value
if ( small > big ) {
final double t = big;
big = small;
small = t;
}
if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY )
return big;
if (big >= small + MathUtils.MAX_JACOBIAN_TOLERANCE)
return big;
// OK, so |y-x| < tol: we use the following identity then:
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup
// with integer quantization
// we have pre-stored correction for 0,0.1,0.2,... 10.0
//final int ind = (int)(((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding
int ind = (int)(Math.round((big-small)/MathUtils.JACOBIAN_LOG_TABLE_STEP)); // hard rounding
//double z =Math.log10(1+Math.pow(10.0,-diff));
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
return big + MathUtils.jacobianLogTable[ind];
}
/**
* Can be overridden by concrete subclasses
* @param vc variant context with genotype likelihoods
* @param log10AlleleFrequencyPosteriors allele frequency results
* @param AFofMaxLikelihood allele frequency of max likelihood
*
* @return calls
*/
public GenotypesContext assignGenotypes(VariantContext vc,
double[] log10AlleleFrequencyPosteriors,
int AFofMaxLikelihood) {
if ( !vc.isVariant() )
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
GenotypesContext GLs = vc.getGenotypes();
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
int[][] tracebackArray = new int[GLs.size()+1][AFofMaxLikelihood+1];
ArrayList<String> sampleIndices = new ArrayList<String>();
int sampleIdx = 0;
// todo - optimize initialization
for (int k=0; k <= AFofMaxLikelihood; k++)
for (int j=0; j <= GLs.size(); j++)
pathMetricArray[j][k] = -1e30;
pathMetricArray[0][0] = 0.0;
// todo = can't deal with optimal dynamic programming solution with multiallelic records
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
sampleIndices.addAll(GLs.getSampleNamesOrderedByName());
sampleIdx = GLs.size();
}
else {
for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) {
if ( !genotype.hasLikelihoods() )
continue;
double[] likelihoods = genotype.getLikelihoods().getAsVector();
if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) {
//System.out.print(sample.getKey()+":");
//for (int k=0; k < likelihoods.length; k++)
// System.out.format("%4.2f ",likelihoods[k]);
//System.out.println();
// all likelihoods are essentially the same: skip this sample and will later on force no call.
//sampleIdx++;
continue;
}
sampleIndices.add(genotype.getSampleName());
for (int k=0; k <= AFofMaxLikelihood; k++) {
double bestMetric = pathMetricArray[sampleIdx][k] + likelihoods[0];
int bestIndex = k;
if (k>0) {
double m2 = pathMetricArray[sampleIdx][k-1] + likelihoods[1];
if (m2 > bestMetric) {
bestMetric = m2;
bestIndex = k-1;
}
}
if (k>1) {
double m2 = pathMetricArray[sampleIdx][k-2] + likelihoods[2];
if (m2 > bestMetric) {
bestMetric = m2;
bestIndex = k-2;
}
}
pathMetricArray[sampleIdx+1][k] = bestMetric;
tracebackArray[sampleIdx+1][k] = bestIndex;
}
sampleIdx++;
}
}
GenotypesContext calls = GenotypesContext.create();
int startIdx = AFofMaxLikelihood;
for (int k = sampleIdx; k > 0; k--) {
int bestGTguess;
String sample = sampleIndices.get(k-1);
Genotype g = GLs.get(sample);
if ( !g.hasLikelihoods() )
continue;
// if all likelihoods are essentially the same: we want to force no-call. In this case, we skip this sample for now,
// and will add no-call genotype to GL's in a second pass
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
double[] likelihoods = g.getLikelihoods().getAsVector();
if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) {
bestGTguess = Utils.findIndexOfMaxEntry(likelihoods);
}
else {
int newIdx = tracebackArray[k][startIdx];;
bestGTguess = startIdx - newIdx;
startIdx = newIdx;
}
// likelihoods are stored row-wise in lower triangular matrix. IE
// for 2 alleles they have ordering AA,AB,BB
// for 3 alleles they are ordered AA,AB,BB,AC,BC,CC
// Get now alleles corresponding to best index
int kk=0;
boolean done = false;
for (int j=0; j < vc.getNAlleles(); j++) {
for (int i=0; i <= j; i++){
if (kk++ == bestGTguess) {
if (i==0)
myAlleles.add(vc.getReference());
else
myAlleles.add(vc.getAlternateAllele(i-1));
if (j==0)
myAlleles.add(vc.getReference());
else
myAlleles.add(vc.getAlternateAllele(j-1));
done = true;
break;
}
}
if (done)
break;
}
final double qual = GenotypeLikelihoods.getQualFromLikelihoods(bestGTguess, likelihoods);
//System.out.println(myAlleles.toString());
calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false));
}
for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) {
if ( !genotype.hasLikelihoods() )
continue;
final Genotype g = GLs.get(genotype.getSampleName());
final double[] likelihoods = genotype.getLikelihoods().getAsVector();
if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL)
continue; // regular likelihoods
final double qual = Genotype.NO_LOG10_PERROR;
calls.replace(new Genotype(g.getSampleName(), NO_CALL_ALLELES, qual, null, g.getAttributes(), false));
}
return calls;
}
private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) {
int j = logYMatrix.length - 1;
System.out.printf("-----------------------------------%n");
for (int k=0; k <= numChr; k++) {
double posterior = logYMatrix[j][k] + log10AlleleFrequencyPriors[k];
System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior);
}
}
}

View File

@ -35,6 +35,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Map;
@ -79,19 +80,17 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
* @param contexts stratified alignment contexts
* @param contextType stratified context type
* @param priors priors to use for GLs
* @param GLs hash of sample->GL to fill in
* @param alternateAlleleToUse the alternate allele to use, null if not set
* @param useBAQedPileup should we use the BAQed pileup or the raw one?
* @return genotype likelihoods per sample for AA, AB, BB
* @return variant context where genotypes are no-called but with GLs
*/
public abstract Allele getLikelihoods(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType,
GenotypePriors priors,
Map<String, MultiallelicGenotypeLikelihoods> GLs,
Allele alternateAlleleToUse,
boolean useBAQedPileup);
public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType,
GenotypePriors priors,
Allele alternateAlleleToUse,
boolean useBAQedPileup);
protected int getFilteredDepth(ReadBackedPileup pileup) {
int count = 0;

View File

@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement;
@ -41,8 +42,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
@ -243,7 +243,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
// get deletion length
int dLen = Integer.valueOf(bestAltAllele.substring(1));
// get ref bases of accurate deletion
int startIdxInReference = (int)(1+loc.getStart()-ref.getWindow().getStart());
int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart();
//System.out.println(new String(ref.getBases()));
byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen);
@ -270,19 +270,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
private final static EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED);
public Allele getLikelihoods(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType,
GenotypePriors priors,
Map<String, MultiallelicGenotypeLikelihoods> GLs,
Allele alternateAlleleToUse,
boolean useBAQedPileup) {
public VariantContext getLikelihoods(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType,
GenotypePriors priors,
Allele alternateAlleleToUse,
boolean useBAQedPileup) {
if ( tracker == null )
return null;
GenomeLoc loc = ref.getLocus();
Allele refAllele, altAllele;
VariantContext vc = null;
@ -368,10 +366,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(),
ref, hsize, numPrefBases);
// start making the VariantContext
final int endLoc = calculateEndPos(alleleList, refAllele, loc);
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleleList).referenceBaseForIndel(ref.getBase());
// create the genotypes; no-call everyone for now
GenotypesContext genotypes = GenotypesContext.create();
final List<Allele> noCall = new ArrayList<Allele>();
noCall.add(Allele.NO_CALL);
// For each sample, get genotype likelihoods based on pileup
// compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them.
// initialize the GenotypeLikelihoods
GLs.clear();
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
@ -384,11 +389,12 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
if (pileup != null ) {
final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods);
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
alleleList,
genotypeLikelihoods,
getFilteredDepth(pileup)));
HashMap<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup));
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
if (DEBUG) {
System.out.format("Sample:%s Alleles:%s GL:",sample.getKey(), alleleList.toString());
@ -399,9 +405,25 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
}
}
return refAllele;
return builder.genotypes(genotypes).make();
}
private int calculateEndPos(Collection<Allele> alleles, Allele refAllele, GenomeLoc loc) {
// for indels, stop location is one more than ref allele length
boolean hasNullAltAllele = false;
for ( Allele a : alleles ) {
if ( a.isNull() ) {
hasNullAltAllele = true;
break;
}
}
int endLoc = loc.getStart() + refAllele.length();
if( !hasNullAltAllele )
endLoc--;
return endLoc;
}
public static HashMap<PileupElement,LinkedHashMap<Allele,Double>> getIndelLikelihoodMap() {
return indelLikelihoodMap.get();

View File

@ -1,52 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.ArrayList;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: delangel
* Date: 6/1/11
* Time: 10:38 AM
* To change this template use File | Settings | File Templates.
*/
public class MultiallelicGenotypeLikelihoods {
private String sample;
private double[] GLs;
private List<Allele> alleleList;
private int depth;
public MultiallelicGenotypeLikelihoods(String sample,
List<Allele> A,
double[] log10Likelihoods, int depth) {
/* Check for consistency between likelihood vector and number of alleles */
int numAlleles = A.size();
if (log10Likelihoods.length != numAlleles*(numAlleles+1)/2)
throw new StingException(("BUG: Incorrect length of GL vector when creating MultiallelicGenotypeLikelihoods object!"));
this.sample = sample;
this.alleleList = A;
this.GLs = log10Likelihoods;
this.depth = depth;
}
public String getSample() {
return sample;
}
public double[] getLikelihoods() {
return GLs;
}
public List<Allele> getAlleles() {
return alleleList;
}
public int getDepth() {
return depth;
}
}

View File

@ -31,107 +31,147 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.*;
public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
// the alternate allele with the largest sum of quality scores
protected Byte bestAlternateAllele = null;
private static final int MIN_QUAL_SUM_FOR_ALT_ALLELE = 50;
private boolean ALLOW_MULTIPLE_ALLELES;
private final boolean useAlleleFromVCF;
protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC, logger);
ALLOW_MULTIPLE_ALLELES = UAC.MULTI_ALLELIC;
useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
}
public Allele getLikelihoods(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType,
GenotypePriors priors,
Map<String, MultiallelicGenotypeLikelihoods> GLs,
Allele alternateAlleleToUse,
boolean useBAQedPileup) {
public VariantContext getLikelihoods(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType,
GenotypePriors priors,
Allele alternateAlleleToUse,
boolean useBAQedPileup) {
if ( !(priors instanceof DiploidSNPGenotypePriors) )
throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model");
byte refBase = ref.getBase();
Allele refAllele = Allele.create(refBase, true);
final boolean[] basesToUse = new boolean[4];
final byte refBase = ref.getBase();
final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase);
// find the alternate allele with the largest sum of quality scores
// start making the VariantContext
final GenomeLoc loc = ref.getLocus();
final List<Allele> alleles = new ArrayList<Allele>();
alleles.add(Allele.create(refBase, true));
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles);
// find the alternate allele(s) that we should be using
if ( alternateAlleleToUse != null ) {
bestAlternateAllele = alternateAlleleToUse.getBases()[0];
basesToUse[BaseUtils.simpleBaseToBaseIndex(alternateAlleleToUse.getBases()[0])] = true;
} else if ( useAlleleFromVCF ) {
VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);
final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);
// ignore places where we don't have a variant
if ( vc == null )
// ignore places where we don't have a SNP
if ( vc == null || !vc.isSNP() )
return null;
if ( !vc.isBiallelic() ) {
// for multi-allelic sites go back to the reads and find the most likely alternate allele
initializeBestAlternateAllele(refBase, contexts, useBAQedPileup);
} else {
bestAlternateAllele = vc.getAlternateAllele(0).getBases()[0];
}
for ( Allele allele : vc.getAlternateAlleles() )
basesToUse[BaseUtils.simpleBaseToBaseIndex(allele.getBases()[0])] = true;
} else {
initializeBestAlternateAllele(refBase, contexts, useBAQedPileup);
determineAlternateAlleles(basesToUse, refBase, contexts, useBAQedPileup);
// how many alternate alleles are we using?
int alleleCounter = countSetBits(basesToUse);
// if there are no non-ref alleles...
if ( alleleCounter == 0 ) {
// if we only want variants, then we don't need to calculate genotype likelihoods
if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY )
return builder.make();
// otherwise, choose any alternate allele (it doesn't really matter)
basesToUse[indexOfRefBase == 0 ? 1 : 0] = true;
}
}
// if there are no non-ref bases...
if ( bestAlternateAllele == null ) {
// if we only want variants, then we don't need to calculate genotype likelihoods
if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY )
return refAllele;
// otherwise, choose any alternate allele (it doesn't really matter)
bestAlternateAllele = (byte)(refBase != 'A' ? 'A' : 'C');
// create the alternate alleles and the allele ordering (the ordering is crucial for the GLs)
final int numAltAlleles = countSetBits(basesToUse);
final int[] alleleOrdering = new int[numAltAlleles + 1];
alleleOrdering[0] = indexOfRefBase;
int alleleOrderingIndex = 1;
int numLikelihoods = 1;
for ( int i = 0; i < 4; i++ ) {
if ( i != indexOfRefBase && basesToUse[i] ) {
alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(i), false));
alleleOrdering[alleleOrderingIndex++] = i;
numLikelihoods += alleleOrderingIndex;
}
}
builder.alleles(alleles);
Allele altAllele = Allele.create(bestAlternateAllele, false);
// create the genotypes; no-call everyone for now
GenotypesContext genotypes = GenotypesContext.create();
final List<Allele> noCall = new ArrayList<Allele>();
noCall.add(Allele.NO_CALL);
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
if( useBAQedPileup ) { pileup = createBAQedPileup( pileup ); }
if ( useBAQedPileup )
pileup = createBAQedPileup( pileup );
// create the GenotypeLikelihoods object
DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error);
int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE);
final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error);
final int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE);
if ( nGoodBases == 0 )
continue;
double[] likelihoods = GL.getLikelihoods();
final double[] allLikelihoods = GL.getLikelihoods();
final double[] myLikelihoods = new double[numLikelihoods];
DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(refBase);
DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(refBase, bestAlternateAllele);
DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(bestAlternateAllele);
ArrayList<Allele> aList = new ArrayList<Allele>();
aList.add(refAllele);
aList.add(altAllele);
double[] dlike = new double[]{likelihoods[refGenotype.ordinal()],likelihoods[hetGenotype.ordinal()],likelihoods[homGenotype.ordinal()]} ;
int myLikelihoodsIndex = 0;
for ( int i = 0; i <= numAltAlleles; i++ ) {
for ( int j = i; j <= numAltAlleles; j++ ) {
myLikelihoods[myLikelihoodsIndex++] = allLikelihoods[DiploidGenotype.createDiploidGenotype(alleleOrdering[i], alleleOrdering[j]).ordinal()];
}
}
// normalize in log space so that max element is zero.
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
aList, MathUtils.normalizeFromLog10(dlike, false, true), getFilteredDepth(pileup)));
GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true));
HashMap<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup));
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
}
return refAllele;
return builder.genotypes(genotypes).make();
}
protected void initializeBestAlternateAllele(byte ref, Map<String, AlignmentContext> contexts, boolean useBAQedPileup) {
private int countSetBits(boolean[] array) {
int counter = 0;
for ( int i = 0; i < array.length; i++ ) {
if ( array[i] )
counter++;
}
return counter;
}
// fills in the allelesToUse array
protected void determineAlternateAlleles(boolean[] allelesToUse, byte ref, Map<String, AlignmentContext> contexts, boolean useBAQedPileup) {
int[] qualCounts = new int[4];
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
@ -139,7 +179,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
ReadBackedPileup pileup = useBAQedPileup ? createBAQedPileup( sample.getValue().getBasePileup() ) : sample.getValue().getBasePileup();
for ( PileupElement p : pileup ) {
// ignore deletions
if ( p.isDeletion() || (! p.isReducedRead() && p.getQual() < UAC.MIN_BASE_QUALTY_SCORE ))
if ( p.isDeletion() || (!p.isReducedRead() && p.getQual() < UAC.MIN_BASE_QUALTY_SCORE) )
continue;
final int index = BaseUtils.simpleBaseToBaseIndex(p.getBase());
@ -149,17 +189,31 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
}
}
// set the non-ref base with maximum quality score sum
int maxCount = 0;
bestAlternateAllele = null;
for ( byte altAllele : BaseUtils.BASES ) {
if ( altAllele == ref )
continue;
int index = BaseUtils.simpleBaseToBaseIndex(altAllele);
if ( qualCounts[index] > maxCount ) {
maxCount = qualCounts[index];
bestAlternateAllele = altAllele;
if ( ALLOW_MULTIPLE_ALLELES ) {
for ( byte altAllele : BaseUtils.BASES ) {
if ( altAllele == ref )
continue;
int index = BaseUtils.simpleBaseToBaseIndex(altAllele);
if ( qualCounts[index] >= MIN_QUAL_SUM_FOR_ALT_ALLELE ) {
allelesToUse[index] = true;
}
}
} else {
// set the non-ref base which has the maximum quality score sum
int maxCount = 0;
int indexOfMax = 0;
for ( byte altAllele : BaseUtils.BASES ) {
if ( altAllele == ref )
continue;
int index = BaseUtils.simpleBaseToBaseIndex(altAllele);
if ( qualCounts[index] > maxCount ) {
maxCount = qualCounts[index];
indexOfMax = index;
}
}
if ( maxCount > 0 )
allelesToUse[indexOfMax] = true;
}
}

View File

@ -153,6 +153,18 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false)
public boolean IGNORE_SNP_ALLELES = false;
@Hidden
@Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow multiple alleles in discovery", required = false)
public boolean MULTI_ALLELIC = false;
/**
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
* then this site will be skipped and a warning printed. Note that genotyping sites with many alternate alleles is both CPU and memory intensive.
*/
@Hidden
@Argument(fullName = "max_alternate_alleles", shortName = "maxAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
public int MAX_ALTERNATE_ALLELES = 5;
// Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
public UnifiedArgumentCollection clone() {
@ -176,10 +188,12 @@ public class UnifiedArgumentCollection {
uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO;
uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE;
uac.alleles = alleles;
uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES;
// todo- arguments to remove
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
uac.BANDED_INDEL_COMPUTATION = BANDED_INDEL_COMPUTATION;
uac.MULTI_ALLELIC = MULTI_ALLELIC;
return uac;
}

View File

@ -59,6 +59,9 @@ public class UnifiedGenotyperEngine {
EMIT_ALL_SITES
}
protected static final List<Allele> NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
protected static final double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
// the unified argument collection
private final UnifiedArgumentCollection UAC;
public UnifiedArgumentCollection getUAC() { return UAC; }
@ -73,11 +76,12 @@ public class UnifiedGenotyperEngine {
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
private final double[] log10AlleleFrequencyPriorsSNPs;
private final double[] log10AlleleFrequencyPriorsIndels;
private final double[][] log10AlleleFrequencyPriorsSNPs;
private final double[][] log10AlleleFrequencyPriorsIndels;
// the allele frequency likelihoods (allocated once as an optimization)
private ThreadLocal<double[]> log10AlleleFrequencyPosteriors = new ThreadLocal<double[]>();
private ThreadLocal<double[][]> log10AlleleFrequencyLikelihoods = new ThreadLocal<double[][]>();
private ThreadLocal<double[][]> log10AlleleFrequencyPosteriors = new ThreadLocal<double[][]>();
// the priors object
private final GenotypePriors genotypePriorsSNPs;
@ -96,6 +100,7 @@ public class UnifiedGenotyperEngine {
// the standard filter to use for calls below the confidence threshold but above the emit threshold
private static final Set<String> filter = new HashSet<String>(1);
private final GenomeLocParser genomeLocParser;
private final boolean BAQEnabledOnCMDLine;
@ -113,6 +118,7 @@ public class UnifiedGenotyperEngine {
@Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0"})
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set<String> samples) {
this.BAQEnabledOnCMDLine = toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF;
genomeLocParser = toolkit.getGenomeLocParser();
this.samples = new TreeSet<String>(samples);
// note that, because we cap the base quality by the mapping quality, minMQ cannot be less than minBQ
this.UAC = UAC.clone();
@ -122,10 +128,10 @@ public class UnifiedGenotyperEngine {
this.annotationEngine = engine;
N = 2 * this.samples.size();
log10AlleleFrequencyPriorsSNPs = new double[N+1];
log10AlleleFrequencyPriorsIndels = new double[N+1];
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, GenotypeLikelihoodsCalculationModel.Model.SNP);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, GenotypeLikelihoodsCalculationModel.Model.INDEL);
log10AlleleFrequencyPriorsSNPs = new double[UAC.MAX_ALTERNATE_ALLELES][N+1];
log10AlleleFrequencyPriorsIndels = new double[UAC.MAX_ALTERNATE_ALLELES][N+1];
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY);
genotypePriorsSNPs = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.SNP);
genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL);
@ -152,7 +158,6 @@ public class UnifiedGenotyperEngine {
}
VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model);
if ( vc == null )
return null;
@ -214,14 +219,7 @@ public class UnifiedGenotyperEngine {
glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
}
Map<String, MultiallelicGenotypeLikelihoods> GLs = new HashMap<String, MultiallelicGenotypeLikelihoods>();
Allele refAllele = glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), GLs, alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine);
if ( refAllele != null )
return createVariantContextFromLikelihoods(refContext, refAllele, GLs);
else
return null;
return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine);
}
private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, AlignmentContext rawContext) {
@ -256,136 +254,170 @@ public class UnifiedGenotyperEngine {
return new VariantCallContext(vc, false);
}
private VariantContext createVariantContextFromLikelihoods(ReferenceContext refContext, Allele refAllele, Map<String, MultiallelicGenotypeLikelihoods> GLs) {
// no-call everyone for now
List<Allele> noCall = new ArrayList<Allele>();
noCall.add(Allele.NO_CALL);
Set<Allele> alleles = new LinkedHashSet<Allele>();
alleles.add(refAllele);
boolean addedAltAlleles = false;
GenotypesContext genotypes = GenotypesContext.create();
for ( MultiallelicGenotypeLikelihoods GL : GLs.values() ) {
if ( !addedAltAlleles ) {
addedAltAlleles = true;
// ordering important to maintain consistency
for (Allele a: GL.getAlleles()) {
alleles.add(a);
}
}
HashMap<String, Object> attributes = new HashMap<String, Object>();
//GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(GL.getLikelihoods());
attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth());
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
genotypes.add(new Genotype(GL.getSample(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
}
GenomeLoc loc = refContext.getLocus();
int endLoc = calculateEndPos(alleles, refAllele, loc);
return new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleles).genotypes(genotypes).referenceBaseForIndel(refContext.getBase()).make();
public VariantCallContext calculateGenotypes(VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) {
return calculateGenotypes(null, null, null, null, vc, model);
}
// private method called by both UnifiedGenotyper and UGCallVariants entry points into the engine
private VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) {
public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) {
boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null;
// initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) {
log10AlleleFrequencyPosteriors.set(new double[N+1]);
log10AlleleFrequencyLikelihoods.set(new double[UAC.MAX_ALTERNATE_ALLELES][N+1]);
log10AlleleFrequencyPosteriors.set(new double[UAC.MAX_ALTERNATE_ALLELES][N+1]);
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
}
// don't try to genotype too many alternate alleles
if ( vc.getAlternateAlleles().size() > UAC.MAX_ALTERNATE_ALLELES ) {
logger.warn("the Unified Genotyper is currently set to genotype at most " + UAC.MAX_ALTERNATE_ALLELES + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + vc.getAlternateAlleles().size() + " alternate alleles; see the --max_alternate_alleles argument");
return null;
}
// estimate our confidence in a reference call and return
if ( vc.getNSamples() == 0 )
if ( vc.getNSamples() == 0 ) {
if ( limitedContext )
return null;
return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ?
estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), false, 1.0) :
generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
}
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
clearAFarray(log10AlleleFrequencyLikelihoods.get());
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
// find the most likely frequency
int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get());
// is the most likely frequency conformation AC=0 for all alternate alleles?
boolean bestGuessIsRef = true;
// which alternate allele has the highest MLE AC?
int indexOfHighestAlt = -1;
int alleleCountOfHighestAlt = -1;
// determine which alternate alleles have AF>0
boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()];
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
int indexOfBestAC = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[i]);
// if the most likely AC is not 0, then this is a good alternate allele to use
if ( indexOfBestAC != 0 ) {
altAllelesToUse[i] = true;
bestGuessIsRef = false;
}
// if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele
else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
altAllelesToUse[i] = true;
}
// keep track of the "best" alternate allele to use
if ( indexOfBestAC > alleleCountOfHighestAlt) {
alleleCountOfHighestAlt = indexOfBestAC;
indexOfHighestAlt = i;
}
}
// calculate p(f>0)
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get());
// TODO -- right now we just calculate it for the alt allele with highest AF, but the likelihoods need to be combined correctly over all AFs
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()[indexOfHighestAlt]);
double sum = 0.0;
for (int i = 1; i <= N; i++)
sum += normalizedPosteriors[i];
double PofF = Math.min(sum, 1.0); // deal with precision errors
double phredScaledConfidence;
if ( bestAFguess != 0 || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
if ( Double.isInfinite(phredScaledConfidence) )
phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0];
phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0][0];
} else {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
if ( Double.isInfinite(phredScaledConfidence) ) {
sum = 0.0;
for (int i = 1; i <= N; i++) {
if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
if ( log10AlleleFrequencyPosteriors.get()[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
break;
sum += log10AlleleFrequencyPosteriors.get()[i];
sum += log10AlleleFrequencyPosteriors.get()[0][i];
}
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
}
}
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestAFguess) ) {
if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) {
// technically, at this point our confidence in a reference call isn't accurately estimated
// because it didn't take into account samples with no data, so let's get a better estimate
return estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF);
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF);
}
// strip out any alleles that aren't going to be used in the VariantContext
List<Allele> myAlleles;
if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) {
myAlleles = new ArrayList<Allele>(vc.getAlleles().size());
myAlleles.add(vc.getReference());
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
if ( altAllelesToUse[i] )
myAlleles.add(vc.getAlternateAllele(i));
}
} else {
// use all of the alleles if we are given them by the user
myAlleles = vc.getAlleles();
}
// start constructing the resulting VC
GenomeLoc loc = genomeLocParser.createGenomeLoc(vc);
VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles);
builder.log10PError(phredScaledConfidence/-10.0);
if ( ! passesCallThreshold(phredScaledConfidence) )
builder.filters(filter);
if ( !limitedContext )
builder.referenceBaseForIndel(refContext.getBase());
// create the genotypes
GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess);
GenotypesContext genotypes = assignGenotypes(vc, altAllelesToUse);
// print out stats if we have a writer
if ( verboseWriter != null )
if ( verboseWriter != null && !limitedContext )
printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors, model);
// *** note that calculating strand bias involves overwriting data structures, so we do that last
HashMap<String, Object> attributes = new HashMap<String, Object>();
// if the site was downsampled, record that fact
if ( rawContext.hasPileupBeenDownsampled() )
if ( !limitedContext && rawContext.hasPileupBeenDownsampled() )
attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);
if ( UAC.COMPUTE_SLOD && bestAFguess != 0 ) {
if ( UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef ) {
//final boolean DEBUG_SLOD = false;
// the overall lod
VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model);
clearAFarray(log10AlleleFrequencyLikelihoods.get());
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
//double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1);
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
// the forward lod
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model);
clearAFarray(log10AlleleFrequencyLikelihoods.get());
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0][0];
double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1);
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model);
clearAFarray(log10AlleleFrequencyLikelihoods.get());
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0][0];
double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1);
//if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
@ -401,26 +433,12 @@ public class UnifiedGenotyperEngine {
attributes.put("SB", strandScore);
}
GenomeLoc loc = refContext.getLocus();
int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc);
Set<Allele> myAlleles = new HashSet<Allele>(vc.getAlleles());
// strip out the alternate allele if it's a ref call
if ( bestAFguess == 0 && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) {
myAlleles = new HashSet<Allele>(1);
myAlleles.add(vc.getReference());
}
VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, myAlleles);
// finish constructing the resulting VC
builder.genotypes(genotypes);
builder.log10PError(phredScaledConfidence/-10.0);
if ( ! passesCallThreshold(phredScaledConfidence) ) builder.filters(filter);
builder.attributes(attributes);
builder.referenceBaseForIndel(refContext.getBase());
VariantContext vcCall = builder.make();
if ( annotationEngine != null ) {
if ( annotationEngine != null && !limitedContext ) {
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
ReadBackedPileup pileup = null;
if (rawContext.hasExtendedEventPileup())
@ -435,119 +453,6 @@ public class UnifiedGenotyperEngine {
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
}
// A barebones entry point to the exact model when there is no tracker or stratified contexts available -- only GLs
public VariantCallContext calculateGenotypes(final VariantContext vc, final GenomeLoc loc, final GenotypeLikelihoodsCalculationModel.Model model) {
// initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) {
log10AlleleFrequencyPosteriors.set(new double[N+1]);
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
}
// estimate our confidence in a reference call and return
if ( vc.getNSamples() == 0 )
return null;
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
// find the most likely frequency
int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get());
// calculate p(f>0)
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get());
double sum = 0.0;
for (int i = 1; i <= N; i++)
sum += normalizedPosteriors[i];
double PofF = Math.min(sum, 1.0); // deal with precision errors
double phredScaledConfidence;
if ( bestAFguess != 0 || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
if ( Double.isInfinite(phredScaledConfidence) )
phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0];
} else {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
if ( Double.isInfinite(phredScaledConfidence) ) {
sum = 0.0;
for (int i = 1; i <= N; i++) {
if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
break;
sum += log10AlleleFrequencyPosteriors.get()[i];
}
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
}
}
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestAFguess) ) {
// technically, at this point our confidence in a reference call isn't accurately estimated
// because it didn't take into account samples with no data, so let's get a better estimate
return null;
}
// create the genotypes
GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess);
// *** note that calculating strand bias involves overwriting data structures, so we do that last
HashMap<String, Object> attributes = new HashMap<String, Object>();
int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc);
Set<Allele> myAlleles = new HashSet<Allele>(vc.getAlleles());
// strip out the alternate allele if it's a ref call
if ( bestAFguess == 0 && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) {
myAlleles = new HashSet<Allele>(1);
myAlleles.add(vc.getReference());
}
VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, myAlleles);
builder.genotypes(genotypes);
builder.log10PError(phredScaledConfidence/-10.0);
if ( ! passesCallThreshold(phredScaledConfidence) ) builder.filters(filter);
builder.attributes(attributes);
builder.referenceBaseForIndel(vc.getReferenceBaseForIndel());
return new VariantCallContext(builder.make(), confidentlyCalled(phredScaledConfidence, PofF));
}
private int calculateEndPos(Collection<Allele> alleles, Allele refAllele, GenomeLoc loc) {
// TODO - temp fix until we can deal with extended events properly
// for indels, stop location is one more than ref allele length
boolean isSNP = true, hasNullAltAllele = false;
for (Allele a : alleles){
if (a.length() != 1) {
isSNP = false;
break;
}
}
for (Allele a : alleles){
if (a.isNull()) {
hasNullAltAllele = true;
break;
}
}
// standard deletion: ref allele length = del length. endLoc = startLoc + refAllele.length(), alt allele = null
// standard insertion: ref allele length = 0, endLos = startLoc
// mixed: want end loc = start Loc for case {A*,AT,T} but say {ATG*,A,T} : want then end loc = start loc + refAllele.length
// So, in general, end loc = startLoc + refAllele.length, except in complex substitutions where it's one less
//
// todo - this is unnecessarily complicated and is so just because of Tribble's arbitrary vc conventions, should be cleaner/simpler,
// the whole vc processing infrastructure seems too brittle and riddled with special case handling
int endLoc = loc.getStart();
if ( !isSNP) {
endLoc += refAllele.length();
if(!hasNullAltAllele)
endLoc--;
}
return endLoc;
}
private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) {
Map<String, AlignmentContext> stratifiedContexts = null;
@ -604,9 +509,12 @@ public class UnifiedGenotyperEngine {
return stratifiedContexts;
}
protected static void clearAFarray(double[] AFs) {
for ( int i = 0; i < AFs.length; i++ )
AFs[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
protected static void clearAFarray(double[][] AFs) {
for ( int i = 0; i < AFs.length; i++ ) {
for ( int j = 0; j < AFs[i].length; j++ ) {
AFs[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
}
}
}
private final static double[] binomialProbabilityDepthCache = new double[10000];
@ -676,7 +584,7 @@ public class UnifiedGenotyperEngine {
AFline.append(i + "/" + N + "\t");
AFline.append(String.format("%.2f\t", ((float)i)/N));
AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i]));
if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED)
if ( log10AlleleFrequencyPosteriors.get()[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED)
AFline.append("0.00000000\t");
else
AFline.append(String.format("%.8f\t", log10AlleleFrequencyPosteriors.get()[i]));
@ -689,8 +597,8 @@ public class UnifiedGenotyperEngine {
verboseWriter.println();
}
protected boolean passesEmitThreshold(double conf, int bestAFguess) {
return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_CONFIDENT_SITES || bestAFguess != 0) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING);
protected boolean passesEmitThreshold(double conf, boolean bestGuessIsRef) {
return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_CONFIDENT_SITES || !bestGuessIsRef) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING);
}
protected boolean passesCallThreshold(double conf) {
@ -744,27 +652,25 @@ public class UnifiedGenotyperEngine {
return null;
}
protected void computeAlleleFrequencyPriors(int N, final double[] priors, final GenotypeLikelihoodsCalculationModel.Model model) {
// calculate the allele frequency priors for 1-N
double sum = 0.0;
double heterozygosity;
protected static void computeAlleleFrequencyPriors(final int N, final double[][] priors, final double theta) {
if (model == GenotypeLikelihoodsCalculationModel.Model.INDEL)
heterozygosity = UAC.INDEL_HETEROZYGOSITY;
else
heterozygosity = UAC.heterozygosity;
for (int i = 1; i <= N; i++) {
double value = heterozygosity / (double)i;
priors[i] = Math.log10(value);
sum += value;
// the dimension here is the number of alternate alleles; with e.g. 2 alternate alleles the prior will be theta^2 / i
for (int alleles = 1; alleles <= priors.length; alleles++) {
double sum = 0.0;
// for each i
for (int i = 1; i <= N; i++) {
double value = Math.pow(theta, alleles) / (double)i;
priors[alleles-1][i] = Math.log10(value);
sum += value;
}
// null frequency for AF=0 is (1 - sum(all other frequencies))
priors[alleles-1][0] = Math.log10(1.0 - sum);
}
// null frequency for AF=0 is (1 - sum(all other frequencies))
priors[0] = Math.log10(1.0 - sum);
}
protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) {
protected double[][] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) {
switch( model ) {
case SNP:
return log10AlleleFrequencyPriorsSNPs;
@ -837,4 +743,88 @@ public class UnifiedGenotyperEngine {
return vc;
}
/**
* @param vc variant context with genotype likelihoods
* @param allelesToUse bit vector describing which alternate alleles from the vc are okay to use
*
* @return genotypes
*/
public GenotypesContext assignGenotypes(VariantContext vc,
boolean[] allelesToUse) {
final GenotypesContext GLs = vc.getGenotypes();
final List<String> sampleIndices = GLs.getSampleNamesOrderedByName();
final GenotypesContext calls = GenotypesContext.create();
for ( int k = GLs.size() - 1; k >= 0; k-- ) {
final String sample = sampleIndices.get(k);
final Genotype g = GLs.get(sample);
if ( !g.hasLikelihoods() )
continue;
final double[] likelihoods = g.getLikelihoods().getAsVector();
// if there is no mass on the likelihoods, then just no-call the sample
if ( MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL ) {
calls.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false));
continue;
}
// genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods.
// so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD.
final int numAltAlleles = allelesToUse.length;
// start with the assumption that the ideal genotype is homozygous reference
Allele maxAllele1 = vc.getReference(), maxAllele2 = vc.getReference();
double maxLikelihoodSeen = likelihoods[0];
int indexOfMax = 0;
// keep track of some state
Allele firstAllele = vc.getReference();
int subtractor = numAltAlleles + 1;
int subtractionsMade = 0;
for ( int i = 1, PLindex = 1; i < likelihoods.length; i++, PLindex++ ) {
if ( PLindex == subtractor ) {
firstAllele = vc.getAlternateAllele(subtractionsMade);
PLindex -= subtractor;
subtractor--;
subtractionsMade++;
// we can skip this allele if it's not usable
if ( !allelesToUse[subtractionsMade-1] ) {
i += subtractor - 1;
PLindex += subtractor - 1;
continue;
}
}
// we don't care about the entry if we've already seen better
if ( likelihoods[i] <= maxLikelihoodSeen )
continue;
// if it's usable then update the alleles
int alleleIndex = subtractionsMade + PLindex - 1;
if ( allelesToUse[alleleIndex] ) {
maxAllele1 = firstAllele;
maxAllele2 = vc.getAlternateAllele(alleleIndex);
maxLikelihoodSeen = likelihoods[i];
indexOfMax = i;
}
}
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
myAlleles.add(maxAllele1);
myAlleles.add(maxAllele2);
final double qual = GenotypeLikelihoods.getQualFromLikelihoods(indexOfMax, likelihoods);
calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false));
}
return calls;
}
}

View File

@ -320,17 +320,17 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
if(transmissionProb != NO_TRANSMISSION_PROB)
phredScoreTransmission = MathUtils.probabilityToPhredScale(1-(transmissionProb));
//Handle null, missing and unavailable genotypes
//Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable
//genotype so it is safe to return the original genotype in this case.
//In addition, if the phasing confidence is 0, then return the unphased, original genotypes.
if(phredScoreTransmission ==0 || genotype == null || !isPhasable(genotype.getType()))
return genotype;
//Handle null, missing and unavailable genotypes
//Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable
//genotype so it is safe to return the original genotype in this case.
//In addition, if the phasing confidence is 0, then return the unphased, original genotypes.
if(phredScoreTransmission ==0 || genotype == null || !isPhasable(genotype.getType()))
return genotype;
//Add the transmission probability
Map<String, Object> genotypeAttributes = new HashMap<String, Object>();
genotypeAttributes.putAll(genotype.getAttributes());
if(transmissionProb>NO_TRANSMISSION_PROB)
//Add the transmission probability
Map<String, Object> genotypeAttributes = new HashMap<String, Object>();
genotypeAttributes.putAll(genotype.getAttributes());
if(transmissionProb>NO_TRANSMISSION_PROB)
genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission);
ArrayList<Allele> phasedAlleles = new ArrayList<Allele>(2);

View File

@ -164,13 +164,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(fullName="minPhaseQuality", shortName="mpq", doc="Minimum phasing quality", required=false)
protected double MIN_PHASE_QUALITY = 10.0;
/**
* This argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined.
*/
@Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations", required=false)
protected String FAMILY_STRUCTURE;
@Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
@Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation. Default is 50.", required=false)
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50;
@Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false)
@ -561,8 +555,6 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
public double getMinPhaseQuality() { return MIN_PHASE_QUALITY; }
public String getFamilyStructure() { return FAMILY_STRUCTURE; }
public double getMendelianViolationQualThreshold() { return MENDELIAN_VIOLATION_QUAL_THRESHOLD; }
public TreeSet<VariantStratifier> getStratificationObjects() { return stratificationObjects; }

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators;
import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -7,9 +9,11 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
import org.broadinstitute.sting.utils.MendelianViolation;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Map;
import java.util.Set;
/**
* Mendelian violation detection and counting
@ -40,12 +44,25 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@Analysis(name = "Mendelian Violation Evaluator", description = "Mendelian Violation Evaluator")
public class MendelianViolationEvaluator extends VariantEvaluator {
@DataPoint(description = "Number of mendelian variants found")
@DataPoint(description = "Number of variants found with at least one family having genotypes")
long nVariants;
@DataPoint(description = "Number of variants found with no family having genotypes -- these sites do not count in the nNoCall")
long nSkipped;
@DataPoint(description="Number of variants x families called (no missing genotype or lowqual)")
long nFamCalled;
@DataPoint(description="Number of variants x families called (no missing genotype or lowqual) that contain at least one var allele.")
long nVarFamCalled;
@DataPoint(description="Number of variants x families discarded as low quality")
long nLowQual;
@DataPoint(description="Number of variants x families discarded as no call")
long nNoCall;
@DataPoint(description="Number of loci with mendelian violations")
long nLociViolations;
@DataPoint(description = "Number of mendelian violations found")
long nViolations;
@DataPoint(description = "number of child hom ref calls where the parent was hom variant")
/*@DataPoint(description = "number of child hom ref calls where the parent was hom variant")
long KidHomRef_ParentHomVar;
@DataPoint(description = "number of child het calls where the parent was hom ref")
long KidHet_ParentsHomRef;
@ -53,11 +70,65 @@ public class MendelianViolationEvaluator extends VariantEvaluator {
long KidHet_ParentsHomVar;
@DataPoint(description = "number of child hom variant calls where the parent was hom ref")
long KidHomVar_ParentHomRef;
*/
@DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_REF -> HOM_VAR")
long mvRefRef_Var;
@DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_REF -> HET")
long mvRefRef_Het;
@DataPoint(description="Number of mendelian violations of the type HOM_REF/HET -> HOM_VAR")
long mvRefHet_Var;
@DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_VAR -> HOM_VAR")
long mvRefVar_Var;
@DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_VAR -> HOM_REF")
long mvRefVar_Ref;
@DataPoint(description="Number of mendelian violations of the type HOM_VAR/HET -> HOM_REF")
long mvVarHet_Ref;
@DataPoint(description="Number of mendelian violations of the type HOM_VAR/HOM_VAR -> HOM_REF")
long mvVarVar_Ref;
@DataPoint(description="Number of mendelian violations of the type HOM_VAR/HOM_VAR -> HET")
long mvVarVar_Het;
/*@DataPoint(description ="Number of inherited var alleles from het parents")
long nInheritedVar;
@DataPoint(description ="Number of inherited ref alleles from het parents")
long nInheritedRef;*/
@DataPoint(description="Number of HomRef/HomRef/HomRef trios")
long HomRefHomRef_HomRef;
@DataPoint(description="Number of Het/Het/Het trios")
long HetHet_Het;
@DataPoint(description="Number of Het/Het/HomRef trios")
long HetHet_HomRef;
@DataPoint(description="Number of Het/Het/HomVar trios")
long HetHet_HomVar;
@DataPoint(description="Number of HomVar/HomVar/HomVar trios")
long HomVarHomVar_HomVar;
@DataPoint(description="Number of HomRef/HomVar/Het trios")
long HomRefHomVAR_Het;
@DataPoint(description="Number of ref alleles inherited from het/het parents")
long HetHet_inheritedRef;
@DataPoint(description="Number of var alleles inherited from het/het parents")
long HetHet_inheritedVar;
@DataPoint(description="Number of ref alleles inherited from homRef/het parents")
long HomRefHet_inheritedRef;
@DataPoint(description="Number of var alleles inherited from homRef/het parents")
long HomRefHet_inheritedVar;
@DataPoint(description="Number of ref alleles inherited from homVar/het parents")
long HomVarHet_inheritedRef;
@DataPoint(description="Number of var alleles inherited from homVar/het parents")
long HomVarHet_inheritedVar;
MendelianViolation mv;
PrintStream mvFile;
Map<String,Set<Sample>> families;
public void initialize(VariantEvalWalker walker) {
mv = new MendelianViolation(walker.getFamilyStructure(), walker.getMendelianViolationQualThreshold());
//Changed by Laurent Francioli - 2011-06-07
//mv = new MendelianViolation(walker.getFamilyStructure(), walker.getMendelianViolationQualThreshold());
mv = new MendelianViolation(walker.getMendelianViolationQualThreshold(),false);
families = walker.getSampleDB().getFamilies();
}
public boolean enabled() {
@ -75,110 +146,48 @@ public class MendelianViolationEvaluator extends VariantEvaluator {
public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if (vc.isBiallelic() && vc.hasGenotypes()) { // todo -- currently limited to biallelic loci
if (mv.setAlleles(vc)) {
if(mv.countViolations(families,vc)>0){
nLociViolations++;
nViolations += mv.getViolationsCount();
mvRefRef_Var += mv.getParentsRefRefChildVar();
mvRefRef_Het += mv.getParentsRefRefChildHet();
mvRefHet_Var += mv.getParentsRefHetChildVar();
mvRefVar_Var += mv.getParentsRefVarChildVar();
mvRefVar_Ref += mv.getParentsRefVarChildRef();
mvVarHet_Ref += mv.getParentsVarHetChildRef();
mvVarVar_Ref += mv.getParentsVarVarChildRef();
mvVarVar_Het += mv.getParentsVarVarChildHet();
}
HomRefHomRef_HomRef += mv.getRefRefRef();
HetHet_Het += mv.getHetHetHet();
HetHet_HomRef += mv.getHetHetHomRef();
HetHet_HomVar += mv.getHetHetHomVar();
HomVarHomVar_HomVar += mv.getVarVarVar();
HomRefHomVAR_Het += mv.getRefVarHet();
HetHet_inheritedRef += mv.getParentsHetHetInheritedRef();
HetHet_inheritedVar += mv.getParentsHetHetInheritedVar();
HomRefHet_inheritedRef += mv.getParentsRefHetInheritedRef();
HomRefHet_inheritedVar += mv.getParentsRefHetInheritedVar();
HomVarHet_inheritedRef += mv.getParentsVarHetInheritedRef();
HomVarHet_inheritedVar += mv.getParentsVarHetInheritedVar();
if(mv.getFamilyCalledCount()>0){
nVariants++;
Genotype momG = vc.getGenotype(mv.getSampleMom());
Genotype dadG = vc.getGenotype(mv.getSampleDad());
Genotype childG = vc.getGenotype(mv.getSampleChild());
if (mv.isViolation()) {
nViolations++;
String label;
if (childG.isHomRef() && (momG.isHomVar() || dadG.isHomVar())) {
label = "KidHomRef_ParentHomVar";
KidHomRef_ParentHomVar++;
} else if (childG.isHet() && (momG.isHomRef() && dadG.isHomRef())) {
label = "KidHet_ParentsHomRef";
KidHet_ParentsHomRef++;
} else if (childG.isHet() && (momG.isHomVar() && dadG.isHomVar())) {
label = "KidHet_ParentsHomVar";
KidHet_ParentsHomVar++;
} else if (childG.isHomVar() && (momG.isHomRef() || dadG.isHomRef())) {
label = "KidHomVar_ParentHomRef";
KidHomVar_ParentHomRef++;
} else {
throw new ReviewedStingException("BUG: unexpected child genotype class " + childG);
}
return "MendelViolation=" + label;
}
nFamCalled += mv.getFamilyCalledCount();
nLowQual += mv.getFamilyLowQualsCount();
nNoCall += mv.getFamilyNoCallCount();
nVarFamCalled += mv.getVarFamilyCalledCount();
}
}
return null; // we don't capture any intersting sites
}
/*
private double getQThreshold() {
//return getVEWalker().MENDELIAN_VIOLATION_QUAL_THRESHOLD / 10; // we aren't 10x scaled in the GATK a la phred
return mendelianViolationQualThreshold / 10; // we aren't 10x scaled in the GATK a la phred
//return 0.0;
}
TrioStructure trio;
double mendelianViolationQualThreshold;
private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)");
public static class TrioStructure {
public String mom, dad, child;
}
public static TrioStructure parseTrioDescription(String family) {
Matcher m = FAMILY_PATTERN.matcher(family);
if (m.matches()) {
TrioStructure trio = new TrioStructure();
//System.out.printf("Found a family pattern: %s%n", parent.FAMILY_STRUCTURE);
trio.mom = m.group(1);
trio.dad = m.group(2);
trio.child = m.group(3);
return trio;
} else {
throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child");
}
}
public void initialize(VariantEvalWalker walker) {
trio = parseTrioDescription(walker.getFamilyStructure());
mendelianViolationQualThreshold = walker.getMendelianViolationQualThreshold();
}
private boolean includeGenotype(Genotype g) {
return g.getLog10PError() > getQThreshold() && g.isCalled();
}
public static boolean isViolation(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG) {
return isViolation(vc, momG.getAlleles(), dadG.getAlleles(), childG.getAlleles());
}
public static boolean isViolation(VariantContext vc, TrioStructure trio ) {
return isViolation(vc, vc.getGenotype(trio.mom), vc.getGenotype(trio.dad), vc.getGenotype(trio.child) );
}
public static boolean isViolation(VariantContext vc, List<Allele> momA, List<Allele> dadA, List<Allele> childA) {
//VariantContext momVC = vc.subContextFromGenotypes(momG);
//VariantContext dadVC = vc.subContextFromGenotypes(dadG);
int i = 0;
Genotype childG = new Genotype("kidG", childA);
for (Allele momAllele : momA) {
for (Allele dadAllele : dadA) {
if (momAllele.isCalled() && dadAllele.isCalled()) {
Genotype possibleChild = new Genotype("possibleGenotype" + i, Arrays.asList(momAllele, dadAllele));
if (childG.sameGenotype(possibleChild)) {
return false;
}
}
else{
nSkipped++;
}
return null;
}
return true;
return null; // we don't capture any interesting sites
}
*/
}

View File

@ -26,9 +26,10 @@ package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.MendelianViolation;
import org.broadinstitute.sting.utils.variantcontext.*;
@ -41,7 +42,6 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.SampleUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.*;
@ -180,7 +180,7 @@ import java.util.*;
* </pre>
*
*/
public class SelectVariants extends RodWalker<Integer, Integer> {
public class SelectVariants extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
@ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
/**
@ -282,6 +282,9 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false)
private double fractionRandom = 0;
@Argument(fullName="remove_fraction_genotypes", shortName="fractionGenotypes", doc="Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall", required=false)
private double fractionGenotypes = 0;
/**
* This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria.
* When specified one or more times, a particular type of variant is selected.
@ -325,7 +328,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
private boolean DISCORDANCE_ONLY = false;
private boolean CONCORDANCE_ONLY = false;
private Set<MendelianViolation> mvSet = new HashSet<MendelianViolation>();
private MendelianViolation mv;
/* variables used by the SELECT RANDOM modules */
@ -344,6 +347,8 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
private PrintStream outMVFileStream = null;
//Random number generator for the genotypes to remove
private Random randomGenotypes = new Random();
/**
* Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher
@ -380,8 +385,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
for ( String sample : samples )
logger.info("Including sample '" + sample + "'");
// if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include
if (TYPES_TO_INCLUDE.isEmpty()) {
@ -421,29 +424,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
if (CONCORDANCE_ONLY) logger.info("Selecting only variants concordant with the track: " + concordanceTrack.getName());
if (MENDELIAN_VIOLATIONS) {
if ( FAMILY_STRUCTURE_FILE != null) {
try {
for ( final String line : new XReadLines( FAMILY_STRUCTURE_FILE ) ) {
MendelianViolation mv = new MendelianViolation(line, MENDELIAN_VIOLATION_QUAL_THRESHOLD);
if (samples.contains(mv.getSampleChild()) && samples.contains(mv.getSampleDad()) && samples.contains(mv.getSampleMom()))
mvSet.add(mv);
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(FAMILY_STRUCTURE_FILE, e);
}
if (outMVFile != null)
try {
outMVFileStream = new PrintStream(outMVFile);
}
catch (FileNotFoundException e) {
throw new UserException.CouldNotCreateOutputFile(outMVFile, "Can't open output file", e); }
}
else
mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD));
}
else if (!FAMILY_STRUCTURE.isEmpty()) {
mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD));
MENDELIAN_VIOLATIONS = true;
mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true);
}
SELECT_RANDOM_NUMBER = numRandom > 0;
@ -479,26 +460,26 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
}
for (VariantContext vc : vcs) {
if (MENDELIAN_VIOLATIONS) {
boolean foundMV = false;
for (MendelianViolation mv : mvSet) {
if (mv.isViolation(vc)) {
foundMV = true;
//System.out.println(vc.toString());
if (outMVFile != null)
outMVFileStream.format("MV@%s:%d. REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, " +
if (MENDELIAN_VIOLATIONS && mv.countViolations(this.getSampleDB().getFamilies(samples),vc) < 1)
break;
if (outMVFile != null){
for( String familyId : mv.getViolationFamilies()){
for(Sample sample : this.getSampleDB().getFamily(familyId)){
if(sample.getParents().size() > 0){
outMVFileStream.format("MV@%s:%d. REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, " +
"childG=%s childGL=%s\n",vc.getChr(), vc.getStart(),
vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString(), vc.getCalledChrCount(vc.getAlternateAllele(0)),
mv.getSampleMom(), mv.getSampleDad(), mv.getSampleChild(),
vc.getGenotype(mv.getSampleMom()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(),
vc.getGenotype(mv.getSampleDad()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(),
vc.getGenotype(mv.getSampleChild()).toBriefString(),vc.getGenotype(mv.getSampleChild()).getLikelihoods().getAsString() );
sample.getMaternalID(), sample.getPaternalID(), sample.getID(),
vc.getGenotype(sample.getMaternalID()).toBriefString(), vc.getGenotype(sample.getMaternalID()).getLikelihoods().getAsString(),
vc.getGenotype(sample.getPaternalID()).toBriefString(), vc.getGenotype(sample.getPaternalID()).getLikelihoods().getAsString(),
vc.getGenotype(sample.getID()).toBriefString(),vc.getGenotype(sample.getID()).getLikelihoods().getAsString() );
}
}
}
if (!foundMV)
break;
}
if (DISCORDANCE_ONLY) {
Collection<VariantContext> compVCs = tracker.getValues(discordanceTrack, context.getLocation());
if (!isDiscordant(vc, compVCs))
@ -629,6 +610,11 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Override
public Integer reduce(Integer value, Integer sum) { return value + sum; }
@Override
public Integer treeReduce(Integer lhs, Integer rhs) {
return lhs + rhs;
}
public void onTraversalDone(Integer result) {
logger.info(result + " records processed.");
@ -657,9 +643,31 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
final VariantContext sub = vc.subContextFromSamples(samples, vc.getAlleles());
VariantContextBuilder builder = new VariantContextBuilder(sub);
GenotypesContext newGC = sub.getGenotypes();
// if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs (because they are no longer accurate)
if ( vc.getAlleles().size() != sub.getAlleles().size() )
builder.genotypes(VariantContextUtils.stripPLs(vc.getGenotypes()));
newGC = VariantContextUtils.stripPLs(sub.getGenotypes());
//Remove a fraction of the genotypes if needed
if(fractionGenotypes>0){
ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
for ( Genotype genotype : newGC ) {
//Set genotype to no call if it falls in the fraction.
if(fractionGenotypes>0 && randomGenotypes.nextDouble()<fractionGenotypes){
ArrayList<Allele> alleles = new ArrayList<Allele>(2);
alleles.add(Allele.create((byte)'.'));
alleles.add(Allele.create((byte)'.'));
genotypes.add(new Genotype(genotype.getSampleName(),alleles, Genotype.NO_LOG10_PERROR,genotype.getFilters(),new HashMap<String, Object>(),false));
}
else{
genotypes.add(genotype);
}
}
newGC = GenotypesContext.create(genotypes);
}
builder.genotypes(newGC);
int depth = 0;
for (String sample : sub.getSampleNames()) {

View File

@ -1,147 +1,394 @@
package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Collection;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.*;
/**
* User: carneiro
* User: carneiro / lfran
* Date: 3/9/11
* Time: 12:38 PM
*
* Class for the identification and tracking of mendelian violation. It can be used in 2 distinct ways:
* - Either using an instance of the MendelianViolation class to track mendelian violations for each of the families while
* walking over the variants
* - Or using the static methods to directly get information about mendelian violation in a family at a given locus
*
*/
public class MendelianViolation {
String sampleMom;
String sampleDad;
String sampleChild;
//List of families with violations
private List<String> violationFamilies;
List allelesMom;
List allelesDad;
List allelesChild;
//Call information
private int nocall = 0;
private int familyCalled = 0;
private int varFamilyCalled = 0;
private int lowQual = 0;
double minGenotypeQuality;
private boolean allCalledOnly = true;
//Stores occurrences of inheritance
private EnumMap<Genotype.Type, EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>> inheritance;
private int violations_total=0;
private double minGenotypeQuality;
private boolean abortOnSampleNotFound;
//Number of families with genotype information for all members
public int getFamilyCalledCount(){
return familyCalled;
}
//Number of families with genotype information for all members
public int getVarFamilyCalledCount(){
return varFamilyCalled;
}
//Number of families missing genotypes for one or more of their members
public int getFamilyNoCallCount(){
return nocall;
}
//Number of families with genotypes below the set quality threshold
public int getFamilyLowQualsCount(){
return lowQual;
}
public int getViolationsCount(){
return violations_total;
}
//Count of alt alleles inherited from het parents (no violation)
public int getParentHetInheritedVar(){
return getParentsHetHetInheritedVar() + getParentsRefHetInheritedVar() + getParentsVarHetInheritedVar();
}
//Count of ref alleles inherited from het parents (no violation)
public int getParentHetInheritedRef(){
return getParentsHetHetInheritedRef() + getParentsRefHetInheritedRef() + getParentsVarHetInheritedRef();
}
//Count of HomRef/HomRef/HomRef trios
public int getRefRefRef(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF);
}
//Count of HomVar/HomVar/HomVar trios
public int getVarVarVar(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR);
}
//Count of HomRef/HomVar/Het trios
public int getRefVarHet(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET) +
inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET);
}
//Count of Het/Het/Het trios
public int getHetHetHet(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET);
}
//Count of Het/Het/HomRef trios
public int getHetHetHomRef(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF);
}
//Count of Het/Het/HomVar trios
public int getHetHetHomVar(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR);
}
//Count of ref alleles inherited from Het/Het parents (no violation)
public int getParentsHetHetInheritedRef(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET)
+ 2*inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF);
//return parentsHetHet_childRef;
}
//Count of var alleles inherited from Het/Het parents (no violation)
public int getParentsHetHetInheritedVar(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET)
+ 2*inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR);
//return parentsHetHet_childVar;
}
//Count of ref alleles inherited from HomRef/Het parents (no violation)
public int getParentsRefHetInheritedRef(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF);
//return parentsHomRefHet_childRef;
}
//Count of var alleles inherited from HomRef/Het parents (no violation)
public int getParentsRefHetInheritedVar(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HET)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET);
//return parentsHomRefHet_childVar;
}
//Count of ref alleles inherited from HomVar/Het parents (no violation)
public int getParentsVarHetInheritedRef(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HET)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET);
//return parentsHomVarHet_childRef;
}
//Count of var alleles inherited from HomVar/Het parents (no violation)
public int getParentsVarHetInheritedVar(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR);
//return parentsHomVarHet_childVar;
}
//Count of violations of the type HOM_REF/HOM_REF -> HOM_VAR
public int getParentsRefRefChildVar(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR);
}
//Count of violations of the type HOM_REF/HOM_REF -> HET
public int getParentsRefRefChildHet(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET);
}
//Count of violations of the type HOM_REF/HET -> HOM_VAR
public int getParentsRefHetChildVar(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR);
}
//Count of violations of the type HOM_REF/HOM_VAR -> HOM_VAR
public int getParentsRefVarChildVar(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR)
+ inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR);
}
//Count of violations of the type HOM_REF/HOM_VAR -> HOM_REF
public int getParentsRefVarChildRef(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF)
+ inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF);
}
//Count of violations of the type HOM_VAR/HET -> HOM_REF
public int getParentsVarHetChildRef(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF)
+ inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF);
}
//Count of violations of the type HOM_VAR/HOM_VAR -> HOM_REF
public int getParentsVarVarChildRef(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF);
}
//Count of violations of the type HOM_VAR/HOM_VAR -> HET
public int getParentsVarVarChildHet(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET);
}
//Count of violations of the type HOM_VAR/? -> HOM_REF
public int getParentVarChildRef(){
return getParentsRefVarChildRef() + getParentsVarHetChildRef() +getParentsVarVarChildRef();
}
//Count of violations of the type HOM_REF/? -> HOM_VAR
public int getParentRefChildVar(){
return getParentsRefVarChildVar() + getParentsRefHetChildVar() +getParentsRefRefChildVar();
}
//Returns a String containing all trios where a Mendelian violation was observed.
//The String is formatted "mom1+dad1=child1,mom2+dad2=child2,..."
public String getViolationFamiliesString(){
if(violationFamilies.isEmpty())
return "";
Iterator<String> it = violationFamilies.iterator();
String violationFams = it.next();
while(it.hasNext()){
violationFams += ","+it.next();
}
return violationFams;
}
public List<String> getViolationFamilies(){
return violationFamilies;
}
static final int[] mvOffsets = new int[] { 1,2,5,6,8,11,15,18,20,21,24,25 };
static final int[] nonMVOffsets = new int[]{ 0,3,4,7,9,10,12,13,14,16,17,19,22,23,26 };
private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)");
public String getSampleMom() {
return sampleMom;
}
public String getSampleDad() {
return sampleDad;
}
public String getSampleChild() {
return sampleChild;
}
public double getMinGenotypeQuality() {
return minGenotypeQuality;
}
/**
*
* @param sampleMomP - sample name of mom
* @param sampleDadP - sample name of dad
* @param sampleChildP - sample name of child
*/
public MendelianViolation (String sampleMomP, String sampleDadP, String sampleChildP) {
sampleMom = sampleMomP;
sampleDad = sampleDadP;
sampleChild = sampleChildP;
}
/**
*
* @param family - the sample names string "mom+dad=child"
/**
* Constructor
* @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
*/
public MendelianViolation(String family, double minGenotypeQualityP) {
minGenotypeQuality = minGenotypeQualityP;
Matcher m = FAMILY_PATTERN.matcher(family);
if (m.matches()) {
sampleMom = m.group(1);
sampleDad = m.group(2);
sampleChild = m.group(3);
}
else
throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child");
}
/**
* An alternative to the more general constructor if you want to get the Sample information from the engine yourself.
* @param sample - the sample object extracted from the sample metadata YAML file given to the engine.
* @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to assess mendelian violation
*/
public MendelianViolation(Sample sample, double minGenotypeQualityP) {
sampleMom = sample.getMother().getID();
sampleDad = sample.getFather().getID();
sampleChild = sample.getID();
minGenotypeQuality = minGenotypeQualityP;
}
/**
* This method prepares the object to evaluate for violation. Typically you won't call it directly, a call to
* isViolation(vc) will take care of this. But if you want to know whether your site was a valid comparison site
* before evaluating it for mendelian violation, you can call setAlleles and then isViolation().
* @param vc - the variant context to extract the genotypes and alleles for mom, dad and child.
* @return false if couldn't find the genotypes or context has empty alleles. True otherwise.
*/
public boolean setAlleles (VariantContext vc)
{
Genotype gMom = vc.getGenotypes(sampleMom).get(sampleMom);
Genotype gDad = vc.getGenotypes(sampleDad).get(sampleDad);
Genotype gChild = vc.getGenotypes(sampleChild).get(sampleChild);
if (gMom == null || gDad == null || gChild == null)
throw new IllegalArgumentException(String.format("Variant %s:%d didn't contain genotypes for all family members: mom=%s dad=%s child=%s", vc.getChr(), vc.getStart(), sampleMom, sampleDad, sampleChild));
if (gMom.isNoCall() || gDad.isNoCall() || gChild.isNoCall() ||
gMom.getPhredScaledQual() < minGenotypeQuality ||
gDad.getPhredScaledQual() < minGenotypeQuality ||
gChild.getPhredScaledQual() < minGenotypeQuality ) {
return false;
}
allelesMom = gMom.getAlleles();
allelesDad = gDad.getAlleles();
allelesChild = gChild.getAlleles();
return !allelesMom.isEmpty() && !allelesDad.isEmpty() && !allelesChild.isEmpty();
}
/**
*
*/
public MendelianViolation(double minGenotypeQualityP) {
this(minGenotypeQualityP,true);
}
/**
* Constructor
* @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
* @param abortOnSampleNotFound - Whether to stop execution if a family is passed but no relevant genotypes are found. If false, then the family is ignored.
*/
public MendelianViolation(double minGenotypeQualityP, boolean abortOnSampleNotFound) {
minGenotypeQuality = minGenotypeQualityP;
this.abortOnSampleNotFound = abortOnSampleNotFound;
violationFamilies = new ArrayList<String>();
createInheritanceMap();
}
/**
* Constructor
* @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
* @param abortOnSampleNotFound - Whether to stop execution if a family is passed but no relevant genotypes are found. If false, then the family is ignored.
* @param completeTriosOnly - whether only complete trios are considered or parent/child pairs are too.
*/
public MendelianViolation(double minGenotypeQualityP, boolean abortOnSampleNotFound, boolean completeTriosOnly) {
minGenotypeQuality = minGenotypeQualityP;
this.abortOnSampleNotFound = abortOnSampleNotFound;
violationFamilies = new ArrayList<String>();
createInheritanceMap();
allCalledOnly = completeTriosOnly;
}
/**
* @param families the families to be checked for Mendelian violations
* @param vc the variant context to extract the genotypes and alleles for mom, dad and child.
* @return False if we can't determine (lack of information), or it's not a violation. True if it is a violation.
*
*/
public boolean isViolation(VariantContext vc)
{
return setAlleles(vc) && isViolation();
}
/**
* @return whether or not there is a mendelian violation at the site.
*/
public boolean isViolation() {
if (allelesMom.contains(allelesChild.get(0)) && allelesDad.contains(allelesChild.get(1)) ||
allelesMom.contains(allelesChild.get(1)) && allelesDad.contains(allelesChild.get(0)))
return false;
return true;
public int countViolations(Map<String, Set<Sample>> families, VariantContext vc){
//Reset counts
nocall = 0;
lowQual = 0;
familyCalled = 0;
varFamilyCalled = 0;
violations_total=0;
violationFamilies.clear();
clearInheritanceMap();
for(Set<Sample> family : families.values()){
Iterator<Sample> sampleIterator = family.iterator();
Sample sample;
while(sampleIterator.hasNext()){
sample = sampleIterator.next();
if(sample.getParents().size() > 0)
updateViolations(sample.getFamilyID(),sample.getMaternalID(), sample.getPaternalID(), sample.getID() ,vc);
}
}
return violations_total;
}
public boolean isViolation(Sample mother, Sample father, Sample child, VariantContext vc){
//Reset counts
nocall = 0;
lowQual = 0;
familyCalled = 0;
varFamilyCalled = 0;
violations_total=0;
violationFamilies.clear();
clearInheritanceMap();
updateViolations(mother.getFamilyID(),mother.getID(),father.getID(),child.getID(),vc);
return violations_total>0;
}
private void updateViolations(String familyId, String motherId, String fatherId, String childId, VariantContext vc){
int count;
Genotype gMom = vc.getGenotype(motherId);
Genotype gDad = vc.getGenotype(fatherId);
Genotype gChild = vc.getGenotype(childId);
if (gMom == null || gDad == null || gChild == null){
if(abortOnSampleNotFound)
throw new IllegalArgumentException(String.format("Variant %s:%d: Missing genotypes for family %s: mom=%s dad=%s family=%s", vc.getChr(), vc.getStart(), familyId, motherId, fatherId, childId));
else
return;
}
//Count No calls
if(allCalledOnly && (!gMom.isCalled() || !gDad.isCalled() || !gChild.isCalled())){
nocall++;
}
else if (!gMom.isCalled() && !gDad.isCalled() || !gChild.isCalled()){
nocall++;
}
//Count lowQual. Note that if min quality is set to 0, even values with no quality associated are returned
else if (minGenotypeQuality>0 && (gMom.getPhredScaledQual() < minGenotypeQuality ||
gDad.getPhredScaledQual() < minGenotypeQuality ||
gChild.getPhredScaledQual() < minGenotypeQuality )) {
lowQual++;
}
else{
//Count all families per loci called
familyCalled++;
//If the family is all homref, not too interesting
if(!(gMom.isHomRef() && gDad.isHomRef() && gChild.isHomRef()))
{
varFamilyCalled++;
if(isViolation(gMom, gDad, gChild)){
violationFamilies.add(familyId);
violations_total++;
}
}
count = inheritance.get(gMom.getType()).get(gDad.getType()).get(gChild.getType());
inheritance.get(gMom.getType()).get(gDad.getType()).put(gChild.getType(),count+1);
}
}
private boolean isViolation(Genotype gMom, Genotype gDad, Genotype gChild) {
//1 parent is no "call
if(!gMom.isCalled()){
return (gDad.isHomRef() && gChild.isHomVar()) || (gDad.isHomVar() && gChild.isHomRef());
}
else if(!gDad.isCalled()){
return (gMom.isHomRef() && gChild.isHomVar()) || (gMom.isHomVar() && gChild.isHomRef());
}
//Both parents have genotype information
return !(gMom.getAlleles().contains(gChild.getAlleles().get(0)) && gDad.getAlleles().contains(gChild.getAlleles().get(1)) ||
gMom.getAlleles().contains(gChild.getAlleles().get(1)) && gDad.getAlleles().contains(gChild.getAlleles().get(0)));
}
private void createInheritanceMap(){
inheritance = new EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>>(Genotype.Type.class);
for(Genotype.Type mType : Genotype.Type.values()){
inheritance.put(mType, new EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>(Genotype.Type.class));
for(Genotype.Type dType : Genotype.Type.values()){
inheritance.get(mType).put(dType, new EnumMap<Genotype.Type,Integer>(Genotype.Type.class));
for(Genotype.Type cType : Genotype.Type.values()){
inheritance.get(mType).get(dType).put(cType, 0);
}
}
}
}
private void clearInheritanceMap(){
for(Genotype.Type mType : Genotype.Type.values()){
for(Genotype.Type dType : Genotype.Type.values()){
for(Genotype.Type cType : Genotype.Type.values()){
inheritance.get(mType).get(dType).put(cType, 0);
}
}
}
}
/**
* @return the likelihood ratio for a mendelian violation
*/
public double violationLikelihoodRatio(VariantContext vc) {
public double violationLikelihoodRatio(VariantContext vc, String motherId, String fatherId, String childId) {
double[] logLikAssignments = new double[27];
// the matrix to set up is
// MOM DAD CHILD
@ -152,9 +399,9 @@ public class MendelianViolation {
// AA AB | AB
// |- BB
// etc. The leaves are counted as 0-11 for MVs and 0-14 for non-MVs
double[] momGL = vc.getGenotype(sampleMom).getLikelihoods().getAsVector();
double[] dadGL = vc.getGenotype(sampleDad).getLikelihoods().getAsVector();
double[] childGL = vc.getGenotype(sampleChild).getLikelihoods().getAsVector();
double[] momGL = vc.getGenotype(motherId).getLikelihoods().getAsVector();
double[] dadGL = vc.getGenotype(fatherId).getLikelihoods().getAsVector();
double[] childGL = vc.getGenotype(childId).getLikelihoods().getAsVector();
int offset = 0;
for ( int oMom = 0; oMom < 3; oMom++ ) {
for ( int oDad = 0; oDad < 3; oDad++ ) {

View File

@ -27,11 +27,42 @@ public class SampleDBUnitTest extends BaseTest {
new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED),
new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.AFFECTED)));
private static final Set<Sample> testPEDFamilyF2 = new HashSet<Sample>(Arrays.asList(
new Sample("s2", "fam2", "d2", "m2", Gender.FEMALE, Affection.AFFECTED),
new Sample("d2", "fam2", null, null, Gender.MALE, Affection.UNKNOWN),
new Sample("m2", "fam2", null, null, Gender.FEMALE, Affection.UNKNOWN)
));
private static final Set<Sample> testPEDFamilyF3 = new HashSet<Sample>(Arrays.asList(
new Sample("s1", "fam3", "d1", "m1", Gender.FEMALE, Affection.AFFECTED),
new Sample("d1", "fam3", null, null, Gender.FEMALE, Affection.UNKNOWN),
new Sample("m1", "fam3", null, null, Gender.FEMALE, Affection.UNKNOWN)
));
private static final Set<Sample> testSAMSamples = new HashSet<Sample>(Arrays.asList(
new Sample("kid", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN),
new Sample("mom", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN),
new Sample("dad", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN)));
private static final HashMap<String, Set<Sample>> testGetFamilies = new HashMap<String,Set<Sample>>();
static {
testGetFamilies.put("fam1", testPEDSamples);
testGetFamilies.put("fam2", testPEDFamilyF2);
testGetFamilies.put("fam3", testPEDFamilyF3);
}
private static final Set<Sample> testKidsWithParentsFamilies2 = new HashSet<Sample>(Arrays.asList(
new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED),
new Sample("kid3", "fam5", "dad2", "mom2", Gender.MALE, Affection.AFFECTED),
new Sample("kid2", "fam5", "dad2", "mom2", Gender.MALE, Affection.AFFECTED)));
private static final HashSet<String> testGetPartialFamiliesIds = new HashSet<String>(Arrays.asList("kid","s1"));
private static final HashMap<String, Set<Sample>> testGetPartialFamilies = new HashMap<String,Set<Sample>>();
static {
testGetPartialFamilies.put("fam1", new HashSet<Sample>(Arrays.asList(new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED))));
testGetPartialFamilies.put("fam3", new HashSet<Sample>(Arrays.asList(new Sample("s1", "fam3", "d1", "m1", Gender.FEMALE, Affection.AFFECTED))));
}
private static final String testPEDString =
String.format("%s%n%s%n%s",
"fam1 kid dad mom 1 2",
@ -46,6 +77,18 @@ public class SampleDBUnitTest extends BaseTest {
"fam3 s1 d1 m1 2 2",
"fam2 s2 d2 m2 2 2");
private static final String testPEDMultipleFamilies2 =
String.format("%s%n%s%n%s%n%s%n%s%n%s%n%s%n%s%n%s",
"fam1 kid dad mom 1 2",
"fam1 dad 0 0 1 1",
"fam1 mom 0 0 2 2",
"fam4 kid4 dad4 0 1 2",
"fam4 dad4 0 0 1 1",
"fam5 kid2 dad2 mom2 1 2",
"fam5 kid3 dad2 mom2 1 2",
"fam5 dad2 0 0 1 1",
"fam5 mom2 0 0 2 2");
private static final String testPEDStringInconsistentGender =
"fam1 kid 0 0 2 2";
@ -138,6 +181,25 @@ public class SampleDBUnitTest extends BaseTest {
Assert.assertEquals(db.getFamily("fam1"), testPEDSamplesAsSet);
}
@Test()
public void getFamilies(){
builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies));
SampleDB db = builder.getFinalSampleDB();
Assert.assertEquals(db.getFamilies(),testGetFamilies);
Assert.assertEquals(db.getFamilies(null),testGetFamilies);
Assert.assertEquals(db.getFamilies(testGetPartialFamiliesIds),testGetPartialFamilies);
}
@Test()
public void testGetChildrenWithParents()
{
builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies2));
SampleDB db = builder.getFinalSampleDB();
Assert.assertEquals(db.getChildrenWithParents(), testKidsWithParentsFamilies2);
Assert.assertEquals(db.getChildrenWithParents(false), testKidsWithParentsFamilies2);
Assert.assertEquals(db.getChildrenWithParents(true), new HashSet<Sample>(Arrays.asList(new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED))));
}
@Test()
public void loadFamilyIDs() {
builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies));

View File

@ -0,0 +1,104 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.Arrays;
public class ExactAFCalculationModelUnitTest extends BaseTest {
static double[] AA1, AB1, BB1;
static double[] AA2, AB2, AC2, BB2, BC2, CC2;
static final int numSamples = 3;
static double[][] priors = new double[2][2*numSamples+1]; // flat priors
@BeforeSuite
public void before() {
AA1 = new double[]{0.0, -20.0, -20.0};
AB1 = new double[]{-20.0, 0.0, -20.0};
BB1 = new double[]{-20.0, -20.0, 0.0};
AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0};
AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0};
AC2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0};
BB2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0};
BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0};
CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0};
}
private class GetGLsTest extends TestDataProvider {
GenotypesContext GLs;
int numAltAlleles;
String name;
private GetGLsTest(String name, int numAltAlleles, Genotype... arg) {
super(GetGLsTest.class, name);
GLs = GenotypesContext.create(arg);
this.name = name;
this.numAltAlleles = numAltAlleles;
}
public String toString() {
return String.format("%s input=%s", super.toString(), GLs);
}
}
private static Genotype createGenotype(String name, double[] gls) {
return new Genotype(name, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), Genotype.NO_LOG10_PERROR, gls);
}
@DataProvider(name = "getGLs")
public Object[][] createGLsData() {
// bi-allelic case
new GetGLsTest("B0", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AA3", AA1));
new GetGLsTest("B1", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AB", AB1));
new GetGLsTest("B2", 1, createGenotype("AA1", AA1), createGenotype("BB", BB1), createGenotype("AA2", AA1));
new GetGLsTest("B3a", 1, createGenotype("AB", AB1), createGenotype("AA", AA1), createGenotype("BB", BB1));
new GetGLsTest("B3b", 1, createGenotype("AB1", AB1), createGenotype("AB2", AB1), createGenotype("AB3", AB1));
new GetGLsTest("B4", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("AA", AA1));
new GetGLsTest("B5", 1, createGenotype("BB1", BB1), createGenotype("AB", AB1), createGenotype("BB2", BB1));
new GetGLsTest("B6", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("BB3", BB1));
// tri-allelic case
new GetGLsTest("B1C0", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AB", AB2));
new GetGLsTest("B0C1", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AC", AC2));
new GetGLsTest("B1C1a", 2, createGenotype("AA", AA2), createGenotype("AB", AB2), createGenotype("AC", AC2));
new GetGLsTest("B1C1b", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("BC", BC2));
new GetGLsTest("B2C1", 2, createGenotype("AB1", AB2), createGenotype("AB2", AB2), createGenotype("AC", AC2));
new GetGLsTest("B3C2a", 2, createGenotype("AB", AB2), createGenotype("BC1", BC2), createGenotype("BC2", BC2));
new GetGLsTest("B3C2b", 2, createGenotype("AB", AB2), createGenotype("BB", BB2), createGenotype("CC", CC2));
return GetGLsTest.getTests(GetGLsTest.class);
}
@Test(dataProvider = "getGLs")
public void testGLs(GetGLsTest cfg) {
final double[][] log10AlleleFrequencyLikelihoods = new double[2][2*numSamples+1];
final double[][] log10AlleleFrequencyPosteriors = new double[2][2*numSamples+1];
for ( int i = 0; i < 2; i++ ) {
for ( int j = 0; j < 2*numSamples+1; j++ ) {
log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
}
}
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors, false);
int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
int calculatedAlleleCount = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors[allele]);
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
}
}
}

View File

@ -7,7 +7,6 @@ import org.testng.annotations.Test;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
// ********************************************************************************** //
@ -29,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("286f0de92e4ce57986ba861390c6019d"));
Arrays.asList("a2d3839c4ebb390b0012d495e4e53b3a"));
executeTest("test MultiSample Pilot1", spec);
}
@ -45,7 +44,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testWithAllelesPassedIn2() {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("d0593483e85a7d815f4c5ee6db284d2a"));
Arrays.asList("43e7a17d95b1a0cf72e669657794d802"));
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
}
@ -53,7 +52,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("3ccce5d909f8f128e496f6841836e5f7"));
Arrays.asList("ae29b9c9aacce8046dc780430540cd62"));
executeTest("test SingleSample Pilot2", spec);
}
@ -63,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
private final static String COMPRESSED_OUTPUT_MD5 = "890143b366050e78d6c6ba6b2c6b6864";
private final static String COMPRESSED_OUTPUT_MD5 = "fda341de80b3f6fd42a83352b18b1d65";
@Test
public void testCompressedOutput() {
@ -84,7 +83,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// Note that we need to turn off any randomization for this to work, so no downsampling and no annotations
String md5 = "95614280c565ad90f8c000376fef822c";
String md5 = "32a34362dff51d8b73a3335048516d82";
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
@ -165,8 +164,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testHeterozyosity() {
HashMap<Double, String> e = new HashMap<Double, String>();
e.put( 0.01, "46243ecc2b9dc716f48ea280c9bb7e72" );
e.put( 1.0 / 1850, "6b2a59dbc76984db6d4d6d6b5ee5d62c" );
e.put( 0.01, "2cb2544739e01f6c08fd820112914317" );
e.put( 1.0 / 1850, "730b2b83a4b1f6d46fc3b5cd7d90756c" );
for ( Map.Entry<Double, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -276,7 +275,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("6e182a58472ea17c8b0eb01f80562fbd"));
Arrays.asList("45633d905136c86e9d3f90ce613255e5"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2);
}
@ -286,7 +285,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1,
Arrays.asList("f93f8a35b47bcf96594ada55e2312c73"));
Arrays.asList("98a4d1e1e0a363ba37518563ac6cbead"));
executeTest("test MultiSample Pilot2 indels with complicated records", spec3);
}
@ -295,8 +294,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
Arrays.asList("9be28cb208d8b0314d2bc2696e2fd8d4"));
executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4);
Arrays.asList("915e7a3e7cbfd995dbc41fdd382d0d51"));
executeTest("test MultiSample Phase1 indels with complicated records", spec4);
}
@Test

View File

@ -291,6 +291,17 @@ public class VariantEvalIntegrationTest extends WalkerTest {
executeTestParallel("testVEGenotypeConcordance" + vcfFile, spec);
}
@Test
public void testVEMendelianViolationEvaluator() {
String vcfFile = "/MendelianViolationEval.vcf";
String pedFile = "/MendelianViolationEval.ped";
WalkerTestSpec spec = new WalkerTestSpec("-T VariantEval -R "+b37KGReference+" --eval " + variantEvalTestDataRoot + vcfFile + " -ped "+ variantEvalTestDataRoot + pedFile +" -noEV -EV MendelianViolationEvaluator -L 1:10109-10315 -o %s -mvq 0 -noST",
1,
Arrays.asList("66e72c887124f40933d32254b2dd44a3"));
executeTestParallel("testVEMendelianViolationEvaluator" + vcfFile, spec);
}
@Test
public void testCompVsEvalAC() {
String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf";

View File

@ -115,4 +115,26 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
executeTest("testUsingDbsnpName--" + testFile, spec);
}
@Test
public void testParallelization() {
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
WalkerTestSpec spec;
spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 2"),
1,
Arrays.asList("d18516c1963802e92cb9e425c0b75fd6")
);
executeTest("testParallelization (2 threads)--" + testfile, spec);
spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 4"),
1,
Arrays.asList("d18516c1963802e92cb9e425c0b75fd6")
);
executeTest("testParallelization (4 threads)--" + testfile, spec);
}
}

View File

@ -83,6 +83,10 @@ class DataProcessingPipeline extends QScript {
@Input(doc="Define the default platform for Count Covariates -- useful for techdev purposes only.", fullName="default_platform", shortName="dp", required=false)
var defaultPlatform: String = ""
@Hidden
@Input(doc="Run the pipeline in test mode only", fullName = "test_mode", shortName = "test", required=false)
var testMode: Boolean = false
/****************************************************************************
* Global Variables
@ -335,6 +339,7 @@ class DataProcessingPipeline extends QScript {
this.known ++= qscript.indels
this.consensusDeterminationModel = cleanModelEnum
this.compress = 0
this.noPGTag = qscript.testMode;
this.scatterCount = nContigs
this.analysisName = queueLogDir + outBam + ".clean"
this.jobName = queueLogDir + outBam + ".clean"
@ -360,6 +365,7 @@ class DataProcessingPipeline extends QScript {
this.out = outBam
if (!qscript.intervalString.isEmpty()) this.intervalsString ++= List(qscript.intervalString)
else if (qscript.intervals != null) this.intervals :+= qscript.intervals
this.no_pg_tag = qscript.testMode
this.scatterCount = nContigs
this.isIntermediate = false
this.analysisName = queueLogDir + outBam + ".recalibration"

View File

@ -47,6 +47,10 @@ class PacbioProcessingPipeline extends QScript {
@Input(shortName="bwastring", required=false)
var bwastring: String = ""
@Hidden
@Input(shortName = "test", fullName = "test_mode", required = false)
var testMode: Boolean = false
val queueLogDir: String = ".qlog/"
def script = {
@ -170,6 +174,7 @@ class PacbioProcessingPipeline extends QScript {
this.input_file :+= inBam
this.recal_file = inRecalFile
this.out = outBam
this.no_pg_tag = testMode
this.isIntermediate = false
this.analysisName = queueLogDir + outBam + ".recalibration"
this.jobName = queueLogDir + outBam + ".recalibration"
@ -177,7 +182,6 @@ class PacbioProcessingPipeline extends QScript {
}
case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates {
this.resources = R
this.recal_file = inRecalFile
this.output_dir = outPath
this.analysisName = queueLogDir + inRecalFile + ".analyze_covariates"

View File

@ -0,0 +1,69 @@
package org.broadinstitute.sting.queue.pipeline
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
import org.testng.annotations.Test
import org.broadinstitute.sting.BaseTest
class DataProcessingPipelineTest {
@Test
def testSimpleBAM {
val projectName = "test1"
val testOut = projectName + ".exampleBAM.bam.clean.dedup.recal.bam"
val spec = new PipelineTestSpec
spec.name = "DataProcessingPipeline"
spec.args = Array(
" -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala",
" -R " + BaseTest.testDir + "exampleFASTA.fasta",
" -i " + BaseTest.testDir + "exampleBAM.bam",
" -D " + BaseTest.testDir + "exampleDBSNP.vcf",
" -nv ",
" -test ",
" -p " + projectName).mkString
spec.fileMD5s += testOut -> "1f85e76de760167a77ed1d9ab4da2936"
PipelineTest.executeTest(spec)
}
@Test
def testBWAPEBAM {
val projectName = "test2"
val testOut = projectName + ".exampleBAM.bam.clean.dedup.recal.bam"
val spec = new PipelineTestSpec
spec.name = "DataProcessingPipeline"
spec.args = Array(
" -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala",
" -R " + BaseTest.testDir + "exampleFASTA.fasta",
" -i " + BaseTest.testDir + "exampleBAM.bam",
" -D " + BaseTest.testDir + "exampleDBSNP.vcf",
" -nv ",
" -test ",
" -bwa /home/unix/carneiro/bin/bwa",
" -bwape ",
" -p " + projectName).mkString
spec.fileMD5s += testOut -> "57416a0abdf9524bc92834d466529708"
PipelineTest.executeTest(spec)
}
}

View File

@ -0,0 +1,46 @@
package org.broadinstitute.sting.queue.pipeline
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
import org.testng.annotations.Test
import org.broadinstitute.sting.BaseTest
class PacbioProcessingPipelineTest {
@Test
def testBAM {
val testOut = "exampleBAM.recal.bam"
val spec = new PipelineTestSpec
spec.name = "pacbioProcessingPipeline"
spec.args = Array(
" -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala",
" -R " + BaseTest.testDir + "exampleFASTA.fasta",
" -i " + BaseTest.testDir + "exampleBAM.bam",
" -blasr ",
" -test ",
" -D " + BaseTest.testDir + "exampleDBSNP.vcf").mkString
spec.fileMD5s += testOut -> "f0adce660b55cb91d5f987f9a145471e"
PipelineTest.executeTest(spec)
}
}

Binary file not shown.

Binary file not shown.

282
public/testdata/exampleDBSNP.vcf vendored 100644
View File

@ -0,0 +1,282 @@
##fileformat=VCFv4.1
##FILTER=<ID=NC,Description="Inconsistent Genotype Submission For At Least One Sample">
##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=ASP,Number=0,Type=Flag,Description="Is Assembly specific. This is set if the variant only maps to one assembly">
##INFO=<ID=ASS,Number=0,Type=Flag,Description="In acceptor splice site FxnCode = 73">
##INFO=<ID=CDA,Number=0,Type=Flag,Description="Variation is interrogated in a clinical diagnostic assay">
##INFO=<ID=CFL,Number=0,Type=Flag,Description="Has Assembly conflict. This is for weight 1 and 2 variant that maps to different chromosomes on different assemblies.">
##INFO=<ID=CLN,Number=0,Type=Flag,Description="Variant is Clinical(LSDB,OMIM,TPA,Diagnostic)">
##INFO=<ID=DSS,Number=0,Type=Flag,Description="In donor splice-site FxnCode = 75">
##INFO=<ID=G5,Number=0,Type=Flag,Description=">5% minor allele frequency in 1+ populations">
##INFO=<ID=G5A,Number=0,Type=Flag,Description=">5% minor allele frequency in each and all populations">
##INFO=<ID=GCF,Number=0,Type=Flag,Description="Has Genotype Conflict Same (rs, ind), different genotype. N/N is not included.">
##INFO=<ID=GENEINFO,Number=1,Type=String,Description="Pairs each of gene symbol:gene id. The gene symbol and id are delimited by a colon (:) and each pair is delimited by a vertical bar (|)">
##INFO=<ID=GMAF,Number=1,Type=Float,Description="Global Minor Allele Frequency [0, 0.5]; global population is 1000GenomesProject phase 1 genotype data from 629 individuals, released in the 08-04-2010 dataset">
##INFO=<ID=GNO,Number=0,Type=Flag,Description="Genotypes available. The variant has individual genotype (in SubInd table).">
##INFO=<ID=HD,Number=0,Type=Flag,Description="Marker is on high density genotyping kit (50K density or greater). The variant may have phenotype associations present in dbGaP.">
##INFO=<ID=INT,Number=0,Type=Flag,Description="In Intron FxnCode = 6">
##INFO=<ID=KGPROD,Number=0,Type=Flag,Description="1000 Genome production phase">
##INFO=<ID=KGPilot1,Number=0,Type=Flag,Description="1000 Genome discovery(pilot1) 2009">
##INFO=<ID=KGPilot123,Number=0,Type=Flag,Description="1000 Genome discovery all pilots 2010(1,2,3)">
##INFO=<ID=KGVAL,Number=0,Type=Flag,Description="1000 Genome validated by second method">
##INFO=<ID=LSD,Number=0,Type=Flag,Description="Submitted from a locus-specific database">
##INFO=<ID=MTP,Number=0,Type=Flag,Description="Microattribution/third-party annotation(TPA:GWAS,PAGE)">
##INFO=<ID=MUT,Number=0,Type=Flag,Description="Is mutation (journal citation, explicit fact): a low frequency variation that is cited in journal and other reputable sources">
##INFO=<ID=NOC,Number=0,Type=Flag,Description="Contig allele not present in variant allele list. The reference sequence allele at the mapped position is not present in the variant allele list, adjusted for orientation.">
##INFO=<ID=NOV,Number=0,Type=Flag,Description="Rs cluster has non-overlapping allele sets. True when rs set has more than 2 alleles from different submissions and these sets share no alleles in common.">
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##INFO=<ID=NSF,Number=0,Type=Flag,Description="Has non-synonymous frameshift A coding region variation where one allele in the set changes all downstream amino acids. FxnClass = 44">
##INFO=<ID=NSM,Number=0,Type=Flag,Description="Has non-synonymous missense A coding region variation where one allele in the set changes protein peptide. FxnClass = 42">
##INFO=<ID=NSN,Number=0,Type=Flag,Description="Has non-synonymous nonsense A coding region variation where one allele in the set changes to STOP codon (TER). FxnClass = 41">
##INFO=<ID=OM,Number=0,Type=Flag,Description="Has OMIM/OMIA">
##INFO=<ID=OTH,Number=0,Type=Flag,Description="Has other variant with exactly the same set of mapped positions on NCBI refernce assembly.">
##INFO=<ID=PH1,Number=0,Type=Flag,Description="Phase 1 genotyped: filtered, non-redundant">
##INFO=<ID=PH2,Number=0,Type=Flag,Description="Phase 2 genotyped: filtered, non-redundant">
##INFO=<ID=PH3,Number=0,Type=Flag,Description="Phase 3 genotyped: filtered, non-redundant">
##INFO=<ID=PM,Number=0,Type=Flag,Description="Variant is Precious(Clinical,Pubmed Cited)">
##INFO=<ID=PMC,Number=0,Type=Flag,Description="Links exist to PubMed Central article">
##INFO=<ID=R3,Number=0,Type=Flag,Description="In 3' gene region FxnCode = 13">
##INFO=<ID=R5,Number=0,Type=Flag,Description="In 5' gene region FxnCode = 15">
##INFO=<ID=REF,Number=0,Type=Flag,Description="Has reference A coding region variation where one allele in the set is identical to the reference sequence. FxnCode = 8">
##INFO=<ID=RSPOS,Number=1,Type=Integer,Description="Chr position reported in dbSNP">
##INFO=<ID=RV,Number=0,Type=Flag,Description="RS orientation is reversed">
##INFO=<ID=S3D,Number=0,Type=Flag,Description="Has 3D structure - SNP3D table">
##INFO=<ID=SAO,Number=1,Type=Integer,Description="Variant Allele Origin: 0 - unspecified, 1 - Germline, 2 - Somatic, 3 - Both">
##INFO=<ID=SCS,Number=1,Type=Integer,Description="Variant Clinical Significance, 0 - unknown, 1 - untested, 2 - non-pathogenic, 3 - probable-non-pathogenic, 4 - probable-pathogenic, 5 - pathogenic, 6 - drug-response, 7 - histocompatibility, 255 - other">
##INFO=<ID=SLO,Number=0,Type=Flag,Description="Has SubmitterLinkOut - From SNP->SubSNP->Batch.link_out">
##INFO=<ID=SSR,Number=1,Type=Integer,Description="Variant Suspect Reason Code, 0 - unspecified, 1 - Paralog, 2 - byEST, 3 - Para_EST, 4 - oldAlign, 5 - other">
##INFO=<ID=SYN,Number=0,Type=Flag,Description="Has synonymous A coding region variation where one allele in the set does not change the encoded amino acid. FxnCode = 3">
##INFO=<ID=TPA,Number=0,Type=Flag,Description="Provisional Third Party Annotation(TPA) (currently rs from PHARMGKB who will give phenotype data)">
##INFO=<ID=U3,Number=0,Type=Flag,Description="In 3' UTR Location is in an untranslated region (UTR). FxnCode = 53">
##INFO=<ID=U5,Number=0,Type=Flag,Description="In 5' UTR Location is in an untranslated region (UTR). FxnCode = 55">
##INFO=<ID=VC,Number=1,Type=String,Description="Variation Class">
##INFO=<ID=VLD,Number=0,Type=Flag,Description="Is Validated. This bit is set if the variant has 2+ minor allele count based on frequency or genotype data.">
##INFO=<ID=VP,Number=1,Type=String,Description="Variation Property">
##INFO=<ID=WGT,Number=1,Type=Integer,Description="Weight, 00 - unmapped, 1 - weight 1, 2 - weight 2, 3 - weight 3 or more">
##INFO=<ID=WTD,Number=0,Type=Flag,Description="Is Withdrawn by submitter If one member ss is withdrawn by submitter, then this bit is set. If all member ss' are withdrawn, then the rs is deleted to SNPHistory">
##INFO=<ID=dbSNPBuildID,Number=1,Type=Integer,Description="First dbSNP Build for RS">
##LeftAlignVariants="analysis_type=LeftAlignVariants input_file=[] read_buffer_size=null phone_home=STANDARD read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL reference_sequence=/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta rodBind=[] nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false defaultBaseQualities=-1 validation_strictness=SILENT unsafe=null num_threads=1 read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false disable_experimental_low_memory_sharding=false logging_level=INFO log_to_file=null help=false variant=(RodBinding name=variant source=00-All.vcf) out=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub NO_HEADER=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub filter_mismatching_base_and_quals=false"
##contig=<ID=chr1,length=249250621,assembly=b37>
##phasing=partial
##reference=GRCh37.3
##reference=file:///humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta
##source=dbSNP
##variationPropertyDocumentationUrl=ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 10144 rs144773400 TA T . PASS ASP;RSPOS=10145;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134
chr1 10228 rs143255646 TA T . PASS ASP;RSPOS=10229;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134
chr1 10234 rs145599635 C T . PASS ASP;RSPOS=10234;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
chr1 10248 rs148908337 A T . PASS ASP;RSPOS=10248;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
chr1 10254 rs140194106 TA T . PASS ASP;RSPOS=10255;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134
chr1 10291 rs145427775 C T . PASS ASP;RSPOS=10291;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
chr1 10327 rs112750067 T C . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10327;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=132
chr1 10329 rs150969722 AC A . PASS ASP;RSPOS=10330;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134
chr1 10351 rs145072688 CTA C,CA . PASS ASP;RSPOS=10352;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134
chr1 10382 rs147093981 AAC A,AC . PASS ASP;RSPOS=10383;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134
chr1 10433 rs56289060 A AC . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10433;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
chr1 10439 rs112766696 AC A . PASS ASP;GENEINFO=LOC100652771:100652771;GNO;RSPOS=10440;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000100000200;WGT=0;dbSNPBuildID=132
chr1 10439 rs138941843 AC A . PASS ASP;RSPOS=10440;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134
chr1 10440 rs112155239 C A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10440;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=132
chr1 10492 rs55998931 C T . PASS ASP;GENEINFO=LOC100652771:100652771;GMAF=0.0617001828153565;RSPOS=10492;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040000000100;WGT=0;dbSNPBuildID=129
chr1 10519 rs62636508 G C . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10519;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=129
chr1 10583 rs58108140 G A . PASS ASP;GENEINFO=LOC100652771:100652771;GMAF=0.270566727605119;KGPilot123;RSPOS=10583;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040010000100;WGT=0;dbSNPBuildID=129
chr1 10611 rs189107123 C G . PASS KGPilot123;RSPOS=10611;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 10828 rs10218492 G A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10828;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=119
chr1 10904 rs10218493 G A . PASS ASP;GENEINFO=LOC100652771:100652771;GNO;RSPOS=10904;SAO=0;SSR=0;VC=SNV;VP=050000000004000100000100;WGT=0;dbSNPBuildID=119
chr1 10927 rs10218527 A G . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10927;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=119
chr1 10938 rs28853987 G A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10938;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=125
chr1 11014 rs28484712 G A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=11014;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=125
chr1 11022 rs28775022 G A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=11022;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=125
chr1 11081 rs10218495 G T . PASS CFL;GENEINFO=LOC100652771:100652771;GNO;RSPOS=11081;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=119
chr1 11863 rs187669455 C A . PASS RSPOS=11863;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=135
chr1 13302 rs180734498 C T . PASS KGPilot123;RSPOS=13302;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 13327 rs144762171 G C . PASS ASP;KGPilot123;RSPOS=13327;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 13684 rs71260404 C T . PASS GENEINFO=LOC100652771:100652771;GNO;RSPOS=13684;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000000100000100;WGT=0;dbSNPBuildID=130
chr1 13980 rs151276478 T C . PASS ASP;KGPilot123;RSPOS=13980;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 14889 rs142444908 G A . PASS ASP;RSPOS=14889;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
chr1 14907 rs79585140 A G . PASS GNO;RSPOS=14907;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000000040100000100;WGT=0;dbSNPBuildID=131
chr1 14930 rs75454623 A G . PASS GNO;RSPOS=14930;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000000040100000100;WGT=0;dbSNPBuildID=131
chr1 14976 rs71252251 G A . PASS ASP;GNO;RSPOS=14976;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000100000100;WGT=0;dbSNPBuildID=130
chr1 15061 rs71268703 T TG . PASS ASP;GNO;RSPOS=15061;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000100000200;WGT=0;dbSNPBuildID=130
chr1 15118 rs71252250 A G . PASS ASP;GNO;RSPOS=15118;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000100000100;WGT=0;dbSNPBuildID=130
chr1 15211 rs144718396 T G . PASS ASP;RSPOS=15211;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
chr1 15211 rs78601809 T G . PASS ASP;GNO;RSPOS=15211;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040100000100;WGT=0;dbSNPBuildID=131
chr1 16257 rs78588380 G C . PASS ASP;GNO;RSPOS=16257;SAO=0;SSR=0;VC=SNV;VP=050000000004000100000100;WGT=0;dbSNPBuildID=131
chr1 16378 rs148220436 T C . PASS ASP;RSPOS=16378;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
chr1 16495 rs141130360 G C . PASS ASP;RSPOS=16495;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
chr1 16497 rs150723783 A G . PASS ASP;RSPOS=16497;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
chr1 17519 rs192890528 G T . PASS RSPOS=17519;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=135
chr1 19226 rs138930629 T A . PASS ASP;RSPOS=19226;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
chr1 20141 rs56336884 G A . PASS HD;RSPOS=20141;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000000400000100;WGT=0;dbSNPBuildID=129
chr1 20144 rs143346096 G A . PASS ASP;RSPOS=20144;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
chr1 20206 rs71262675 C T . PASS GNO;RSPOS=20206;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000000100000100;WGT=0;dbSNPBuildID=130
chr1 20245 rs71262674 G A . PASS GMAF=0.256398537477148;GNO;RSPOS=20245;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000000100000100;WGT=0;dbSNPBuildID=130
chr1 20304 rs71262673 G C . PASS GMAF=0.338208409506399;GNO;RSPOS=20304;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000000100000100;WGT=0;dbSNPBuildID=130
chr1 26999 rs147506580 A G . PASS ASP;RSPOS=26999;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
chr1 29436 rs2462493 G A . PASS GNO;RSPOS=29436;SAO=0;SSR=0;VC=SNV;VP=050000000000000100000100;WGT=0;dbSNPBuildID=100
chr1 30923 rs140337953 G T . PASS ASP;KGPilot123;RSPOS=30923;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 33487 rs77459554 C T . PASS ASP;GNO;RSPOS=33487;SAO=0;SSR=0;VC=SNV;VP=050000000004000100000100;WGT=0;dbSNPBuildID=131
chr1 33495 rs75468675 C T . PASS ASP;GNO;RSPOS=33495;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040100000100;WGT=0;dbSNPBuildID=131
chr1 33505 rs75627161 T C . PASS ASP;GNO;RSPOS=33505;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040100000100;WGT=0;dbSNPBuildID=131
chr1 33508 rs75609629 A T . PASS ASP;GNO;RSPOS=33508;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040100000100;WGT=0;dbSNPBuildID=131
chr1 33521 rs76098219 T A . PASS GNO;RSPOS=33521;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000000040100000100;WGT=0;dbSNPBuildID=131
chr1 33593 rs557585 G A . PASS RSPOS=33593;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=83
chr1 33648 rs62028204 G T . PASS RSPOS=33648;RV;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=129
chr1 33656 rs113821789 T C . PASS RSPOS=33656;RV;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=132
chr1 51476 rs187298206 T C . PASS KGPilot123;RSPOS=51476;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 51479 rs116400033 T A . PASS ASP;G5;G5A;GMAF=0.113802559414991;KGPilot123;RSPOS=51479;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004070010000100;WGT=0;dbSNPBuildID=132
chr1 51803 rs62637812 T C . PASS GMAF=0.468921389396709;RSPOS=51803;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000000040000000100;WGT=0;dbSNPBuildID=129
chr1 51898 rs76402894 C A . PASS GMAF=0.0731261425959781;GNO;RSPOS=51898;SAO=0;SSR=0;VC=SNV;VP=050000000000000100000100;WGT=0;dbSNPBuildID=131
chr1 51914 rs190452223 T G . PASS KGPilot123;RSPOS=51914;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 51928 rs78732933 G A . PASS GNO;RSPOS=51928;SAO=0;SSR=0;VC=SNV;VP=050000000000000100000100;WGT=0;dbSNPBuildID=131
chr1 51935 rs181754315 C T . PASS KGPilot123;RSPOS=51935;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 51954 rs185832753 G C . PASS KGPilot123;RSPOS=51954;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 52058 rs62637813 G C . PASS GMAF=0.0342778793418647;KGPilot123;RSPOS=52058;SAO=0;SSR=1;VC=SNV;VLD;VP=050000000000040010000140;WGT=0;dbSNPBuildID=129
chr1 52144 rs190291950 T A . PASS KGPilot123;RSPOS=52144;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 52238 rs150021059 T G . PASS ASP;KGPilot123;RSPOS=52238;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 54353 rs140052487 C A . PASS ASP;KGPilot123;RSPOS=54353;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 54421 rs146477069 A G . PASS ASP;KGPilot123;RSPOS=54421;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 54490 rs141149254 G A . PASS ASP;KGPilot123;RSPOS=54490;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 54676 rs2462492 C T . PASS ASP;GMAF=0.191956124314442;GNO;HD;KGPilot123;RSPOS=54676;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040510000100;WGT=0;dbSNPBuildID=100
chr1 54753 rs143174675 T G . PASS ASP;KGPilot123;RSPOS=54753;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 54788 rs59861892 CC C,CCT . PASS ASP;RSPOS=54789;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
chr1 54795 rs58014817 T A . PASS ASP;RSPOS=54795;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=129
chr1 55164 rs3091274 C A . PASS G5;G5A;GMAF=0.145338208409506;GNO;KGPilot123;RSPOS=55164;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000030110000100;WGT=0;dbSNPBuildID=103
chr1 55299 rs10399749 C T . PASS G5;G5A;GMAF=0.278793418647166;GNO;KGPilot123;PH2;RSPOS=55299;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000030112000100;WGT=0;dbSNPBuildID=119
chr1 55302 rs3091273 C T . PASS RSPOS=55302;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=103
chr1 55313 rs182462964 A T . PASS KGPilot123;RSPOS=55313;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 55322 rs3107974 C T . PASS RSPOS=55322;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=103
chr1 55326 rs3107975 T C . PASS GNO;HD;KGPilot123;RSPOS=55326;SAO=0;SSR=0;VC=SNV;VP=050000000000000510000100;WGT=0;dbSNPBuildID=103
chr1 55330 rs185215913 G A . PASS KGPilot123;RSPOS=55330;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 55367 rs190850374 G A . PASS KGPilot123;RSPOS=55367;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 55388 rs182711216 C T . PASS KGPilot123;RSPOS=55388;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 55394 rs2949420 T A . PASS GNO;KGPilot123;PH2;RSPOS=55394;SAO=0;SSR=0;VC=SNV;VP=050000000000000112000100;WGT=0;dbSNPBuildID=101
chr1 55416 rs193242050 G A . PASS KGPilot123;RSPOS=55416;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 55427 rs183189405 T C . PASS KGPilot123;RSPOS=55427;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 55545 rs28396308 C T . PASS GNO;RSPOS=55545;SAO=0;SSR=0;VC=SNV;VP=050000000000000100000100;WGT=0;dbSNPBuildID=125
chr1 55816 rs187434873 G A . PASS KGPilot123;RSPOS=55816;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 55850 rs191890754 C G . PASS KGPilot123;RSPOS=55850;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 55852 rs184233019 G C . PASS KGPilot123;RSPOS=55852;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 56644 rs143342222 A C . PASS ASP;KGPilot123;RSPOS=56644;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 57952 rs189727433 A C . PASS KGPilot123;RSPOS=57952;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 58771 rs140128481 T C . PASS ASP;RSPOS=58771;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134
chr1 58814 rs114420996 G A . PASS ASP;G5;GMAF=0.0982632541133455;KGPilot123;RSPOS=58814;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004050010000100;WGT=0;dbSNPBuildID=132
chr1 59040 rs149755937 T C . PASS ASP;KGPilot123;RSPOS=59040;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 60718 rs78395614 G A . PASS CFL;GNO;RSPOS=60718;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=131
chr1 60726 rs192328835 C A . PASS KGPilot123;RSPOS=60726;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 60791 rs76199781 A G . PASS CFL;GNO;RSPOS=60791;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=131
chr1 61442 rs74970982 A G . PASS CFL;GMAF=0.076782449725777;GNO;KGPilot123;RSPOS=61442;SAO=0;SSR=0;VC=SNV;VP=050000000008000110000100;WGT=0;dbSNPBuildID=131
chr1 61462 rs56992750 T A . PASS CFL;G5;G5A;GMAF=0.0383912248628885;GNO;KGPilot123;RSPOS=61462;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000008030110000100;WGT=0;dbSNPBuildID=129
chr1 61480 rs75526266 G C . PASS CFL;GNO;RSPOS=61480;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=131
chr1 61499 rs75719746 G A . PASS CFL;GNO;RSPOS=61499;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=131
chr1 61743 rs184286948 G C . PASS KGPilot123;RSPOS=61743;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 61920 rs62637820 G A . PASS CFL;GMAF=0.0255941499085923;RSPOS=61920;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040000000100;WGT=0;dbSNPBuildID=129
chr1 61987 rs76735897 A G . PASS CFL;GMAF=0.292961608775137;GNO;KGPilot123;RSPOS=61987;SAO=0;SSR=0;VC=SNV;VP=050000000008000110000100;WGT=0;dbSNPBuildID=131
chr1 61989 rs77573425 G C . PASS CFL;GMAF=0.309414990859232;GNO;KGPilot123;RSPOS=61989;SAO=0;SSR=0;VC=SNV;VP=050000000008000110000100;WGT=0;dbSNPBuildID=131
chr1 61993 rs190553843 C T . PASS KGPilot123;RSPOS=61993;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 62156 rs181864839 C T . PASS KGPilot123;RSPOS=62156;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 62157 rs10399597 G A . PASS CFL;GMAF=0.00228519195612431;KGPilot123;RSPOS=62157;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040010000100;WGT=0;dbSNPBuildID=119
chr1 62162 rs140556834 G A . PASS ASP;KGPilot123;RSPOS=62162;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 62203 rs28402963 T C . PASS CFL;KGPilot123;RSPOS=62203;SAO=0;SSR=0;VC=SNV;VP=050000000008000010000100;WGT=0;dbSNPBuildID=125
chr1 62271 rs28599927 A G . PASS CFL;GMAF=0.138482632541133;RSPOS=62271;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040000000100;WGT=0;dbSNPBuildID=125
chr1 63268 rs75478250 T C . PASS CFL;GNO;RSPOS=63268;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=131
chr1 63276 rs185977555 G A . PASS KGPilot123;RSPOS=63276;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 63297 rs188886746 G A . PASS KGPilot123;RSPOS=63297;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 63671 rs116440577 G A . PASS ASP;G5;GMAF=0.170018281535649;KGPilot123;RSPOS=63671;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004050010000100;WGT=0;dbSNPBuildID=132
chr1 63737 rs77426996 TACT T,TCTA . PASS CFL;RSPOS=63738;SAO=0;SSR=0;VC=DIV;VP=050000000008000000000200;WGT=0;dbSNPBuildID=131
chr1 64649 rs181431124 A C . PASS KGPilot123;RSPOS=64649;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 66008 rs2691286 C G . PASS CFL;GNO;RSPOS=66008;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000008000100000100;WGT=0;dbSNPBuildID=100
chr1 66162 rs62639105 A T . PASS CFL;GMAF=0.320383912248629;GNO;KGPilot123;RSPOS=66162;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000008000110000100;WGT=0;dbSNPBuildID=129
chr1 66176 rs28552463 T A . PASS CFL;GMAF=0.0484460694698355;KGPilot123;RSPOS=66176;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040010000100;WGT=0;dbSNPBuildID=125
chr1 66219 rs181028663 A T . PASS KGPilot123;RSPOS=66219;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 66238 rs113961546 T A . PASS CFL;GNO;RSPOS=66238;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000008000100000100;WGT=0;dbSNPBuildID=132
chr1 66314 rs28534012 T A . PASS CFL;RSPOS=66314;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=125
chr1 66331 rs186063952 A C . PASS KGPilot123;RSPOS=66331;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 66334 rs28464214 T A . PASS CFL;RSPOS=66334;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=125
chr1 66442 rs192044252 T A . PASS KGPilot123;RSPOS=66442;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 66457 rs13328655 T A . PASS CFL;GMAF=0.0795246800731261;KGPilot123;RSPOS=66457;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040010000100;WGT=0;dbSNPBuildID=121
chr1 66503 rs112350669 T A . PASS CFL;RSPOS=66503;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=132
chr1 66507 rs12401368 T A . PASS CFL;GMAF=0.479890310786106;KGPilot123;RSPOS=66507;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040010000100;WGT=0;dbSNPBuildID=120
chr1 66651 rs2257270 A T . PASS CFL;GNO;RSPOS=66651;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=100
chr1 67179 rs149952626 C G . PASS ASP;KGPilot123;RSPOS=67179;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 67181 rs77662731 A G . PASS ASP;G5;G5A;GENEINFO=OR4F5:79501;GMAF=0.0470749542961609;GNO;KGPilot123;RSPOS=67181;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004070110000100;WGT=0;dbSNPBuildID=131
chr1 67223 rs78676975 C A . PASS ASP;GENEINFO=OR4F5:79501;GNO;RSPOS=67223;SAO=0;SSR=0;VC=SNV;VP=050000000004000100000100;WGT=0;dbSNPBuildID=131
chr1 69428 rs140739101 T G . PASS ASP;RSPOS=69428;S3D;SAO=0;SSR=0;VC=SNV;VLD;VP=050200000004040000000100;WGT=0;dbSNPBuildID=134
chr1 69453 rs142004627 G A . PASS ASP;RSPOS=69453;S3D;SAO=0;SSR=0;VC=SNV;VP=050200000004000000000100;WGT=0;dbSNPBuildID=134
chr1 69476 rs148502021 T C . PASS ASP;RSPOS=69476;S3D;SAO=0;SSR=0;VC=SNV;VLD;VP=050200000004040000000100;WGT=0;dbSNPBuildID=134
chr1 69496 rs150690004 G A . PASS ASP;RSPOS=69496;S3D;SAO=0;SSR=0;VC=SNV;VLD;VP=050200000004040000000100;WGT=0;dbSNPBuildID=134
chr1 69511 rs75062661 A G . PASS GENEINFO=OR4F5:79501;GMAF=0.193784277879342;GNO;KGPilot123;RSPOS=69511;S3D;SAO=0;SSR=0;VC=SNV;VP=050200000000000110000100;WGT=0;dbSNPBuildID=131
chr1 69534 rs190717287 T C . PASS KGPilot123;RSPOS=69534;S3D;SAO=0;SSR=0;VC=SNV;VP=050200000000000010000100;WGT=0;dbSNPBuildID=135
chr1 69552 rs55874132 G C . PASS GENEINFO=OR4F5:79501;HD;RSPOS=69552;S3D;SAO=0;SLO;SSR=0;VC=SNV;VLD;VP=050300000000040400000100;WGT=0;dbSNPBuildID=129
chr1 69590 rs141776804 T A . PASS ASP;RSPOS=69590;S3D;SAO=0;SSR=0;VC=SNV;VP=050200000004000000000100;WGT=0;dbSNPBuildID=134
chr1 69594 rs144967600 T C . PASS ASP;RSPOS=69594;S3D;SAO=0;SSR=0;VC=SNV;VP=050200000004000000000100;WGT=0;dbSNPBuildID=134
chr1 72148 rs182862337 C T . PASS KGPilot123;RSPOS=72148;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 73841 rs143773730 C T . PASS ASP;KGPilot123;RSPOS=73841;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 74651 rs62641291 G A . PASS RSPOS=74651;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=129
chr1 74681 rs13328683 G T . PASS CFL;GMAF=0.286106032906764;RSPOS=74681;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040000000100;WGT=0;dbSNPBuildID=121
chr1 74709 rs62641292 T A . PASS CFL;RSPOS=74709;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=129
chr1 74771 rs13328675 A G . PASS CFL;RSPOS=74771;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=121
chr1 74790 rs13328700 C G . PASS CFL;RSPOS=74790;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=121
chr1 74792 rs13328684 G A . PASS CFL;RSPOS=74792;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=121
chr1 77462 rs188023513 G A . PASS KGPilot123;RSPOS=77462;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 77470 rs192898053 T C . PASS KGPilot123;RSPOS=77470;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 77874 rs184538873 G A . PASS KGPilot123;RSPOS=77874;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 77961 rs78385339 G A . PASS GMAF=0.125685557586837;KGPilot123;RSPOS=77961;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000000040010000100;WGT=0;dbSNPBuildID=131
chr1 79033 rs62641298 A G . PASS GMAF=0.438299817184644;GNO;HD;KGPilot123;RSPOS=79033;SAO=0;SSR=0;VC=SNV;VP=050000000000000510000100;WGT=0;dbSNPBuildID=129
chr1 79050 rs62641299 G T . PASS GMAF=0.224405850091408;GNO;KGPilot123;RSPOS=79050;SAO=0;SSR=0;VC=SNV;VP=050000000000000110000100;WGT=0;dbSNPBuildID=129
chr1 79137 rs143777184 A T . PASS ASP;KGPilot123;RSPOS=79137;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 79417 rs184768190 C T . PASS KGPilot123;RSPOS=79417;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 79418 rs2691296 G C . PASS GMAF=0.0178244972577697;RSPOS=79418;RV;SAO=0;SLO;SSR=0;VC=SNV;VLD;VP=050100000000040000000100;WGT=0;dbSNPBuildID=100
chr1 79538 rs2691295 C T . PASS RSPOS=79538;RV;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=100
chr1 79772 rs147215883 C G . PASS ASP;KGPilot123;RSPOS=79772;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 79872 rs189224661 T G . PASS KGPilot123;RSPOS=79872;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 80323 rs3942603 G C . PASS CFL;GNO;RSPOS=80323;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000008000100000100;WGT=0;dbSNPBuildID=108
chr1 80386 rs3878915 C A . PASS GMAF=0.0118829981718464;RSPOS=80386;RV;SAO=0;SLO;SSR=0;VC=SNV;VLD;VP=050100000000040000000100;WGT=0;dbSNPBuildID=108
chr1 80454 rs144226842 G C . PASS ASP;KGPilot123;RSPOS=80454;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 81836 rs2259560 A T . PASS ASP;GNO;RSPOS=81836;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000100000100;WGT=0;dbSNPBuildID=100
chr1 81949 rs181567186 T C . PASS KGPilot123;RSPOS=81949;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 81962 rs4030308 T TAA . PASS ASP;RSPOS=81962;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000000000200;WGT=0;dbSNPBuildID=108
chr1 82102 rs4030307 C T . PASS ASP;RSPOS=82102;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000000000100;WGT=0;dbSNPBuildID=108
chr1 82103 rs2020400 T C . PASS ASP;RSPOS=82103;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000000000100;WGT=0;dbSNPBuildID=92
chr1 82126 rs1815133 C T . PASS ASP;RSPOS=82126;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000000000100;WGT=0;dbSNPBuildID=92
chr1 82133 rs4030306 CA C,CAAAAAAAAAAAAAAA . PASS ASP;RSPOS=82136;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000000000200;WGT=0;dbSNPBuildID=108
chr1 82154 rs4477212 A G . PASS ASP;HD;RSPOS=82154;SAO=0;SSR=0;VC=SNV;VP=050000000004000400000100;WGT=0;dbSNPBuildID=111
chr1 82162 rs1815132 C A . PASS ASP;GMAF=0.0351919561243144;GNO;RSPOS=82162;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000100000100;WGT=0;dbSNPBuildID=92
chr1 82163 rs139113303 G A . PASS ASP;KGPilot123;RSPOS=82163;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 82196 rs112844054 A T . PASS ASP;RSPOS=82196;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=132
chr1 82249 rs1851945 A G . PASS ASP;GMAF=0.0452468007312614;KGPilot123;RSPOS=82249;RV;SAO=0;SLO;SSR=0;VC=SNV;VLD;VP=050100000004040010000100;WGT=0;dbSNPBuildID=92
chr1 82282 rs3871775 G A . PASS ASP;RSPOS=82282;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000000000100;WGT=0;dbSNPBuildID=108
chr1 82303 rs3871776 T C . PASS ASP;RSPOS=82303;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000000000100;WGT=0;dbSNPBuildID=108
chr1 82316 rs4030305 A C . PASS ASP;GNO;RSPOS=82316;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000100000100;WGT=0;dbSNPBuildID=108
chr1 82609 rs149189449 C G . PASS ASP;KGPilot123;RSPOS=82609;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134
chr1 82676 rs185237834 T G . PASS KGPilot123;RSPOS=82676;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 82734 rs4030331 T C . PASS ASP;GMAF=0.261882998171846;KGPilot123;RSPOS=82734;RV;SAO=0;SLO;SSR=0;VC=SNV;VLD;VP=050100000004040010000100;WGT=0;dbSNPBuildID=108
chr1 82957 rs189774606 C T . PASS KGPilot123;RSPOS=82957;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 83084 rs181193408 T A . PASS KGPilot123;RSPOS=83084;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 83088 rs186081601 G C . PASS KGPilot123;RSPOS=83088;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 83107 rs4405097 G C . PASS ASP;RSPOS=83107;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=111
chr1 83119 rs4030324 AA A,ATAAC . PASS ASP;RSPOS=83120;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000000000200;WGT=0;dbSNPBuildID=108
chr1 83771 rs189906733 T G . PASS KGPilot123;RSPOS=83771;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 83786 rs58520670 T TA . PASS ASP;RSPOS=83794;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
chr1 83815 rs58857344 GAGAA G . PASS ASP;RSPOS=83827;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
chr1 83826 rs71281475 AAAGA A,AAA . PASS ASP;GNO;RSPOS=83827;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000100000200;WGT=0;dbSNPBuildID=130
chr1 83855 rs59596480 GAA G . PASS ASP;RSPOS=83857;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
chr1 83872 rs59556914 AA A,AAGA . PASS ASP;RSPOS=83873;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
chr1 83884 rs59586754 GAAA G . PASS ASP;RSPOS=83885;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
chr1 83897 rs61330047 GAA G . PASS ASP;RSPOS=83899;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
chr1 83901 rs58254183 GAAAGAA G . PASS ASP;RSPOS=83903;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
chr1 83921 rs61338823 GAA G . PASS ASP;RSPOS=83923;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
chr1 83930 rs71281474 AG A,AGA . PASS ASP;GNO;RSPOS=83931;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000100000200;WGT=0;dbSNPBuildID=130
chr1 83934 rs59235392 AG A,AGAAA . PASS ASP;RSPOS=83935;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
chr1 83977 rs180759811 A G . PASS KGPilot123;RSPOS=83977;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 84002 rs28850140 G A . PASS ASP;GMAF=0.138939670932358;KGPilot123;RSPOS=84002;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040010000100;WGT=0;dbSNPBuildID=125
chr1 84010 rs186443818 G A . PASS KGPilot123;RSPOS=84010;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 84018 rs61352176 GAA G . PASS ASP;RSPOS=84020;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129
chr1 84079 rs190867312 T C . PASS KGPilot123;RSPOS=84079;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 84139 rs183605470 A T . PASS KGPilot123;RSPOS=84139;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 84156 rs188652299 A C . PASS KGPilot123;RSPOS=84156;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 84244 rs191297051 A C . PASS KGPilot123;RSPOS=84244;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 84295 rs183209871 G A . PASS KGPilot123;RSPOS=84295;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 84346 rs187855973 T C . PASS KGPilot123;RSPOS=84346;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 84453 rs191379015 C G . PASS KGPilot123;RSPOS=84453;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135
chr1 84705 rs183470350 T G . PASS KGPilot123;RSPOS=84705;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135

View File

@ -0,0 +1 @@
100000 1 0

View File

@ -0,0 +1,3 @@
100000 1 11
0 chr1 (null)
0 100000 0

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.