From 2afe3f7a215616457d281c0bb832a7188eea76c1 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Thu, 18 Jun 2015 12:45:14 -0400 Subject: [PATCH] Make GenotypeGVCFs subset Strand Allele Counts intelligently --- .../walkers/annotator/AnnotationUtils.java | 50 ++-- .../walkers/annotator/DepthPerSampleHC.java | 5 +- .../annotator/StrandAlleleCountsBySample.java | 5 +- .../walkers/annotator/StrandBiasBySample.java | 6 +- .../GeneralPloidyGenotypeLikelihoods.java | 8 +- .../GenotypeLikelihoodCalculators.java | 4 - .../GeneralPloidyExactAFCalculator.java | 157 +++++++----- ...ferenceConfidenceVariantContextMerger.java | 69 ++++-- .../VariantAnnotatorIntegrationTest.java | 2 +- .../GenotypeGVCFsIntegrationTest.java | 48 +++- .../SelectVariantsIntegrationTest.java | 36 +++ .../walkers/variantutils/VariantUtils.java | 99 ++++++++ .../variantutils/VariantUtilsUnitTest.java | 74 ++++++ .../walkers/variantutils/SelectVariants.java | 2 +- .../variant/GATKVariantContextUtils.java | 227 +++++++++++++++--- .../GATKVariantContextUtilsUnitTest.java | 93 ++++--- 16 files changed, 691 insertions(+), 194 deletions(-) create mode 100644 protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantUtils.java create mode 100644 protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantUtilsUnitTest.java diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java index c500c1b3c..48119e9a2 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java @@ -60,9 +60,13 @@ import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller; public class AnnotationUtils { + public static final String ANNOTATION_HC_WARN_MSG = " annotation will not be calculated, must be called from HaplotyepCaller"; + public static final int WARNINGS_LOGGED_SIZE = 3; + /** * Checks if the input data is appropriate * + * @param annotation the input genotype annotation key name(s) * @param walker input walker * @param map input map for each read, holds underlying alleles represented by an aligned read, and corresponding relative likelihood. * @param g input genotype @@ -70,20 +74,38 @@ public class AnnotationUtils { * @param logger logger specific for each caller * * @return true if the walker is a HaplotypeCaller, the likelihood map is non-null and the genotype is non-null and called, false otherwise - * @throws ReviewedGATKException if the size of warningsLogged is less than 4. + * @throws IllegalArgumentException if annotation, walker, g, warningsLogged, or logger are null. + * @throws ReviewedGATKException if the size of warningsLogged is less than 3. */ - public static boolean isAppropriateInput(final AnnotatorCompatible walker, final PerReadAlleleLikelihoodMap map, final Genotype g, final boolean[] warningsLogged, final Logger logger) { + public static boolean isAppropriateInput(final String annotation, final AnnotatorCompatible walker, final PerReadAlleleLikelihoodMap map, final Genotype g, final boolean[] warningsLogged, final Logger logger) { - if ( warningsLogged.length < 4 ){ - throw new ReviewedGATKException("Warnings logged array must have at last 4 elements, but has " + warningsLogged.length); + if ( annotation == null ){ + throw new IllegalArgumentException("The input annotation cannot be null"); + } + + if ( walker == null ) { + throw new IllegalArgumentException("The input walker cannot be null"); + } + + if ( g == null ) { + throw new IllegalArgumentException("The input genotype cannot be null"); + } + + if ( warningsLogged == null ){ + throw new IllegalArgumentException("The input warnings logged cannot be null"); + } + + if ( logger == null ){ + throw new IllegalArgumentException("The input logger cannot be null"); + } + + if ( warningsLogged.length < WARNINGS_LOGGED_SIZE ){ + throw new ReviewedGATKException("Warnings logged array must have at least " + WARNINGS_LOGGED_SIZE + " elements, but has " + warningsLogged.length); } if ( !(walker instanceof HaplotypeCaller) ) { if ( !warningsLogged[0] ) { - if ( walker != null ) - logger.warn("Annotation will not be calculated, must be called from HaplotyepCaller, not " + walker.getClass().getName()); - else - logger.warn("Annotation will not be calculated, must be called from HaplotyepCaller"); + logger.warn(annotation + ANNOTATION_HC_WARN_MSG + ", not " + walker.getClass().getSimpleName()); warningsLogged[0] = true; } return false; @@ -97,18 +119,10 @@ public class AnnotationUtils { return false; } - if ( g == null ){ - if ( !warningsLogged[2] ) { - logger.warn("Annotation will not be calculated, missing genotype"); - warningsLogged[2]= true; - } - return false; - } - if ( !g.isCalled() ){ - if ( !warningsLogged[3] ) { + if ( !warningsLogged[2] ) { logger.warn("Annotation will not be calculated, genotype is not called"); - warningsLogged[3] = true; + warningsLogged[2] = true; } return false; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java index 318309b57..6c769eed7 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java @@ -51,6 +51,7 @@ package org.broadinstitute.gatk.tools.walkers.annotator; +import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; @@ -94,7 +95,7 @@ import java.util.*; public class DepthPerSampleHC extends GenotypeAnnotation { private final static Logger logger = Logger.getLogger(DepthPerSampleHC.class); private boolean alleleLikelihoodMapSubsetWarningLogged = false; - boolean[] warningsLogged = new boolean[4]; + private final boolean[] warningsLogged = new boolean[AnnotationUtils.WARNINGS_LOGGED_SIZE]; @Override public void annotate(final RefMetaDataTracker tracker, @@ -106,7 +107,7 @@ public class DepthPerSampleHC extends GenotypeAnnotation { final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ - if ( !AnnotationUtils.isAppropriateInput(walker, alleleLikelihoodMap, g, warningsLogged, logger) ) { + if ( !AnnotationUtils.isAppropriateInput(VCFConstants.DEPTH_KEY , walker, alleleLikelihoodMap, g, warningsLogged, logger) ) { return; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandAlleleCountsBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandAlleleCountsBySample.java index abe330883..902d40ddb 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandAlleleCountsBySample.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandAlleleCountsBySample.java @@ -56,6 +56,7 @@ import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFFormatHeaderLine; +import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; @@ -105,7 +106,7 @@ import java.util.Map; public class StrandAlleleCountsBySample extends GenotypeAnnotation { private final static Logger logger = Logger.getLogger(StrandAlleleCountsBySample.class); - boolean[] warningsLogged = new boolean[4]; + private final boolean[] warningsLogged = new boolean[AnnotationUtils.WARNINGS_LOGGED_SIZE]; @Override public void annotate(final RefMetaDataTracker tracker, @@ -117,7 +118,7 @@ public class StrandAlleleCountsBySample extends GenotypeAnnotation { final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { - if ( !AnnotationUtils.isAppropriateInput(walker, alleleLikelihoodMap, g, warningsLogged, logger) ) { + if ( !AnnotationUtils.isAppropriateInput(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, walker, alleleLikelihoodMap, g, warningsLogged, logger) ) { return; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java index 115a68d20..dc08f58c7 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java @@ -52,6 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import org.apache.log4j.Logger; +import org.apache.commons.lang.StringUtils; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; @@ -99,7 +100,7 @@ import java.util.*; public class StrandBiasBySample extends GenotypeAnnotation { private final static Logger logger = Logger.getLogger(StrandBiasBySample.class); - boolean[] warningsLogged = new boolean[4]; + private final boolean[] warningsLogged = new boolean[AnnotationUtils.WARNINGS_LOGGED_SIZE]; @Override public void annotate(final RefMetaDataTracker tracker, @@ -110,8 +111,7 @@ public class StrandBiasBySample extends GenotypeAnnotation { final Genotype g, final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { - - if (!AnnotationUtils.isAppropriateInput(walker, alleleLikelihoodMap, g, warningsLogged, logger)) { + if (!AnnotationUtils.isAppropriateInput(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY, walker, alleleLikelihoodMap, g, warningsLogged, logger)) { return; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java index 6a80c7b92..1f4202418 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java @@ -143,11 +143,11 @@ public abstract class GeneralPloidyGenotypeLikelihoods { * of form int[] -> double (to be more precise, IntArrayWrapper -> Double). * For a given ploidy (chromosome count) and number of alleles, we need a form to iterate deterministically * across all possible allele conformations. - * Problem equivalent to listing in determistic order all possible ways in which N integers will sum to P, + * Problem equivalent to listing in deterministic order all possible ways in which N integers will sum to P, * where N is number of alleles and P is number of chromosomes. * There's an option to list all integers so that sum will be UP to P. * For example, with P=2,N=2, restrictSumTo = 2 iterator will produce - * [2 0 ] [1 1] [ 0 2] + * [2 0] [1 1] [0 2] * * */ @@ -331,7 +331,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods { * @param numChromosomes Ploidy (number of chromosomes describing PL's) * @param originalAlleles List of original alleles * @param allelesToSubset Alleles to subset - * @return Vector of new PL's, ordered accorrding to SumIterator's ordering + * @return Vector of new PL's, ordered according to SumIterator's ordering */ public static double[] subsetToAlleles(final double[] oldLikelihoods, final int numChromosomes, final List originalAlleles, final List allelesToSubset) { @@ -339,14 +339,12 @@ public abstract class GeneralPloidyGenotypeLikelihoods { int newPLSize = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(allelesToSubset.size(), numChromosomes); double[] newPLs = new double[newPLSize]; - int idx = 0; // First fill boolean array stating whether each original allele is present in new mapping final boolean [] allelePresent = new boolean[originalAlleles.size()]; for ( Allele allele : originalAlleles ) allelePresent[idx++] = allelesToSubset.contains(allele); - // compute mapping from old idx to new idx // This might be needed in case new allele set is not ordered in the same way as old set // Example. Original alleles: {T*,C,G,A}. New alleles: {G,C}. Permutation key = [2,1] diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculators.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculators.java index cb2578a20..ad5b8a9a9 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculators.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculators.java @@ -317,10 +317,6 @@ public class GenotypeLikelihoodCalculators { public static GenotypeLikelihoodCalculator getInstance(final int ploidy, final int alleleCount) { checkPloidyAndMaximumAllele(ploidy, alleleCount); - if (alleleCount < 0) - throw new IllegalArgumentException("the allele count cannot be negative"); - if (ploidy < 0) - throw new IllegalArgumentException("the ploidy count cannot be negative"); // Non-thread safe (fast) check on tables capacities, // if not enough capacity we expand the tables in a thread-safe manner: diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyExactAFCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyExactAFCalculator.java index d4c26b7e5..5c5d1eac0 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyExactAFCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyExactAFCalculator.java @@ -59,6 +59,7 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalcula import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import java.util.*; @@ -71,7 +72,7 @@ public class GeneralPloidyExactAFCalculator extends ExactAFCalculator { @Override protected GenotypesContext reduceScopeGenotypes(final VariantContext vc, final int defaultPloidy, final List allelesToUse) { - return subsetAlleles(vc,defaultPloidy,allelesToUse,false); + return subsetAlleles(vc, defaultPloidy, allelesToUse, false); } @Override @@ -405,79 +406,125 @@ public class GeneralPloidyExactAFCalculator extends ExactAFCalculator { /** * From a given variant context, extract a given subset of alleles, and update genotype context accordingly, - * including updating the PL's, and assign genotypes accordingly + * including updating the PLs, ADs and SACs, and assign genotypes accordingly * @param vc variant context with alleles and genotype likelihoods * @param defaultPloidy ploidy to assume in case that {@code vc} does not contain that information * for a sample. * @param allelesToUse alleles to subset * @param assignGenotypes true: assign hard genotypes, false: leave as no-call - * @return GenotypesContext with new PLs and AD. + * @return GenotypesContext with new PLs, SACs and AD. */ @Override public GenotypesContext subsetAlleles(final VariantContext vc, final int defaultPloidy, final List allelesToUse, final boolean assignGenotypes) { - // the genotypes with PLs - final GenotypesContext oldGTs = vc.getGenotypes(); - // samples - final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); + final GenotypesContext result = GenotypesContext.create(); - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(); + // Subset genotypes for each sample + for (final Genotype g : vc.getGenotypes()) // If it really needs to process order by sample name do so. + result.add(subsetGenotypeAlleles(g, allelesToUse, vc, defaultPloidy, assignGenotypes)); + return GATKVariantContextUtils.fixADFromSubsettedAlleles(result, vc, allelesToUse); + } + + /** + * From a given genotype, extract a given subset of alleles and update genotype PLs and SACs. + * @param g genotype to subset + * @param allelesToUse alleles to subset + * @param vc variant context with alleles and genotypes + * @param defaultPloidy ploidy to assume in case that {@code vc} does not contain that information for a sample. + * @param assignGenotypes true: assign hard genotypes, false: leave as no-call + * @return Genotypes with new PLs and SACs + */ + private Genotype subsetGenotypeAlleles(final Genotype g, final List allelesToUse, final VariantContext vc, final int defaultPloidy, + boolean assignGenotypes) { + final int ploidy = g.getPloidy() <= 0 ? defaultPloidy : g.getPloidy(); + if (!g.hasLikelihoods()) + return GenotypeBuilder.create(g.getSampleName(),GATKVariantContextUtils.noCallAlleles(ploidy)); + else { + // subset likelihood alleles + final double[] newLikelihoods = subsetLikelihoodAlleles(g, allelesToUse, vc, ploidy); + if (MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL) + return GenotypeBuilder.create(g.getSampleName(), GATKVariantContextUtils.noCallAlleles(ploidy)); + else // just now we would care about newSACs + return subsetGenotypeAllelesWithLikelihoods(g, allelesToUse, vc, ploidy, assignGenotypes, newLikelihoods); + } + } + + /** + * From a given genotype, extract a given subset of alleles and return the new PLs + * @param g genotype to subset + * @param allelesToUse alleles to subset + * @param vc variant context with alleles and genotypes + * @param ploidy number of chromosomes + * @return the subsetted PLs + */ + private double[] subsetLikelihoodAlleles(final Genotype g, final List allelesToUse, final VariantContext vc, final int ploidy){ // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); final int numNewAltAlleles = allelesToUse.size() - 1; + // create the new likelihoods array from the alleles we are allowed to use + final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); - // create the new genotypes - for ( int k = 0; k < oldGTs.size(); k++ ) { - final Genotype g = oldGTs.get(sampleIndices.get(k)); - final int declaredPloidy = g.getPloidy(); - final int ploidy = declaredPloidy <= 0 ? defaultPloidy : declaredPloidy; - if ( !g.hasLikelihoods() ) { - newGTs.add(GenotypeBuilder.create(g.getSampleName(),GATKVariantContextUtils.noCallAlleles(ploidy))); - continue; - } - - // create the new likelihoods array from the alleles we are allowed to use - final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); - double[] newLikelihoods; - - // Optimization: if # of new alt alleles = 0 (pure ref call), keep original likelihoods so we skip normalization - // and subsetting - if ( numOriginalAltAlleles == numNewAltAlleles || numNewAltAlleles == 0) { - newLikelihoods = originalLikelihoods; - } else { - newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse); - - // might need to re-normalize - newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); - } - - // if there is no mass on the (new) likelihoods, then just no-call the sample - if ( MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) { - newGTs.add(GenotypeBuilder.create(g.getSampleName(), GATKVariantContextUtils.noCallAlleles(ploidy))); - } - else { - final GenotypeBuilder gb = new GenotypeBuilder(g); - - if ( numNewAltAlleles == 0 ) - gb.noPL(); - else - gb.PL(newLikelihoods); - - // if we weren't asked to assign a genotype, then just no-call the sample - if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) - gb.alleles(GATKVariantContextUtils.noCallAlleles(ploidy)); - else - assignGenotype(gb, newLikelihoods, allelesToUse, ploidy); - newGTs.add(gb.make()); - } + if ( numOriginalAltAlleles != numNewAltAlleles ) { + // might need to re-normalize the new likelihoods + return MathUtils.normalizeFromLog10(GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse), + false, true); } + else + return originalLikelihoods; + } - return GATKVariantContextUtils.fixADFromSubsettedAlleles(newGTs, vc, allelesToUse); + /** + * From a given genotype, subset the PLs and SACs + * @param g genotype to subset + * @param allelesToUse alleles to subset + * @param vc variant context with alleles and genotypes + * @param ploidy number of chromosomes + * @param assignGenotypes true: assign hard genotypes, false: leave as no-call + * @param newLikelihoods the PL values + * @return genotype with the subsetted PLsL and SACs + */ + private Genotype subsetGenotypeAllelesWithLikelihoods(final Genotype g, final List allelesToUse, final VariantContext vc, int ploidy, + final boolean assignGenotypes, final double[] newLikelihoods) { + + final GenotypeBuilder gb = new GenotypeBuilder(g); + + // add likelihoods + gb.PL(newLikelihoods); + + // get and add subsetted SACs + final int[] newSACs = subsetSACAlleles(g, allelesToUse, vc); + if (newSACs != null) + gb.attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, newSACs); + if (assignGenotypes) + assignGenotype(gb, newLikelihoods, allelesToUse, ploidy); + else + gb.alleles(GATKVariantContextUtils.noCallAlleles(ploidy)); + + return gb.make(); + } + + /** + * From a given genotype, extract a given subset of alleles and return the new SACs + * @param g genotype to subset + * @param allelesToUse alleles to subset + * @param vc variant context with alleles and genotypes + * @return the subsetted SACs + */ + private int[] subsetSACAlleles(final Genotype g, final List allelesToUse, final VariantContext vc){ + + if ( !g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY) ) + return null; + + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final int numNewAltAlleles = allelesToUse.size() - 1; + final List sacIndexesToUse = numOriginalAltAlleles == numNewAltAlleles ? null : GATKVariantContextUtils.determineSACIndexesToUse(vc, allelesToUse); + + return GATKVariantContextUtils.makeNewSACs(g, sacIndexesToUse); } /** @@ -485,7 +532,7 @@ public class GeneralPloidyExactAFCalculator extends ExactAFCalculator { * * @param newLikelihoods the PL array * @param allelesToUse the list of alleles to choose from (corresponding to the PLs) - * @param numChromosomes Number of chromosomes per pool + * @param numChromosomes Number of chromosomes per pool */ private void assignGenotype(final GenotypeBuilder gb, final double[] newLikelihoods, diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java index 5fa4e5d14..845cfcefa 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java @@ -59,6 +59,7 @@ import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; @@ -90,7 +91,7 @@ public class ReferenceConfidenceVariantContextMerger { public static VariantContext merge(final List VCs, final GenomeLoc loc, final Byte refBase, final boolean removeNonRefSymbolicAllele, final boolean samplesAreUniquified) { // this can happen if e.g. you are using a dbSNP file that spans a region with no gVCFs - if ( VCs == null || VCs.size() == 0 ) + if ( VCs == null || VCs.isEmpty() ) return null; // establish the baseline info (sometimes from the first VC) @@ -134,7 +135,7 @@ public class ReferenceConfidenceVariantContextMerger { vcAndNewAllelePairs.add(new Pair<>(vc, isSpanningEvent ? replaceWithNoCallsAndDels(vc) : remapAlleles(vc, refAllele, finalAlleleSet))); } - // Add and to the end if at all required in in the output. + // Add and to the end if at all required in the output. if ( sawSpanningDeletion && (sawNonSpanningEvent || !removeNonRefSymbolicAllele) ) finalAlleleSet.add(Allele.SPAN_DEL); if (!removeNonRefSymbolicAllele) finalAlleleSet.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); @@ -349,46 +350,72 @@ public class ReferenceConfidenceVariantContextMerger { * This method assumes that none of the alleles in the VC overlaps with any of the alleles in the set. * * @param mergedGenotypes the genotypes context to add to - * @param VC the Variant Context for the sample + * @param vc the Variant Context for the sample * @param remappedAlleles the list of remapped alleles for the sample * @param targetAlleles the list of target alleles * @param samplesAreUniquified true if sample names have been uniquified */ private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes, - final VariantContext VC, + final VariantContext vc, final List remappedAlleles, final List targetAlleles, final boolean samplesAreUniquified) { - final int maximumPloidy = VC.getMaxPloidy(GATKVariantContextUtils.DEFAULT_PLOIDY); + final int maximumPloidy = vc.getMaxPloidy(GATKVariantContextUtils.DEFAULT_PLOIDY); // the map is different depending on the ploidy, so in order to keep this method flexible (mixed ploidies) // we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible. final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][]; - final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size()); - int[] perSampleIndexesOfRelevantAlleles; + final int maximumAlleleCount = Math.max(remappedAlleles.size(), targetAlleles.size()); - for ( final Genotype g : VC.getGenotypes() ) { + for ( final Genotype g : vc.getGenotypes() ) { final String name; if (samplesAreUniquified) - name = g.getSampleName() + "." + VC.getSource(); + name = g.getSampleName() + "." + vc.getSource(); else name = g.getSampleName(); final int ploidy = g.getPloidy(); final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy())); genotypeBuilder.name(name); - if (g.hasPL()) { - // lazy initialization of the genotype index map by ploidy. - perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart(), g); - final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null - ? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles) - : genotypeIndexMapsByPloidy[ploidy]; - final int[] PLs = generatePL(g, genotypeIndexMapByPloidy); - final int[] AD = g.hasAD() ? generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null; - genotypeBuilder.PL(PLs).AD(AD); + final boolean hasPL = g.hasPL(); + final boolean hasSAC = g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY); + if ( hasPL || hasSAC ) { + final int[] perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, vc.getStart(), g); + if (g.hasPL()) { + // genotype index map by ploidy. + final int[] genotypeIndexMapByPloidy = GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles); + final int[] PLs = generatePL(g, genotypeIndexMapByPloidy); + final int[] AD = g.hasAD() ? generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null; + genotypeBuilder.PL(PLs).AD(AD); + } + if (g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY)) { + final List sacIndexesToUse = adaptToSACIndexes(perSampleIndexesOfRelevantAlleles); + final int[] SACs = GATKVariantContextUtils.makeNewSACs(g, sacIndexesToUse); + genotypeBuilder.attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, SACs); + } } mergedGenotypes.add(genotypeBuilder.make()); } } + /** + * Adapt the relevant alleles to the SAC indexes + * + * @param perSampleIndexesOfRelevantAlleles + * @return SAC indexes + */ + private static List adaptToSACIndexes(final int[] perSampleIndexesOfRelevantAlleles){ + if ( perSampleIndexesOfRelevantAlleles == null ) + throw new IllegalArgumentException("The per sample index of relevant alleles must not be null"); + + final List sacIndexesToUse = new ArrayList(2*perSampleIndexesOfRelevantAlleles.length); + + for ( int item : perSampleIndexesOfRelevantAlleles ) { + sacIndexesToUse.add(new Integer(2*item)); + sacIndexesToUse.add(new Integer(2*item+1)); + } + + return sacIndexesToUse; + } + /** * Composes a new likelihood array given the original genotype and the genotype index map. * @@ -412,7 +439,7 @@ public class ReferenceConfidenceVariantContextMerger { } /** - * Determines the allele mapping from myAlleles to the targetAlleles, substituting the generic "" as appropriate. + * Determines the allele mapping from remappedAlleles to the targetAlleles, substituting the generic "" as appropriate. * If the myAlleles set does not contain "" as an allele, it throws an exception. * * @param remappedAlleles the list of alleles to evaluate @@ -423,8 +450,8 @@ public class ReferenceConfidenceVariantContextMerger { */ protected static int[] getIndexesOfRelevantAlleles(final List remappedAlleles, final List targetAlleles, final int position, final Genotype g) { - if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty"); - if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty"); + if ( remappedAlleles == null || remappedAlleles.isEmpty() ) throw new IllegalArgumentException("The list of input alleles must not be null or empty"); + if ( targetAlleles == null || targetAlleles.isEmpty() ) throw new IllegalArgumentException("The list of target alleles must not be null or empty"); if ( !remappedAlleles.contains(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) ) throw new UserException("The list of input alleles must contain " + GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records"); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java index 3ff601593..eb1c20ba2 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -163,7 +163,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { executeTest("test file has annotations, adding StrandAlleleCountsBySample annotation", spec); File file = new File(logFileName); - Assert.assertTrue(FileUtils.readFileToString(file).contains("Annotation will not be calculated, must be called from HaplotyepCaller")); + Assert.assertTrue(FileUtils.readFileToString(file).contains(AnnotationUtils.ANNOTATION_HC_WARN_MSG)); } @Test diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index 7f596da6d..b11da9962 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -56,7 +56,9 @@ import htsjdk.tribble.readers.PositionalBufferedStream; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFCodec; +import org.apache.commons.io.FileUtils; import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.broadinstitute.gatk.tools.walkers.annotator.AnnotationUtils; import org.testng.Assert; import org.testng.annotations.Test; @@ -87,12 +89,16 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { } @Test(enabled = true) - public void testUpdatePGTStrandAlleleCountsBySample() { + public void testUpdatePGTStrandAlleleCountsBySample() throws IOException{ + final String logFileName = new String("testUpdatePGTStrandAlleleCountsBySample.log"); WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf -A StrandAlleleCountsBySample", b37KGReference), + baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf -A StrandAlleleCountsBySample -log " + logFileName, b37KGReference), 1, Arrays.asList("5deed67f8eb10cbd4429d70e0c26ef7c")); - executeTest("testUpdatePGT, adding StrandAlleleCountsBySample annotation", spec); + executeTest("testUpdatePGTStrandAlleleCountsBySample", spec); + + File file = new File(logFileName); + Assert.assertTrue(FileUtils.readFileToString(file).contains(AnnotationUtils.ANNOTATION_HC_WARN_MSG)); } @Test(enabled = true) @@ -246,20 +252,16 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-variant_index_parameter 128000 -A StrandAlleleCountsBySample", 1, Arrays.asList("") ); - specHaplotypeCaller.disableShadowBCF(); //TODO: Remove when BaseTest.assertAttributesEquals() works with SC + specHaplotypeCaller.disableShadowBCF(); //TODO: Remove when BaseTest.assertAttributesEquals() works with SAC final File gVCF = executeTest("testStrandAlleleCountsBySampleHaplotypeCaller", specHaplotypeCaller).getFirst().get(0); - List gVCFList = getAttributeValues(gVCF, new String("SAC")); //Use gVCF from HaplotypeCaller final WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + gVCF.getAbsolutePath(), b37KGReference), 1, - Arrays.asList("")); - final File outputVCF = executeTest("testStrandAlleleCountsBySample", spec).getFirst().get(0); - List outputVCFList = getAttributeValues(outputVCF, new String("SAC")); - - // All of the SAC values in the VCF were derived from the gVCF - Assert.assertTrue(gVCFList.containsAll(outputVCFList)); + Arrays.asList("382e800e004139f861acf1e54767b104")); + spec.disableShadowBCF(); //TODO: Remove when BaseTest.assertAttributesEquals() works with SAC + executeTest("testStrandAlleleCountsBySample", spec); } @Test @@ -519,10 +521,32 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testBadADPropagationHaploidBugTest() { WalkerTestSpec spec = new WalkerTestSpec( "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + - " -V " + privateTestDir + "ad-bug-input.vcf", + " -V " + privateTestDir + "ad-bug-input.vcf", 1, Arrays.asList("027f96584e91ca8255764fbf38293963")); spec.disableShadowBCF(); executeTest("testBadADPropagationHaploidBugTest", spec); } + + @Test(enabled = true) + public void testSAC() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "261_S01_raw_variants_gvcf.vcf", + 1, + Arrays.asList("09da4a0fb937efab228413d1162fde2d")); + spec.disableShadowBCF(); + executeTest("testSAC", spec); + } + + @Test(enabled = true) + public void testSACMultisampleTetraploid() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "tetraploid-multisample-sac.g.vcf", + 1, + Arrays.asList("a8b7b300d6d5345b7b02b86d75671756")); + spec.disableShadowBCF(); + executeTest("testSACMultisampleTetraploid", spec); + } } \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java index 77d9a7beb..9af334ff7 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -52,6 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -733,4 +734,39 @@ public class SelectVariantsIntegrationTest extends WalkerTest { spec.disableShadowBCF(); executeTest("testSetFilteredGtoNocall--" + testfile, spec); } + @Test + public void testSACSimpleDiploid() { + String testfile = privateTestDir + "261_S01_raw_variants_gvcf.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header -trimAlternates", + 1, + Arrays.asList("c9d297e7758bf5681270029401cc97c2")); + spec.disableShadowBCF(); + executeTest("testSACSimpleDiploid", spec); + } + + @Test + public void testSACDiploid() { + String testfile = privateTestDir + "diploid-multisample-sac.g.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header -sn NA12891 -trimAlternates", + 1, + Arrays.asList("7aecb079b16448f0377b6b03069b2994")); + spec.disableShadowBCF(); + executeTest("testSACDiploid", spec); + } + + @Test + public void testSACNonDiploid() { + String testfile = privateTestDir + "tetraploid-multisample-sac.g.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header -sn NA12891 -trimAlternates", + 1, + ReviewedGATKException.class); + spec.disableShadowBCF(); + executeTest("testSACNonDiploid", spec); + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantUtils.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantUtils.java new file mode 100644 index 000000000..269527c4d --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantUtils.java @@ -0,0 +1,99 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2015 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.variantutils; + +import htsjdk.tribble.readers.LineIterator; +import htsjdk.tribble.readers.PositionalBufferedStream; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFCodec; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +public class VariantUtils { + + /** + * Returns a list of attribute values from a VCF file + * + * @param vcfFile VCF file + * @param attributeName attribute name + * + * @throws IOException if the file does not exist or can not be opened + * + * @return list of attribute values + */ + public static List getAttributeValues(final File vcfFile, final String attributeName) throws IOException { + final VCFCodec codec = new VCFCodec(); + final FileInputStream s = new FileInputStream(vcfFile); + final LineIterator lineIteratorVCF = codec.makeSourceFromStream(new PositionalBufferedStream(s)); + codec.readHeader(lineIteratorVCF); + + List attributeValues = new ArrayList(); + while (lineIteratorVCF.hasNext()) { + final String line = lineIteratorVCF.next(); + final VariantContext vc = codec.decode(line); + + for (final Genotype g : vc.getGenotypes()) { + if (g.hasExtendedAttribute(attributeName)) { + attributeValues.add((String) g.getExtendedAttribute(attributeName)); + } + } + } + + s.close(); + return attributeValues; + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantUtilsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantUtilsUnitTest.java new file mode 100644 index 000000000..2e1035209 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantUtilsUnitTest.java @@ -0,0 +1,74 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2015 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.variantutils; + +import org.broadinstitute.gatk.engine.GATKVCFUtils; +import org.broadinstitute.gatk.utils.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +public class VariantUtilsUnitTest extends BaseTest { + + @Test + public void testgetAttributeValues() throws IOException { + String pathname = privateTestDir + "261_S01_raw_variants_gvcf.vcf"; + String attributeName = "SAC"; + File vcfFile = new File(pathname); + List attributeValues = VariantUtils.getAttributeValues(vcfFile, attributeName); + Assert.assertEquals(attributeValues, Arrays.asList("33,43,25,23,0,0,0,0")); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java index 5d7986eca..1367bf5c4 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java @@ -1021,7 +1021,7 @@ public class SelectVariants extends RodWalker implements TreeR final VariantContextBuilder builder = new VariantContextBuilder(sub); // if there are fewer alternate alleles now in the selected VC, we need to fix the PL and AD values - GenotypesContext newGC = GATKVariantContextUtils.updatePLsAndAD(sub, vc); + GenotypesContext newGC = GATKVariantContextUtils.updatePLsSACsAD(sub, vc); // since the VC has been subset (either by sample or allele), we need to strip out the MLE tags builder.rmAttribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java index 31c8404eb..3e4861cd9 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java @@ -35,6 +35,7 @@ import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.*; import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import java.io.Serializable; import java.util.*; @@ -487,16 +488,19 @@ public class GATKVariantContextUtils { */ @Deprecated public static int findNumberOfRepetitions(byte[] repeatUnit, byte[] testString, boolean lookForward) { + + if (repeatUnit == null) throw new IllegalArgumentException("the repeat unit cannot be null"); + if (testString == null) throw new IllegalArgumentException("the test string cannot be null"); + int numRepeats = 0; if (lookForward) { // look forward on the test string for (int start = 0; start < testString.length; start += repeatUnit.length) { - int end = start + repeatUnit.length; - byte[] unit = Arrays.copyOfRange(testString,start, end); - if(Arrays.equals(unit,repeatUnit)) - numRepeats++; - else + final int end = start + repeatUnit.length; + final byte[] unit = Arrays.copyOfRange(testString,start, end); + if (!Arrays.equals(unit,repeatUnit)) break; + numRepeats++; } return numRepeats; } @@ -504,8 +508,8 @@ public class GATKVariantContextUtils { // look backward. For example, if repeatUnit = AT and testString = GATAT, number of repeat units is still 2 // look forward on the test string for (int start = testString.length - repeatUnit.length; start >= 0; start -= repeatUnit.length) { - int end = start + repeatUnit.length; - byte[] unit = Arrays.copyOfRange(testString,start, end); + final int end = start + repeatUnit.length; + final byte[] unit = Arrays.copyOfRange(testString, start, end); if(Arrays.equals(unit,repeatUnit)) numRepeats++; else @@ -595,27 +599,35 @@ public class GATKVariantContextUtils { public static GenotypesContext subsetDiploidAlleles(final VariantContext vc, final List allelesToUse, final GenotypeAssignmentMethod assignGenotypes) { + if ( vc == null ) throw new IllegalArgumentException("the VariantContext cannot be null"); + if ( allelesToUse == null ) throw new IllegalArgumentException("the alleles to use cannot be null"); if ( allelesToUse.get(0).isNonReference() ) throw new IllegalArgumentException("First allele must be the reference allele"); if ( allelesToUse.size() == 1 ) throw new IllegalArgumentException("Cannot subset to only 1 alt allele"); // optimization: if no input genotypes, just exit if (vc.getGenotypes().isEmpty()) return GenotypesContext.create(); - // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward - final List likelihoodIndexesToUse = determineLikelihoodIndexesToUse(vc, allelesToUse); + // find the likelihoods indexes to use from the used alternate alleles + final List likelihoodIndexesToUse = determineDiploidLikelihoodIndexesToUse(vc, allelesToUse); + + // find the strand allele count indexes to use from the used alternate alleles + final List sacIndexesToUse = determineSACIndexesToUse(vc, allelesToUse); // create the new genotypes - return createGenotypesWithSubsettedLikelihoods(vc.getGenotypes(), vc, allelesToUse, likelihoodIndexesToUse, assignGenotypes); + return createGenotypesWithSubsettedLikelihoods(vc.getGenotypes(), vc, allelesToUse, likelihoodIndexesToUse, sacIndexesToUse, assignGenotypes); } /** - * Figure out which likelihood indexes to use for a selected down set of alleles + * Find the likelihood indexes to use for a selected set of diploid alleles * * @param originalVC the original VariantContext * @param allelesToUse the subset of alleles to use * @return a list of PL indexes to use or null if none */ - private static List determineLikelihoodIndexesToUse(final VariantContext originalVC, final List allelesToUse) { + private static List determineDiploidLikelihoodIndexesToUse(final VariantContext originalVC, final List allelesToUse) { + + if ( originalVC == null) throw new IllegalArgumentException("the original VariantContext cannot be null"); + if ( allelesToUse == null ) throw new IllegalArgumentException("the alleles to use cannot be null"); // the bitset representing the allele indexes we want to keep final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); @@ -625,21 +637,53 @@ public class GATKVariantContextUtils { if ( MathUtils.countOccurrences(true, alleleIndexesToUse) == alleleIndexesToUse.length ) return null; - return getLikelihoodIndexes(originalVC, alleleIndexesToUse); + return getDiploidLikelihoodIndexes(originalVC, alleleIndexesToUse); } /** - * Get the actual likelihoods indexes to use given the corresponding allele indexes + * Find the strand allele count indexes to use for a selected set of alleles + * + * @param originalVC the original VariantContext + * @param allelesToUse the subset of alleles to use + * @return a list of SAC indexes to use or null if none + */ + public static List determineSACIndexesToUse(final VariantContext originalVC, final List allelesToUse) { + + if ( originalVC == null ) throw new IllegalArgumentException("the original VC cannot be null"); + if ( allelesToUse == null ) throw new IllegalArgumentException("the alleles to use cannot be null"); + + // the bitset representing the allele indexes we want to keep + final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); + + // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, + // then we can keep the SACs as is; otherwise, we determine which ones to keep + if (MathUtils.countOccurrences(true, alleleIndexesToUse) == alleleIndexesToUse.length) + return null; + + return getSACIndexes(alleleIndexesToUse); + } + + /** + * Get the actual likelihoods indexes to use given the corresponding diploid allele indexes * * @param originalVC the original VariantContext * @param alleleIndexesToUse the bitset representing the alleles to use (@see #getAlleleIndexBitset) * @return a non-null List */ - private static List getLikelihoodIndexes(final VariantContext originalVC, final boolean[] alleleIndexesToUse) { + private static List getDiploidLikelihoodIndexes(final VariantContext originalVC, final boolean[] alleleIndexesToUse) { + + if (originalVC == null) throw new IllegalArgumentException("the original VC cannot be null"); + if (alleleIndexesToUse == null) throw new IllegalArgumentException("the alleles to use cannot be null"); + + // All samples must be diploid + for ( final Genotype g : originalVC.getGenotypes() ){ + if ( g.getPloidy() != DEFAULT_PLOIDY ) + throw new ReviewedGATKException("All samples must be diploid"); + } final List result = new ArrayList<>(30); - // numLikelihoods takes total # of alleles. Use default # of chromosomes (ploidy) = 2 + // numLikelihoods takes total # of alleles. final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(originalVC.getNAlleles(), DEFAULT_PLOIDY); for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { @@ -652,48 +696,137 @@ public class GATKVariantContextUtils { return result; } + /** + * Get the actual strand aleele counts indexes to use given the corresponding allele indexes + * + * @param alleleIndexesToUse the bitset representing the alleles to use (@see #getAlleleIndexBitset) + * @return a non-null List + */ + private static List getSACIndexes(final boolean[] alleleIndexesToUse) { + + if (alleleIndexesToUse == null) throw new IllegalArgumentException("the alleles to use cannot be null"); + if (alleleIndexesToUse.length == 0) throw new IllegalArgumentException("cannot have no alleles to use"); + + final List result = new ArrayList<>(2 * alleleIndexesToUse.length); + + for (int SACindex = 0; SACindex < alleleIndexesToUse.length; SACindex++) { + if (alleleIndexesToUse[SACindex]) { + result.add(2 * SACindex); + result.add(2 * SACindex + 1); + } + } + + return result; + } + /** * Given an original VariantContext and a list of alleles from that VC to keep, * returns a bitset representing which allele indexes should be kept * - * @param originalVC the original VC - * @param allelesToKeep the list of alleles to keep + * @param originalVC the original VC + * @param allelesToUse the list of alleles to keep * @return non-null bitset */ - private static boolean[] getAlleleIndexBitset(final VariantContext originalVC, final List allelesToKeep) { + private static boolean[] getAlleleIndexBitset(final VariantContext originalVC, final List allelesToUse) { + + if (originalVC == null) throw new IllegalArgumentException("the original VC cannot be null"); + if (allelesToUse == null) throw new IllegalArgumentException("the alleles to use cannot be null"); + final int numOriginalAltAlleles = originalVC.getNAlleles() - 1; final boolean[] alleleIndexesToKeep = new boolean[numOriginalAltAlleles + 1]; // the reference Allele is definitely still used alleleIndexesToKeep[0] = true; - for ( int i = 0; i < numOriginalAltAlleles; i++ ) { - if ( allelesToKeep.contains(originalVC.getAlternateAllele(i)) ) - alleleIndexesToKeep[i+1] = true; + for (int i = 0; i < numOriginalAltAlleles; i++) { + if (allelesToUse.contains(originalVC.getAlternateAllele(i))) + alleleIndexesToKeep[i + 1] = true; } return alleleIndexesToKeep; } /** - * Create the new GenotypesContext with the subsetted PLs and ADs + * Make a new SAC array from the a subset of the genotype's original SAC + * + * @param g the genotype + * @param sacIndexesToUse the indexes in the SAC to use given the allelesToUse (@see #determineSACIndexesToUse()) + * @return subset of SACs from the original genotype, the original SACs if sacIndexesToUse is null + */ + public static int[] makeNewSACs(final Genotype g, final List sacIndexesToUse) { + + if (g == null) throw new IllegalArgumentException("the genotype cannot be null"); + + final int[] oldSACs = getSACs(g); + + if (sacIndexesToUse == null) { + return oldSACs; + } else { + final int[] newSACs = new int[sacIndexesToUse.size()]; + int newIndex = 0; + for (final int oldIndex : sacIndexesToUse) { + newSACs[newIndex++] = oldSACs[oldIndex]; + } + return newSACs; + } + } + + + /** + * Get the genotype SACs + * + * @param g the genotype + * @return an arrays of SACs + * @throws ReviewedGATKException if the type of the SACs is unexpected + */ + private static int[] getSACs(final Genotype g) { + + if ( g == null ) throw new IllegalArgumentException("the Genotype cannot be null"); + if ( !g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY) ) + throw new IllegalArgumentException("Genotype must have SAC"); + + if ( g.getExtendedAttributes().get(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY).getClass().equals(String.class) ) { + final String SACsString = (String) g.getExtendedAttributes().get(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY); + ArrayList stringSACs = Utils.split(SACsString, ","); + final int[] intSACs = new int[stringSACs.size()]; + int i = 0; + for (String sac : stringSACs) + intSACs[i++] = Integer.parseInt(sac); + + return intSACs; + } + else if ( g.getExtendedAttributes().get(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY).getClass().equals(int[].class) ) + return (int[]) g.getExtendedAttributes().get(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY); + else + throw new ReviewedGATKException("Unexpected SAC type"); + } + + /** + * Create the new GenotypesContext with the subsetted PLs, SACs and ADs * * @param originalGs the original GenotypesContext - * @param vc the original VariantContext + * @param originalVC the original VariantContext * @param allelesToUse the actual alleles to use with the new Genotypes - * @param likelihoodIndexesToUse the indexes in the PL to use given the allelesToUse (@see #determineLikelihoodIndexesToUse()) + * @param likelihoodIndexesToUse the indexes in the PL to use given the allelesToUse (@see #determineDiploidLikelihoodIndexesToUse()) + * @param sacIndexesToUse the indexes in the SAC to use given the allelesToUse (@see #determineSACIndexesToUse()) * @param assignGenotypes assignment strategy for the (subsetted) PLs * @return a new non-null GenotypesContext */ private static GenotypesContext createGenotypesWithSubsettedLikelihoods(final GenotypesContext originalGs, - final VariantContext vc, + final VariantContext originalVC, final List allelesToUse, final List likelihoodIndexesToUse, + final List sacIndexesToUse, final GenotypeAssignmentMethod assignGenotypes) { + + if ( originalGs == null ) throw new IllegalArgumentException("the original GenotypesContext cannot be null"); + if ( originalVC == null ) throw new IllegalArgumentException("the original VariantContext cannot be null"); + if ( allelesToUse == null ) throw new IllegalArgumentException("the alleles to use cannot be null"); + // the new genotypes to create final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); // make sure we are seeing the expected number of likelihoods per sample - final int expectedNumLikelihoods = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), 2); + final int expectedNumLikelihoods = GenotypeLikelihoods.numLikelihoods(originalVC.getNAlleles(), 2); // the samples final List sampleIndices = originalGs.getSampleNamesOrderedByName(); @@ -703,7 +836,7 @@ public class GATKVariantContextUtils { final Genotype g = originalGs.get(sampleIndices.get(k)); final GenotypeBuilder gb = new GenotypeBuilder(g); - // create the new likelihoods array from the alleles we are allowed to use + // create the new likelihoods array from the used alleles double[] newLikelihoods; if ( !g.hasLikelihoods() ) { // we don't have any likelihoods, so we null out PLs and make G ./. @@ -714,7 +847,7 @@ public class GATKVariantContextUtils { if ( likelihoodIndexesToUse == null ) { newLikelihoods = originalLikelihoods; } else if ( originalLikelihoods.length != expectedNumLikelihoods ) { - logger.debug("Wrong number of likelihoods in sample " + g.getSampleName() + " at " + vc + " got " + g.getLikelihoodsString() + " but expected " + expectedNumLikelihoods); + logger.debug("Wrong number of likelihoods in sample " + g.getSampleName() + " at " + originalVC + " got " + g.getLikelihoodsString() + " but expected " + expectedNumLikelihoods); newLikelihoods = null; } else { newLikelihoods = new double[likelihoodIndexesToUse.size()]; @@ -732,11 +865,17 @@ public class GATKVariantContextUtils { gb.PL(newLikelihoods); } + // create the new strand allele counts array from the used alleles + if ( g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY)){ + int[] newSACs = makeNewSACs(g, sacIndexesToUse); + gb.attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, newSACs); + } + updateGenotypeAfterSubsetting(g.getAlleles(), gb, assignGenotypes, newLikelihoods, allelesToUse); newGTs.add(gb.make()); } - return fixADFromSubsettedAlleles(newGTs, vc, allelesToUse); + return fixADFromSubsettedAlleles(newGTs, originalVC, allelesToUse); } private static boolean likelihoodsAreUninformative(final double[] likelihoods) { @@ -1165,20 +1304,23 @@ public class GATKVariantContextUtils { } /** - * Updates the PLs and AD of the Genotypes in the newly selected VariantContext to reflect the fact that some alleles + * Updates the PLs, SACs and AD of the Genotypes in the newly selected VariantContext to reflect the fact that some alleles * from the original VariantContext are no longer present. * * @param selectedVC the selected (new) VariantContext * @param originalVC the original VariantContext * @return a new non-null GenotypesContext */ - public static GenotypesContext updatePLsAndAD(final VariantContext selectedVC, final VariantContext originalVC) { + public static GenotypesContext updatePLsSACsAD(final VariantContext selectedVC, final VariantContext originalVC) { + if ( selectedVC == null ) throw new IllegalArgumentException("the selected VariantContext cannot be null"); + if ( originalVC == null ) throw new IllegalArgumentException("the original VariantContext cannot be null"); + final int numNewAlleles = selectedVC.getAlleles().size(); final int numOriginalAlleles = originalVC.getAlleles().size(); // if we have more alternate alleles in the selected VC than in the original VC, then something is wrong if ( numNewAlleles > numOriginalAlleles ) - throw new IllegalArgumentException("Attempting to fix PLs and AD from what appears to be a *combined* VCF and not a selected one"); + throw new IllegalArgumentException("Attempting to fix PLs, SACs and AD from what appears to be a *combined* VCF and not a selected one"); final GenotypesContext oldGs = selectedVC.getGenotypes(); @@ -1186,24 +1328,31 @@ public class GATKVariantContextUtils { if ( numNewAlleles == numOriginalAlleles ) return oldGs; - return fixGenotypesFromSubsettedAlleles(oldGs, originalVC, selectedVC.getAlleles()); + return fixDiploidGenotypesFromSubsettedAlleles(oldGs, originalVC, selectedVC.getAlleles()); } /** - * Fix the PLs and ADs for the GenotypesContext of a VariantContext that has been subset + * Fix the PLs, SACs and ADs for the GenotypesContext of a VariantContext that has been subset * * @param originalGs the original GenotypesContext * @param originalVC the original VariantContext * @param allelesToUse the new (sub)set of alleles to use * @return a new non-null GenotypesContext */ - static private GenotypesContext fixGenotypesFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { + static private GenotypesContext fixDiploidGenotypesFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { - // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward - final List likelihoodIndexesToUse = determineLikelihoodIndexesToUse(originalVC, allelesToUse); + if ( originalGs == null ) throw new IllegalArgumentException("the selected GenotypesContext cannot be null"); + if ( originalVC == null ) throw new IllegalArgumentException("the original VariantContext cannot be null"); + if ( allelesToUse == null ) throw new IllegalArgumentException("the alleles to use cannot be null"); + + // find the likelihoods indexes to use from the used alternate alleles + final List likelihoodIndexesToUse = determineDiploidLikelihoodIndexesToUse(originalVC, allelesToUse); + + // find the strand allele count indexes to use from the used alternate alleles + final List sacIndexesToUse = determineSACIndexesToUse(originalVC, allelesToUse); // create the new genotypes - return createGenotypesWithSubsettedLikelihoods(originalGs, originalVC, allelesToUse, likelihoodIndexesToUse, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES); + return createGenotypesWithSubsettedLikelihoods(originalGs, originalVC, allelesToUse, likelihoodIndexesToUse, sacIndexesToUse, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES); } /** diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java index d8f76e9fc..3b23f7e51 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -1150,9 +1150,9 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(50).make(); // make sure we don't screw up the simple case - final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make(); - final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make(); - final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new int[]{10,2}).PL(homVarPL).GQ(8).make(); + final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{5, 10, 15, 20}).GQ(8).make(); + final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10, 2}).PL(hetPL).attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{5, 10, 15, 20}).GQ(8).make(); + final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new int[]{10, 2}).PL(homVarPL).attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{5, 10, 15, 20}).GQ(8).make(); tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), AC, Arrays.asList(new GenotypeBuilder(aaGT).make())}); tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), AC, Arrays.asList(new GenotypeBuilder(acGT).make())}); @@ -1298,9 +1298,11 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { builder.DP(10); builder.GQ(30); builder.AD(alleles.size() == 1 ? new int[]{1} : (alleles.size() == 2 ? new int[]{1, 2} : new int[]{1, 2, 3})); - builder.PL(alleles.size() == 1 ? new int[]{1} : (alleles.size() == 2 ? new int[]{1,2} : new int[]{1,2,3})); + builder.PL(alleles.size() == 1 ? new int[]{1} : (alleles.size() == 2 ? new int[]{1, 2} : new int[]{1, 2, 3})); + builder.attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, + alleles.size() == 1 ? new int[]{1, 2} : (alleles.size() == 2 ? new int[]{1, 2, 3, 4} : new int[]{1, 2, 3, 4, 5, 6})); final List refs = Collections.nCopies(alleles.size(), Aref); - tests.put(builder.make(), builder.alleles(refs).noAD().noPL().make()); + tests.put(builder.make(), builder.alleles(refs).noAD().noPL().attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, null).make()); } } @@ -1311,7 +1313,6 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { Assert.assertEquals(gc.size(), genotypes.size()); for ( int i = 0; i < genotypes.size(); i++ ) { -// logger.warn("Testing " + genotypes.get(i) + " => " + gc.get(i) + " " + tests.get(genotypes.get(i))); assertGenotypesAreEqual(gc.get(i), tests.get(genotypes.get(i))); } } @@ -1324,8 +1325,8 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { // // -------------------------------------------------------------------------------- - @DataProvider(name = "updatePLsAndADData") - public Object[][] makeUpdatePLsAndADData() { + @DataProvider(name = "updatePLsSACsAndADData") + public Object[][] makeUpdatePLsSACsAndADData() { List tests = new ArrayList<>(); final Allele A = Allele.create("A", true); @@ -1350,9 +1351,9 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(100).make(); // make sure we don't screw up the simple case where no selection happens - final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make(); - final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make(); - final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new int[]{10,2}).PL(homVarPL).GQ(8).make(); + final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{5, 10, 15, 20}).GQ(8).make(); + final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10, 2}).PL(hetPL).attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{5, 10, 15, 20}).GQ(8).make(); + final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new int[]{10, 2}).PL(homVarPL).attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{5, 10, 15, 20}).GQ(8).make(); tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(aaGT).make())}); tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(acGT).make())}); @@ -1379,44 +1380,59 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final int[] hetCG3AllelesAD = new int[]{0, 12, 11}; // AA, AC, CC, AG, CG, GG final int[] homG3AllelesAD = new int[]{0, 1, 21}; // AA, AC, CC, AG, CG, GG + final int[] homRef3AllelesSAC = new int[]{20, 19, 0, 1, 3, 4}; + final int[] hetRefC3AllelesSAC = new int[]{10, 9, 10, 9, 1, 1}; + final int[] homC3AllelesSAC = new int[]{0, 0, 20, 20, 1, 1}; + final int[] hetRefG3AllelesSAC = new int[]{10, 10, 0, 0, 11, 11}; + final int[] hetCG3AllelesSAC = new int[]{0, 0, 12, 12, 11, 11}; // AA, AC, CC, AG, CG, GG + final int[] homG3AllelesSAC = new int[]{0, 0, 1, 1, 21, 21}; // AA, AC, CC, AG, CG, GG + tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homRef3AllelesAD).PL(homRef3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homRef3AllelesAD).PL(homRef3AllelesPL). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, homRef3AllelesSAC).make()).make(), + new VariantContextBuilder(vcBase).alleles(AC).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).AD(new int[]{20, 0}). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{20, 19, 0, 1}).GQ(100).make())}); + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetRefC3AllelesAD).PL(hetRefC3AllelesPL). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, hetRefC3AllelesSAC).make()).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).AD(new int[]{20, 0}).GQ(100).make())}); - + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-10, 0, -20}).AD(new int[]{10, 10}). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{10, 9, 10, 9}).GQ(100).make())}); tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetRefC3AllelesAD).PL(hetRefC3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homC3AllelesAD).PL(homC3AllelesPL). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, homC3AllelesSAC).make()).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-10, 0, -20}).AD(new int[]{10, 10}).GQ(100).make())}); - + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -10, 0}).AD(new int[]{0, 20}). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{0, 0, 20, 20}).GQ(100).make())}); tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homC3AllelesAD).PL(homC3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AC).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -10, 0}).AD(new int[]{0, 20}).GQ(100).make())}); - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetRefG3AllelesAD).PL(hetRefG3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetRefG3AllelesAD).PL(hetRefG3AllelesPL). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, hetRefG3AllelesSAC).make()).make(), new VariantContextBuilder(vcBase).alleles(AG).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, 0, -50}).AD(new int[]{10, 11}).GQ(100).make())}); - + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, 0, -50}).AD(new int[]{10, 11}). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{10, 10, 11, 11}).GQ(100).make())}); tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetCG3AllelesAD).PL(hetCG3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetCG3AllelesAD).PL(hetCG3AllelesPL). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, hetCG3AllelesSAC).make()).make(), new VariantContextBuilder(vcBase).alleles(AG).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -20, -30}).AD(new int[]{0, 11}).GQ(100).make())}); - + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -20, -30}).AD(new int[]{0, 11}). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{0, 0, 11, 11}).GQ(100).make())}); tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homG3AllelesAD).PL(homG3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homG3AllelesAD).PL(homG3AllelesPL). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, homG3AllelesSAC).make()).make(), new VariantContextBuilder(vcBase).alleles(AG).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -40, 0}).AD(new int[]{0, 21}).GQ(100).make())}); + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -40, 0}).AD(new int[]{0, 21}). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{0, 0, 21, 21}).GQ(100).make())}); return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "updatePLsAndADData") + @Test(dataProvider = "updatePLsSACsAndADData") public void testUpdatePLsAndADData(final VariantContext originalVC, final VariantContext selectedVC, final List expectedGenotypes) { final VariantContext selectedVCwithGTs = new VariantContextBuilder(selectedVC).genotypes(originalVC.getGenotypes()).make(); - final GenotypesContext actual = GATKVariantContextUtils.updatePLsAndAD(selectedVCwithGTs, originalVC); + final GenotypesContext actual = GATKVariantContextUtils.updatePLsSACsAD(selectedVCwithGTs, originalVC); Assert.assertEquals(actual.size(), expectedGenotypes.size()); for ( final Genotype expected : expectedGenotypes ) { @@ -1629,5 +1645,20 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { // Throws an exception if the ref allele length <= ref allele length to extend Map map = GATKVariantContextUtils.createAlleleMapping(Aref, vc, alleles); } + + @Test + public void testDetermineSACIndexesToUse(){ + final VariantContext vc = makeVC("vc", Arrays.asList(Aref, T, C)); + Assert.assertEquals(GATKVariantContextUtils.determineSACIndexesToUse(vc, Arrays.asList(Aref, C)), Arrays.asList(0, 1, 4, 5)); + Assert.assertEquals(GATKVariantContextUtils.determineSACIndexesToUse(vc, Arrays.asList(G)), Arrays.asList(0, 1)); + } + + @Test + public void testMakeNewSACs(){ + int[] expected = {10, 20} ; + final Genotype g = new GenotypeBuilder().alleles(Arrays.asList(Allele.create("A", true), Allele.create("G"))). + attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, new int[]{5, 10, 15, 20}).make(); + Assert.assertEquals(GATKVariantContextUtils.makeNewSACs(g, Arrays.asList(1, 3)), expected); + } }