From 9c8f035780b7c3bf422590e0a69b66b63e9d9ab1 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Tue, 20 Oct 2015 13:46:03 -0400 Subject: [PATCH] LeftAlignAndTrimVariants --splitMultiallelics keeps GT if valid --- ...ftAlignAndTrimVariantsIntegrationTest.java | 31 ++- .../LeftAlignAndTrimVariants.java | 42 ++-- .../variant/GATKVariantContextUtils.java | 183 ++++++++++++++++-- 3 files changed, 230 insertions(+), 26 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java index 4b5d9979f..7ff27daf3 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsIntegrationTest.java @@ -136,9 +136,8 @@ public class LeftAlignAndTrimVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --no_cmdline_in_header -split", 1, - Arrays.asList("534bea653d4a0e59e74f4107c1768558")); + Arrays.asList("1158324223c312e4767fcefe9dde2fe1")); executeTest("test left alignment with hard multiple alleles", spec); - } @Test @@ -146,8 +145,34 @@ public class LeftAlignAndTrimVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --dontTrimAlleles --no_cmdline_in_header -split", 1, - Arrays.asList("189b8136ee62b54bf7b227e99c892440")); + Arrays.asList("923cb1d06e2d0d9a98cda8f8f637d108")); executeTest("test left alignment with hard multiple alleles, don't trim", spec); + } + @Test + public void testLeftAlignmentWithMultialleliecGenotypes() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "multiallele-gt.vcf --no_cmdline_in_header -split", + 1, + Arrays.asList("f7be485b0cc7b8db75b7139f31c0708d")); + executeTest("test left alignment of multiple alleles with genoptypes", spec); + } + + @Test + public void testLeftAlignmentWithMultialleliecGenotypesHetNoRef() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "multiallele-gt-het-noref.vcf --no_cmdline_in_header -split", + 1, + Arrays.asList("cd686641ab7fe491a0acc7ff07535192")); + executeTest("test left alignment of multiple alleles with genoptypes, including het-noref", spec); + } + + @Test + public void testLeftAlignmentWithMultialleliecGenotypesKeepOriginalAC() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "multiallele-gt.vcf --no_cmdline_in_header -split -keepOriginalAC", + 1, + Arrays.asList("5da4ca9705fbb63446c1d317f7b6cae0")); + executeTest("test left alignment of multiple alleles with genoptypes, keep original AC", spec); } } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java index 0ba7e1013..49aa15477 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java @@ -44,6 +44,9 @@ import org.broadinstitute.gatk.engine.walkers.Window; import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.help.HelpConstants; +import org.broadinstitute.gatk.utils.variant.ChromosomeCountConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; @@ -118,7 +121,7 @@ import java.util.*; * --splitMultiallelics * * - *

Split multiallelics into biallics, left align but don't trim alleles

+ *

Split multiallelics into biallics, left align but don't trim alleles, and store the original AC, AF, and AN values

*
  * java -jar GenomeAnalysisTK.jar \
  *   -T LeftAlignAndTrimVariants \
@@ -127,6 +130,7 @@ import java.util.*;
  *   -o output.vcf \
  *   --splitMultiallelics \
  *   --dontTrimAlleles
+ *   --keepOriginalAC
  * 
* */ @@ -153,6 +157,14 @@ public class LeftAlignAndTrimVariants extends RodWalker { @Argument(fullName="splitMultiallelics", shortName="split", doc="Split multiallelic records and left-align individual alleles", required=false) protected boolean splitMultiallelics = false; + /** + * When subsetting a callset, this tool recalculates the AC, AF, and AN values corresponding to the contents of the + * subset. If this flag is enabled, the original values of those annotations will be stored in new annotations called + * AC_Orig, AF_Orig, and AN_Orig. + */ + @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Store the original AC, AF, and AN values after subsetting", required=false) + private boolean keepOriginalChrCounts = false; + @Output(doc="File to which variants should be written") protected VariantContextWriter baseWriter = null; @@ -165,11 +177,21 @@ public class LeftAlignAndTrimVariants extends RodWalker { private int referenceWindowStop; public void initialize() { - String trackName = variantCollection.variants.getName(); - Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); - Map vcfHeaders = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); + final String trackName = variantCollection.variants.getName(); + final Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); + final Map vcfHeaders = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); + + final Set headerLines = new HashSet<>(); + headerLines.addAll(vcfHeaders.get(trackName).getMetaDataInSortedOrder()) ; + if ( splitMultiallelics ) + headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions)); + + if (keepOriginalChrCounts) { + headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AC_KEY)); + headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AF_KEY)); + headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AN_KEY)); + } - Set headerLines = vcfHeaders.get(trackName).getMetaDataInSortedOrder(); baseWriter.writeHeader(new VCFHeader(headerLines, samples)); writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, MAX_INDEL_LENGTH); @@ -187,7 +209,8 @@ public class LeftAlignAndTrimVariants extends RodWalker { for ( final VariantContext vc : VCs ) { // split first into biallelics, and optionally don't trim alleles to minimal representation if (splitMultiallelics) { - final List vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc); + final List vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc, false, + GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, keepOriginalChrCounts); for (final VariantContext biallelicVC: vcList) { changedSites += trimAlignWrite(biallelicVC, ref, vcList.size()); } @@ -237,10 +260,6 @@ public class LeftAlignAndTrimVariants extends RodWalker { // align the VC final Pair result = alignAndWrite(v, ref); - // strip out PLs and AD if we've subsetted the alleles - if ( numBiallelics > 1 ) - result.first = new VariantContextBuilder(result.first).genotypes(GATKVariantContextUtils.stripPLsAndAD(result.first.getGenotypes())).make(); - // write out new VC writer.add(result.first); @@ -301,7 +320,6 @@ public class LeftAlignAndTrimVariants extends RodWalker { if ( !newCigar.equals(originalCigar) && newCigar.numCigarElements() > 1 ) { int difference = originalIndex - newCigar.getCigarElement(0).getLength(); VariantContext newVC = new VariantContextBuilder(vc).start(vc.getStart()-difference).stop(vc.getEnd()-difference).make(); - //System.out.println("Moving record from " + vc.getChr()+":"+vc.getStart() + " to " + vc.getChr()+":"+(vc.getStart()-difference)); final int indelIndex = originalIndex-difference; final byte[] newBases = new byte[indelLength + 1]; @@ -323,7 +341,7 @@ public class LeftAlignAndTrimVariants extends RodWalker { * @param ref Ref bases * @param indexOfRef Index in ref where to create indel * @param indelLength Indel length - * @return + * @return Haplotype, containing the reference and the indel */ @Requires({"vc != null","ref != null", "indexOfRef +indelLength < ref.length", "vc.getNAlleles() == 2"}) @Ensures("result != null") diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java index 520582a47..db964663a 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java @@ -31,6 +31,8 @@ import htsjdk.tribble.TribbleException; import htsjdk.tribble.util.popgen.HardyWeinbergCalculation; import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFHeaderLineCount; +import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.*; @@ -558,6 +560,12 @@ public class GATKVariantContextUtils { */ SET_TO_NO_CALL, + /** + * set all of the genotype GT values to NO_CALL and remove annotations + */ + SET_TO_NO_CALL_NO_ANNOTATIONS, + + /** * Use the subsetted PLs to greedily assigned genotypes */ @@ -589,7 +597,7 @@ public class GATKVariantContextUtils { } /** - * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) + * Subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) * * @param vc variant context with genotype likelihoods * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** @@ -601,6 +609,7 @@ public class GATKVariantContextUtils { final GenotypeAssignmentMethod assignGenotypes) { if ( vc == null ) throw new IllegalArgumentException("the VariantContext cannot be null"); if ( allelesToUse == null ) throw new IllegalArgumentException("the alleles to use cannot be null"); + if ( allelesToUse.isEmpty() ) throw new IllegalArgumentException("must have alleles to use"); if ( allelesToUse.get(0).isNonReference() ) throw new IllegalArgumentException("First allele must be the reference allele"); if ( allelesToUse.size() == 1 ) throw new IllegalArgumentException("Cannot subset to only 1 alt allele"); @@ -896,6 +905,10 @@ public class GATKVariantContextUtils { final GenotypeAssignmentMethod assignmentMethod, final double[] newLikelihoods, final List allelesToUse) { + if ( originalGT == null ) throw new IllegalArgumentException("originalGT cannot be null"); + if ( gb == null ) throw new IllegalArgumentException("gb cannot be null"); + if ( allelesToUse.isEmpty() || allelesToUse == null ) throw new IllegalArgumentException("allelesToUse cannot be empty or null"); + switch ( assignmentMethod ) { case DO_NOT_ASSIGN_GENOTYPES: break; @@ -903,6 +916,13 @@ public class GATKVariantContextUtils { gb.alleles(NO_CALL_ALLELES); gb.noGQ(); break; + case SET_TO_NO_CALL_NO_ANNOTATIONS: + gb.alleles(NO_CALL_ALLELES); + gb.noGQ(); + gb.noAD(); + gb.noPL(); + gb.noAttributes(); + break; case USE_PLS_TO_ASSIGN: if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) { // if there is no mass on the (new) likelihoods, then just no-call the sample @@ -918,12 +938,10 @@ public class GATKVariantContextUtils { break; case BEST_MATCH_TO_ORIGINAL: final List best = new LinkedList<>(); - final Allele ref = allelesToUse.get(0); // WARNING -- should be checked in input argument + final Allele ref = allelesToUse.get(0); for ( final Allele originalAllele : originalGT ) { best.add(allelesToUse.contains(originalAllele) ? originalAllele : ref); } - gb.noGQ(); - gb.noPL(); gb.alleles(best); break; } @@ -970,7 +988,7 @@ public class GATKVariantContextUtils { /** * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs * - * @param vc variant context with genotype likelihoods + * @param vc variant context with genotype likelihoods * @return genotypes context */ public static GenotypesContext assignDiploidGenotypes(final VariantContext vc) { @@ -999,7 +1017,6 @@ public class GATKVariantContextUtils { * Split variant context into its biallelic components if there are more than 2 alleles * * For VC has A/B/C alleles, returns A/B and A/C contexts. - * Genotypes are all no-calls now (it's not possible to fix them easily) * Alleles are right trimmed to satisfy VCF conventions * * If vc is biallelic or non-variant it is just returned @@ -1008,21 +1025,63 @@ public class GATKVariantContextUtils { * * @param vc a potentially multi-allelic variant context * @param trimLeft if true, we will also left trim alleles, potentially moving the resulting vcs forward on the genome + * @param genotypeAssignmentMethod assignment strategy for the (subsetted) PLs * @return a list of bi-allelic (or monomorphic) variant context */ - public static List splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod) { + public static List splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod){ + return splitVariantContextToBiallelics(vc, trimLeft,genotypeAssignmentMethod, false); + } + + /** + * Split variant context into its biallelic components if there are more than 2 alleles + * + * For VC has A/B/C alleles, returns A/B and A/C contexts. + * Alleles are right trimmed to satisfy VCF conventions + * + * If vc is biallelic or non-variant it is just returned + * + * Chromosome counts are updated (but they are by definition 0) + * + * @param vc a potentially multi-allelic variant context + * @param trimLeft if true, we will also left trim alleles, potentially moving the resulting vcs forward on the genome + * @param genotypeAssignmentMethod assignment strategy for the (subsetted) PLs + * @param keepOriginalChrCounts keep the orignal chromosome counts before subsetting + * @return a list of bi-allelic (or monomorphic) variant context + */ + public static List splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod, + final boolean keepOriginalChrCounts) { + if ( vc == null ) throw new IllegalArgumentException("vc cannot be null"); + if ( ! vc.isVariant() || vc.isBiallelic() ) // non variant or biallelics already satisfy the contract return Collections.singletonList(vc); else { final List biallelics = new LinkedList<>(); + // if any of the genotypes ar ehet-not-ref (i.e. 1/2), set all of them to no-call + final GenotypeAssignmentMethod genotypeAssignmentMethodUsed = hasHetNonRef(vc.getGenotypes()) ? GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL_NO_ANNOTATIONS : genotypeAssignmentMethod; + for ( final Allele alt : vc.getAlternateAlleles() ) { - VariantContextBuilder builder = new VariantContextBuilder(vc); + final VariantContextBuilder builder = new VariantContextBuilder(vc); + + // make biallelic alleles final List alleles = Arrays.asList(vc.getReference(), alt); builder.alleles(alleles); - builder.genotypes(subsetDiploidAlleles(vc, alleles, genotypeAssignmentMethod)); - VariantContextUtils.calculateChromosomeCounts(builder, true); + + // since the VC has been subset, remove the invalid attributes + for ( final String key : vc.getAttributes().keySet() ) { + if ( !(key.equals(VCFConstants.ALLELE_COUNT_KEY) || key.equals(VCFConstants.ALLELE_FREQUENCY_KEY) || key.equals(VCFConstants.ALLELE_NUMBER_KEY)) || + genotypeAssignmentMethodUsed == GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL_NO_ANNOTATIONS ) { + builder.rmAttribute(key); + } + } + + // subset INFO field annotations if available if genotype is called + if ( genotypeAssignmentMethodUsed != GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL_NO_ANNOTATIONS && + genotypeAssignmentMethodUsed != GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL ) + addInfoFiledAnnotations(vc, builder, alt, keepOriginalChrCounts); + + builder.genotypes(subsetDiploidAlleles(vc, alleles, genotypeAssignmentMethodUsed)); final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true); biallelics.add(trimmed); } @@ -1031,6 +1090,21 @@ public class GATKVariantContextUtils { } } + /** + * Check if any of the genotypes is heterozygous, non-reference (i.e. 1/2) + * + * @param genotypesContext genotype information + * @return true if any of the genotypes are heterozygous, non-reference, false otherwise + */ + private static boolean hasHetNonRef(final GenotypesContext genotypesContext ){ + for (final Genotype gt : genotypesContext ) { + if (gt.isHetNonRef()) + return true; + } + return false; + } + + public static Genotype removePLsAndAD(final Genotype g) { return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g; } @@ -1336,7 +1410,7 @@ public class GATKVariantContextUtils { * * @param originalGs the original GenotypesContext * @param originalVC the original VariantContext - * @param allelesToUse the new (sub)set of alleles to use + * @param allelesToUse the new subset of alleles to use * @return a new non-null GenotypesContext */ static private GenotypesContext fixDiploidGenotypesFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { @@ -2121,4 +2195,91 @@ public class GATKVariantContextUtils { return -1; } + + /** + * Add the VCF INFO field annotations for the used alleles + * + * @param vc original variant context + * @param builder variant context builder with subset of original variant context's alleles + * @param altAllele alternate allele + * @param keepOriginalChrCounts keep the orignal chromosome counts before subsetting + * @return variant context builder with updated INFO field attribute values + */ + private static void addInfoFiledAnnotations(final VariantContext vc, final VariantContextBuilder builder, final Allele altAllele, + final boolean keepOriginalChrCounts){ + + if (vc == null) throw new IllegalArgumentException("the variant context cannot be null"); + if (builder == null) throw new IllegalArgumentException("the variant context builder cannot be null"); + if (builder.getAlleles() == null) throw new IllegalArgumentException("the variant context builder alleles cannot be null"); + + final List alleles = builder.getAlleles(); + if (alleles.size() < 2) throw new IllegalArgumentException("the variant context builder must contain at least 2 alleles"); + + // don't have to subset, the original vc has the same number and hence, the same alleles + boolean keepOriginal = ( vc.getAlleles().size() == builder.getAlleles().size() ); + + if ( keepOriginalChrCounts ) { + if (vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) + builder.attribute(GATKVCFConstants.ORIGINAL_AC_KEY, keepOriginal ? + vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY) : getAltAlleleInfoFieldValue(VCFConstants.ALLELE_COUNT_KEY, vc, altAllele)); + if (vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)) + builder.attribute(GATKVCFConstants.ORIGINAL_AF_KEY, keepOriginal ? + vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) : getAltAlleleInfoFieldValue(VCFConstants.ALLELE_FREQUENCY_KEY, vc, altAllele)); + if (vc.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY)) { + builder.attribute(GATKVCFConstants.ORIGINAL_AN_KEY, vc.getAttribute(VCFConstants.ALLELE_NUMBER_KEY)); + } + } + + VariantContextUtils.calculateChromosomeCounts(builder, true); + } + + + /** + * Get the alternate allele INFO field value + * + * @param infoFieldName VCF INFO field name + * @param vc variant context + * @param altAllele the alternate allele + * @return alternate allele INFO field value + * @throws ReviewedGATKException if the alternate allele is part of the variant context + */ + private static Object getAltAlleleInfoFieldValue(final String infoFieldName, final VariantContext vc, final Allele altAllele) { + + //final String[] splitOriginalField = vc.getAttribute(infoFieldName).toString().split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR); + final Object[] splitOriginalField = getVAttributeValues(vc.getAttribute(infoFieldName)); + + // subset the field + final boolean[] alleleIndexesToUse = getAlleleIndexBitset(vc, Arrays.asList(altAllele)); + + // skip the first allele, which is the reference + for (int i = 1; i < alleleIndexesToUse.length; i++) { + if (alleleIndexesToUse[i] == true) + return splitOriginalField[i-1]; + } + + throw new ReviewedGATKException("Alternate allele " + altAllele.toString() + " not in Variant Context " + vc.toString()); + } + + /** + * Pulls out the appropriate values for the INFO field attribute + * + * @param attribute INFO field attribute + * @return tokenized attribute values + */ + private static Object[] getVAttributeValues(final Object attribute) { + + if (attribute == null) throw new IllegalArgumentException("the attribute cannot be null"); + + // break the original attributes into separate tokens + final Object[] tokens; + if ( attribute.getClass().isArray() ) + tokens = (Object[])attribute; + else if ( List.class.isAssignableFrom(attribute.getClass()) ) + tokens = ((List)attribute).toArray(); + else + tokens = attribute.toString().split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR); + + return tokens; + } } +