LeftAlignAndTrimVariants --splitMultiallelics keeps GT if valid

This commit is contained in:
Ron Levine 2015-10-20 13:46:03 -04:00
parent 4767a83d8a
commit 9c8f035780
3 changed files with 230 additions and 26 deletions

View File

@ -136,9 +136,8 @@ public class LeftAlignAndTrimVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --no_cmdline_in_header -split",
1,
Arrays.asList("534bea653d4a0e59e74f4107c1768558"));
Arrays.asList("1158324223c312e4767fcefe9dde2fe1"));
executeTest("test left alignment with hard multiple alleles", spec);
}
@Test
@ -146,8 +145,34 @@ public class LeftAlignAndTrimVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "forHardLeftAlignVariantsTest.vcf --dontTrimAlleles --no_cmdline_in_header -split",
1,
Arrays.asList("189b8136ee62b54bf7b227e99c892440"));
Arrays.asList("923cb1d06e2d0d9a98cda8f8f637d108"));
executeTest("test left alignment with hard multiple alleles, don't trim", spec);
}
@Test
public void testLeftAlignmentWithMultialleliecGenotypes() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "multiallele-gt.vcf --no_cmdline_in_header -split",
1,
Arrays.asList("f7be485b0cc7b8db75b7139f31c0708d"));
executeTest("test left alignment of multiple alleles with genoptypes", spec);
}
@Test
public void testLeftAlignmentWithMultialleliecGenotypesHetNoRef() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "multiallele-gt-het-noref.vcf --no_cmdline_in_header -split",
1,
Arrays.asList("cd686641ab7fe491a0acc7ff07535192"));
executeTest("test left alignment of multiple alleles with genoptypes, including het-noref", spec);
}
@Test
public void testLeftAlignmentWithMultialleliecGenotypesKeepOriginalAC() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T LeftAlignAndTrimVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "multiallele-gt.vcf --no_cmdline_in_header -split -keepOriginalAC",
1,
Arrays.asList("5da4ca9705fbb63446c1d317f7b6cae0"));
executeTest("test left alignment of multiple alleles with genoptypes, keep original AC", spec);
}
}

View File

@ -44,6 +44,9 @@ import org.broadinstitute.gatk.engine.walkers.Window;
import org.broadinstitute.gatk.engine.SampleUtils;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import org.broadinstitute.gatk.utils.variant.ChromosomeCountConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
@ -118,7 +121,7 @@ import java.util.*;
* --splitMultiallelics
* </pre>
*
* <h4>Split multiallelics into biallics, left align but don't trim alleles</h4>
* <h4>Split multiallelics into biallics, left align but don't trim alleles, and store the original AC, AF, and AN values</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -T LeftAlignAndTrimVariants \
@ -127,6 +130,7 @@ import java.util.*;
* -o output.vcf \
* --splitMultiallelics \
* --dontTrimAlleles
* --keepOriginalAC
* </pre>
*
*/
@ -153,6 +157,14 @@ public class LeftAlignAndTrimVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="splitMultiallelics", shortName="split", doc="Split multiallelic records and left-align individual alleles", required=false)
protected boolean splitMultiallelics = false;
/**
* When subsetting a callset, this tool recalculates the AC, AF, and AN values corresponding to the contents of the
* subset. If this flag is enabled, the original values of those annotations will be stored in new annotations called
* AC_Orig, AF_Orig, and AN_Orig.
*/
@Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Store the original AC, AF, and AN values after subsetting", required=false)
private boolean keepOriginalChrCounts = false;
@Output(doc="File to which variants should be written")
protected VariantContextWriter baseWriter = null;
@ -165,11 +177,21 @@ public class LeftAlignAndTrimVariants extends RodWalker<Integer, Integer> {
private int referenceWindowStop;
public void initialize() {
String trackName = variantCollection.variants.getName();
Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName));
Map<String, VCFHeader> vcfHeaders = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName));
final String trackName = variantCollection.variants.getName();
final Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName));
final Map<String, VCFHeader> vcfHeaders = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName));
final Set<VCFHeaderLine> headerLines = new HashSet<>();
headerLines.addAll(vcfHeaders.get(trackName).getMetaDataInSortedOrder()) ;
if ( splitMultiallelics )
headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions));
if (keepOriginalChrCounts) {
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AC_KEY));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AF_KEY));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AN_KEY));
}
Set<VCFHeaderLine> headerLines = vcfHeaders.get(trackName).getMetaDataInSortedOrder();
baseWriter.writeHeader(new VCFHeader(headerLines, samples));
writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, MAX_INDEL_LENGTH);
@ -187,7 +209,8 @@ public class LeftAlignAndTrimVariants extends RodWalker<Integer, Integer> {
for ( final VariantContext vc : VCs ) {
// split first into biallelics, and optionally don't trim alleles to minimal representation
if (splitMultiallelics) {
final List<VariantContext> vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc);
final List<VariantContext> vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc, false,
GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, keepOriginalChrCounts);
for (final VariantContext biallelicVC: vcList) {
changedSites += trimAlignWrite(biallelicVC, ref, vcList.size());
}
@ -237,10 +260,6 @@ public class LeftAlignAndTrimVariants extends RodWalker<Integer, Integer> {
// align the VC
final Pair<VariantContext,Integer> result = alignAndWrite(v, ref);
// strip out PLs and AD if we've subsetted the alleles
if ( numBiallelics > 1 )
result.first = new VariantContextBuilder(result.first).genotypes(GATKVariantContextUtils.stripPLsAndAD(result.first.getGenotypes())).make();
// write out new VC
writer.add(result.first);
@ -301,7 +320,6 @@ public class LeftAlignAndTrimVariants extends RodWalker<Integer, Integer> {
if ( !newCigar.equals(originalCigar) && newCigar.numCigarElements() > 1 ) {
int difference = originalIndex - newCigar.getCigarElement(0).getLength();
VariantContext newVC = new VariantContextBuilder(vc).start(vc.getStart()-difference).stop(vc.getEnd()-difference).make();
//System.out.println("Moving record from " + vc.getChr()+":"+vc.getStart() + " to " + vc.getChr()+":"+(vc.getStart()-difference));
final int indelIndex = originalIndex-difference;
final byte[] newBases = new byte[indelLength + 1];
@ -323,7 +341,7 @@ public class LeftAlignAndTrimVariants extends RodWalker<Integer, Integer> {
* @param ref Ref bases
* @param indexOfRef Index in ref where to create indel
* @param indelLength Indel length
* @return
* @return Haplotype, containing the reference and the indel
*/
@Requires({"vc != null","ref != null", "indexOfRef +indelLength < ref.length", "vc.getNAlleles() == 2"})
@Ensures("result != null")

View File

@ -31,6 +31,8 @@ import htsjdk.tribble.TribbleException;
import htsjdk.tribble.util.popgen.HardyWeinbergCalculation;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.*;
@ -558,6 +560,12 @@ public class GATKVariantContextUtils {
*/
SET_TO_NO_CALL,
/**
* set all of the genotype GT values to NO_CALL and remove annotations
*/
SET_TO_NO_CALL_NO_ANNOTATIONS,
/**
* Use the subsetted PLs to greedily assigned genotypes
*/
@ -589,7 +597,7 @@ public class GATKVariantContextUtils {
}
/**
* subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately)
* Subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately)
*
* @param vc variant context with genotype likelihoods
* @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC ***
@ -601,6 +609,7 @@ public class GATKVariantContextUtils {
final GenotypeAssignmentMethod assignGenotypes) {
if ( vc == null ) throw new IllegalArgumentException("the VariantContext cannot be null");
if ( allelesToUse == null ) throw new IllegalArgumentException("the alleles to use cannot be null");
if ( allelesToUse.isEmpty() ) throw new IllegalArgumentException("must have alleles to use");
if ( allelesToUse.get(0).isNonReference() ) throw new IllegalArgumentException("First allele must be the reference allele");
if ( allelesToUse.size() == 1 ) throw new IllegalArgumentException("Cannot subset to only 1 alt allele");
@ -896,6 +905,10 @@ public class GATKVariantContextUtils {
final GenotypeAssignmentMethod assignmentMethod,
final double[] newLikelihoods,
final List<Allele> allelesToUse) {
if ( originalGT == null ) throw new IllegalArgumentException("originalGT cannot be null");
if ( gb == null ) throw new IllegalArgumentException("gb cannot be null");
if ( allelesToUse.isEmpty() || allelesToUse == null ) throw new IllegalArgumentException("allelesToUse cannot be empty or null");
switch ( assignmentMethod ) {
case DO_NOT_ASSIGN_GENOTYPES:
break;
@ -903,6 +916,13 @@ public class GATKVariantContextUtils {
gb.alleles(NO_CALL_ALLELES);
gb.noGQ();
break;
case SET_TO_NO_CALL_NO_ANNOTATIONS:
gb.alleles(NO_CALL_ALLELES);
gb.noGQ();
gb.noAD();
gb.noPL();
gb.noAttributes();
break;
case USE_PLS_TO_ASSIGN:
if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) {
// if there is no mass on the (new) likelihoods, then just no-call the sample
@ -918,12 +938,10 @@ public class GATKVariantContextUtils {
break;
case BEST_MATCH_TO_ORIGINAL:
final List<Allele> best = new LinkedList<>();
final Allele ref = allelesToUse.get(0); // WARNING -- should be checked in input argument
final Allele ref = allelesToUse.get(0);
for ( final Allele originalAllele : originalGT ) {
best.add(allelesToUse.contains(originalAllele) ? originalAllele : ref);
}
gb.noGQ();
gb.noPL();
gb.alleles(best);
break;
}
@ -970,7 +988,7 @@ public class GATKVariantContextUtils {
/**
* Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs
*
* @param vc variant context with genotype likelihoods
* @param vc variant context with genotype likelihoods
* @return genotypes context
*/
public static GenotypesContext assignDiploidGenotypes(final VariantContext vc) {
@ -999,7 +1017,6 @@ public class GATKVariantContextUtils {
* Split variant context into its biallelic components if there are more than 2 alleles
*
* For VC has A/B/C alleles, returns A/B and A/C contexts.
* Genotypes are all no-calls now (it's not possible to fix them easily)
* Alleles are right trimmed to satisfy VCF conventions
*
* If vc is biallelic or non-variant it is just returned
@ -1008,21 +1025,63 @@ public class GATKVariantContextUtils {
*
* @param vc a potentially multi-allelic variant context
* @param trimLeft if true, we will also left trim alleles, potentially moving the resulting vcs forward on the genome
* @param genotypeAssignmentMethod assignment strategy for the (subsetted) PLs
* @return a list of bi-allelic (or monomorphic) variant context
*/
public static List<VariantContext> splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod) {
public static List<VariantContext> splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod){
return splitVariantContextToBiallelics(vc, trimLeft,genotypeAssignmentMethod, false);
}
/**
* Split variant context into its biallelic components if there are more than 2 alleles
*
* For VC has A/B/C alleles, returns A/B and A/C contexts.
* Alleles are right trimmed to satisfy VCF conventions
*
* If vc is biallelic or non-variant it is just returned
*
* Chromosome counts are updated (but they are by definition 0)
*
* @param vc a potentially multi-allelic variant context
* @param trimLeft if true, we will also left trim alleles, potentially moving the resulting vcs forward on the genome
* @param genotypeAssignmentMethod assignment strategy for the (subsetted) PLs
* @param keepOriginalChrCounts keep the orignal chromosome counts before subsetting
* @return a list of bi-allelic (or monomorphic) variant context
*/
public static List<VariantContext> splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod,
final boolean keepOriginalChrCounts) {
if ( vc == null ) throw new IllegalArgumentException("vc cannot be null");
if ( ! vc.isVariant() || vc.isBiallelic() )
// non variant or biallelics already satisfy the contract
return Collections.singletonList(vc);
else {
final List<VariantContext> biallelics = new LinkedList<>();
// if any of the genotypes ar ehet-not-ref (i.e. 1/2), set all of them to no-call
final GenotypeAssignmentMethod genotypeAssignmentMethodUsed = hasHetNonRef(vc.getGenotypes()) ? GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL_NO_ANNOTATIONS : genotypeAssignmentMethod;
for ( final Allele alt : vc.getAlternateAlleles() ) {
VariantContextBuilder builder = new VariantContextBuilder(vc);
final VariantContextBuilder builder = new VariantContextBuilder(vc);
// make biallelic alleles
final List<Allele> alleles = Arrays.asList(vc.getReference(), alt);
builder.alleles(alleles);
builder.genotypes(subsetDiploidAlleles(vc, alleles, genotypeAssignmentMethod));
VariantContextUtils.calculateChromosomeCounts(builder, true);
// since the VC has been subset, remove the invalid attributes
for ( final String key : vc.getAttributes().keySet() ) {
if ( !(key.equals(VCFConstants.ALLELE_COUNT_KEY) || key.equals(VCFConstants.ALLELE_FREQUENCY_KEY) || key.equals(VCFConstants.ALLELE_NUMBER_KEY)) ||
genotypeAssignmentMethodUsed == GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL_NO_ANNOTATIONS ) {
builder.rmAttribute(key);
}
}
// subset INFO field annotations if available if genotype is called
if ( genotypeAssignmentMethodUsed != GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL_NO_ANNOTATIONS &&
genotypeAssignmentMethodUsed != GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL )
addInfoFiledAnnotations(vc, builder, alt, keepOriginalChrCounts);
builder.genotypes(subsetDiploidAlleles(vc, alleles, genotypeAssignmentMethodUsed));
final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true);
biallelics.add(trimmed);
}
@ -1031,6 +1090,21 @@ public class GATKVariantContextUtils {
}
}
/**
* Check if any of the genotypes is heterozygous, non-reference (i.e. 1/2)
*
* @param genotypesContext genotype information
* @return true if any of the genotypes are heterozygous, non-reference, false otherwise
*/
private static boolean hasHetNonRef(final GenotypesContext genotypesContext ){
for (final Genotype gt : genotypesContext ) {
if (gt.isHetNonRef())
return true;
}
return false;
}
public static Genotype removePLsAndAD(final Genotype g) {
return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g;
}
@ -1336,7 +1410,7 @@ public class GATKVariantContextUtils {
*
* @param originalGs the original GenotypesContext
* @param originalVC the original VariantContext
* @param allelesToUse the new (sub)set of alleles to use
* @param allelesToUse the new subset of alleles to use
* @return a new non-null GenotypesContext
*/
static private GenotypesContext fixDiploidGenotypesFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List<Allele> allelesToUse) {
@ -2121,4 +2195,91 @@ public class GATKVariantContextUtils {
return -1;
}
/**
* Add the VCF INFO field annotations for the used alleles
*
* @param vc original variant context
* @param builder variant context builder with subset of original variant context's alleles
* @param altAllele alternate allele
* @param keepOriginalChrCounts keep the orignal chromosome counts before subsetting
* @return variant context builder with updated INFO field attribute values
*/
private static void addInfoFiledAnnotations(final VariantContext vc, final VariantContextBuilder builder, final Allele altAllele,
final boolean keepOriginalChrCounts){
if (vc == null) throw new IllegalArgumentException("the variant context cannot be null");
if (builder == null) throw new IllegalArgumentException("the variant context builder cannot be null");
if (builder.getAlleles() == null) throw new IllegalArgumentException("the variant context builder alleles cannot be null");
final List<Allele> alleles = builder.getAlleles();
if (alleles.size() < 2) throw new IllegalArgumentException("the variant context builder must contain at least 2 alleles");
// don't have to subset, the original vc has the same number and hence, the same alleles
boolean keepOriginal = ( vc.getAlleles().size() == builder.getAlleles().size() );
if ( keepOriginalChrCounts ) {
if (vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY))
builder.attribute(GATKVCFConstants.ORIGINAL_AC_KEY, keepOriginal ?
vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY) : getAltAlleleInfoFieldValue(VCFConstants.ALLELE_COUNT_KEY, vc, altAllele));
if (vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY))
builder.attribute(GATKVCFConstants.ORIGINAL_AF_KEY, keepOriginal ?
vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) : getAltAlleleInfoFieldValue(VCFConstants.ALLELE_FREQUENCY_KEY, vc, altAllele));
if (vc.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY)) {
builder.attribute(GATKVCFConstants.ORIGINAL_AN_KEY, vc.getAttribute(VCFConstants.ALLELE_NUMBER_KEY));
}
}
VariantContextUtils.calculateChromosomeCounts(builder, true);
}
/**
* Get the alternate allele INFO field value
*
* @param infoFieldName VCF INFO field name
* @param vc variant context
* @param altAllele the alternate allele
* @return alternate allele INFO field value
* @throws ReviewedGATKException if the alternate allele is part of the variant context
*/
private static Object getAltAlleleInfoFieldValue(final String infoFieldName, final VariantContext vc, final Allele altAllele) {
//final String[] splitOriginalField = vc.getAttribute(infoFieldName).toString().split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR);
final Object[] splitOriginalField = getVAttributeValues(vc.getAttribute(infoFieldName));
// subset the field
final boolean[] alleleIndexesToUse = getAlleleIndexBitset(vc, Arrays.asList(altAllele));
// skip the first allele, which is the reference
for (int i = 1; i < alleleIndexesToUse.length; i++) {
if (alleleIndexesToUse[i] == true)
return splitOriginalField[i-1];
}
throw new ReviewedGATKException("Alternate allele " + altAllele.toString() + " not in Variant Context " + vc.toString());
}
/**
* Pulls out the appropriate values for the INFO field attribute
*
* @param attribute INFO field attribute
* @return tokenized attribute values
*/
private static Object[] getVAttributeValues(final Object attribute) {
if (attribute == null) throw new IllegalArgumentException("the attribute cannot be null");
// break the original attributes into separate tokens
final Object[] tokens;
if ( attribute.getClass().isArray() )
tokens = (Object[])attribute;
else if ( List.class.isAssignableFrom(attribute.getClass()) )
tokens = ((List)attribute).toArray();
else
tokens = attribute.toString().split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR);
return tokens;
}
}