/* * Copyright (c) 2012 The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/
package org.broadinstitute.variant.variantcontext;

import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlEngine;
import org.apache.commons.lang.ArrayUtils;
import org.broad.tribble.TribbleException;
import org.broad.tribble.util.popgen.HardyWeinbergCalculation;
import org.broadinstitute.variant.utils.BaseUtils;
import org.broadinstitute.variant.utils.GeneralUtils;
import org.broadinstitute.variant.utils.Pair;
import org.broadinstitute.variant.vcf.*;

import java.io.Serializable;
import java.util.*;

/**
 * Static utility routines for working with {@link VariantContext} objects:
 * chromosome-count (AN/AC/AF) annotation, tandem-repeat detection, allele
 * subsetting and trimming, JEXL expression matching, and VC merging.
 */
public class VariantContextUtils {
    // Values written into the merge "set" attribute to describe record provenance
    public final static String MERGE_INTERSECTION = "Intersection";
    public final static String MERGE_FILTER_IN_ALL = "FilteredInAll";
    public final static String MERGE_REF_IN_ALL = "ReferenceInAll";
    public final static String MERGE_FILTER_PREFIX = "filterIn";

    // Default number of chromosomes (ploidy) assumed when computing genotype likelihood counts
    public static final int DEFAULT_PLOIDY = 2;
    public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.

    // Fields already warned about as missing from the VCF header, so each is warned about at most once
    private static Set MISSING_KEYS_WARNED_ABOUT = new HashSet();
    // Canonical diploid no-call genotype alleles (./.)
    private static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);

    // Shared JEXL engine used to parse and evaluate match expressions (see initializeMatchExps/match)
    final public static JexlEngine engine = new JexlEngine();
    private final static boolean ASSUME_MISSING_FIELDS_ARE_STRINGS = false;

    static {
        engine.setSilent(false); // will throw errors now for selects that don't evaluate properly
        engine.setLenient(false);
        engine.setDebug(false);
    }

    /**
     * Update the attributes of the attributes map given the VariantContext to reflect the
     * proper chromosome-based VCF tags
     *
     * @param vc                the VariantContext
     * @param attributes        the attributes map to populate; must not be null; may contain old values
     * @param removeStaleValues should we remove stale values from the mapping?
     * @return the attributes map provided as input, returned for programming convenience
     */
    public static Map calculateChromosomeCounts(VariantContext vc, Map attributes, boolean removeStaleValues) {
        // Empty founder set: counts are computed over all samples as unrelated individuals
        return calculateChromosomeCounts(vc, attributes, removeStaleValues, new HashSet(0));
    }

    /**
     * Update the attributes of the attributes map given the VariantContext to reflect the
     * proper chromosome-based VCF tags
     *
     * @param vc                the VariantContext
     * @param attributes        the attributes map to populate; must not be null; may contain old values
     * @param removeStaleValues should we remove stale values from the mapping?
     * @param founderIds        Set of founders Ids to take into account. AF and AC will be calculated over the founders.
     *                          If empty or null, counts are generated for all samples as unrelated individuals
     * @return the attributes map provided as input, returned for programming convenience
     */
    public static Map calculateChromosomeCounts(VariantContext vc, Map attributes, boolean removeStaleValues, final Set founderIds) {
        final int AN = vc.getCalledChrCount();

        // if everyone is a no-call, remove the old attributes if requested
        if ( AN == 0 && removeStaleValues ) {
            if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) )
                attributes.remove(VCFConstants.ALLELE_COUNT_KEY);
            if ( attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY) )
                attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY);
            if ( attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY) )
                attributes.remove(VCFConstants.ALLELE_NUMBER_KEY);
            return attributes;
        }

        if ( vc.hasGenotypes() ) {
            attributes.put(VCFConstants.ALLELE_NUMBER_KEY, AN);

            // if there are alternate alleles, record the relevant tags
            if ( vc.getAlternateAlleles().size() > 0 ) {
                ArrayList alleleFreqs = new ArrayList();
                ArrayList alleleCounts = new ArrayList();
                ArrayList foundersAlleleCounts = new ArrayList();
                // NOTE(review): AF below is computed over founder chromosomes only, while AC counts all samples
                double totalFoundersChromosomes = (double)vc.getCalledChrCount(founderIds);
                int foundersAltChromosomes;
                for ( Allele allele : vc.getAlternateAlleles() ) {
                    foundersAltChromosomes = vc.getCalledChrCount(allele,founderIds);
                    alleleCounts.add(vc.getCalledChrCount(allele));
                    foundersAlleleCounts.add(foundersAltChromosomes);
                    if ( AN == 0 ) {
                        alleleFreqs.add(0.0);
                    } else {
                        final Double freq = (double)foundersAltChromosomes / totalFoundersChromosomes;
                        alleleFreqs.add(freq);
                    }
                }

                // VCF convention: single value written as a scalar, multiple values as a list
                attributes.put(VCFConstants.ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts);
                attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs);
            } else {
                // if there's no alt, AC and AF shouldn't be present
                attributes.remove(VCFConstants.ALLELE_COUNT_KEY);
                attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY);
            }
        }

        return attributes;
    }

    /**
     * Update the attributes of the attributes map in the VariantContextBuilder to reflect the proper
     * chromosome-based VCF tags based on the current VC produced by builder.make()
     *
     * @param builder           the VariantContextBuilder we are updating
     * @param removeStaleValues should we remove stale values from the mapping?
     */
    public static void calculateChromosomeCounts(VariantContextBuilder builder, boolean removeStaleValues) {
        VariantContext vc = builder.make();
        builder.attributes(calculateChromosomeCounts(vc, new HashMap(vc.getAttributes()), removeStaleValues, new HashSet(0)));
    }

    /**
     * Update the attributes of the attributes map in the VariantContextBuilder to reflect the proper
     * chromosome-based VCF tags based on the current VC produced by builder.make()
     *
     * @param builder           the VariantContextBuilder we are updating
     * @param founderIds        Set of founders to take into account. AF and AC will be calculated over the founders only.
     *                          If empty or null, counts are generated for all samples as unrelated individuals
     * @param removeStaleValues should we remove stale values from the mapping?
     */
    public static void calculateChromosomeCounts(VariantContextBuilder builder, boolean removeStaleValues, final Set founderIds) {
        VariantContext vc = builder.make();
        builder.attributes(calculateChromosomeCounts(vc, new HashMap(vc.getAttributes()), removeStaleValues, founderIds));
    }

    /**
     * Returns a copy of g with its PL and AD fields removed, or g itself (no copy)
     * when it carries neither field.
     *
     * @param g the genotype to strip
     * @return genotype without PLs or AD
     */
    public static Genotype removePLsAndAD(final Genotype g) {
        return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g;
    }

    /**
     * Looks up the compound header line for field, preferring FORMAT lines over INFO lines.
     *
     * @param header the VCF header to search
     * @param field  the field (key) whose header line is wanted
     * @return the matching header line, never null
     * @throws TribbleException when no header line exists and ASSUME_MISSING_FIELDS_ARE_STRINGS is false
     */
    public final static VCFCompoundHeaderLine getMetaDataForField(final VCFHeader header, final String field) {
        VCFCompoundHeaderLine metaData = header.getFormatHeaderLine(field);
        if ( metaData == null ) metaData = header.getInfoHeaderLine(field);
        if ( metaData == null ) {
            if ( ASSUME_MISSING_FIELDS_ARE_STRINGS ) {
                // warn at most once per missing field before synthesizing an unbounded String INFO line
                if ( ! MISSING_KEYS_WARNED_ABOUT.contains(field) ) {
                    MISSING_KEYS_WARNED_ABOUT.add(field);
                    if ( GeneralUtils.DEBUG_MODE_ENABLED )
                        System.err.println("Field " + field + " missing from VCF header, assuming it is an unbounded string type");
                }
                return new VCFInfoHeaderLine(field, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Auto-generated string header for " + field);
            }
            else throw new TribbleException("Fully decoding VariantContext requires header line for all fields, but none was found for " + field);
        }
        return metaData;
    }

    /**
     * Returns true iff VC is an non-complex indel where every allele represents an expansion or
     * contraction of a series of identical bases in the reference.
     *
     * For example, suppose the ref bases are CTCTCTGA, which includes a 3x repeat of CTCTCT
     *
     * If VC = -/CT, then this function returns true because the CT insertion matches exactly the
     * upcoming reference.
     * If VC = -/CTA then this function returns false because the CTA isn't a perfect match
     *
     * Now consider deletions:
     *
     * If VC = CT/- then again the same logic applies and this returns true
     * The case of CTA/- makes no sense because it doesn't actually match the reference bases.
     *
     * The logic of this function is pretty simple.  Take all of the non-null alleles in VC.  For
     * each insertion allele of n bases, check if that allele matches the next n reference bases.
     * For each deletion allele of n bases, check if this matches the reference bases at n - 2 n,
     * as it must necessarily match the first n bases.  If this test returns true for all
     * alleles you are a tandem repeat, otherwise you are not.
     *
     * @param vc the variant context to test
     * @param refBasesStartingAtVCWithPad note this is assumed to include the PADDED reference
     * @return true iff every alt allele of this indel is a tandem expansion/contraction of the reference
     */
    @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"})
    public static boolean isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) {
        // drop the single padding base so the context starts immediately after the event
        final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1);
        if ( ! vc.isIndel() ) // only indels are tandem repeats
            return false;

        final Allele ref = vc.getReference();
        for ( final Allele allele : vc.getAlternateAlleles() ) {
            if ( ! isRepeatAllele(ref, allele, refBasesStartingAtVCWithoutPad) )
                return false;
        }

        // we've passed all of the tests, so we are a repeat
        return true;
    }

    /**
     * Computes the tandem-repeat unit and the per-allele repetition counts for an indel VC.
     *
     * @param vc the variant context (must be an indel for a non-null result)
     * @param refBasesStartingAtVCWithPad reference bases starting at the VC, including the padding base
     * @return a pair of (repetition counts: the ref allele's count first, then one per alt allele;
     *         repeat unit bases), or null when vc is not an indel or any allele is not a tandem
     *         expansion of the reference context
     */
    // NOTE(review): the return type's generic parameters appear garbled in this source
    // (likely Pair<List<Integer>, byte[]> originally) — confirm against the upstream file
    @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"})
    public static Pair,byte[]> getNumTandemRepeatUnits(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) {
        final boolean VERBOSE = false;
        final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1);
        if ( ! vc.isIndel() ) // only indels are tandem repeats
            return null;

        final Allele refAllele = vc.getReference();
        // strip the padding base from the ref allele as well
        final byte[] refAlleleBases = Arrays.copyOfRange(refAllele.getBases(), 1, refAllele.length());

        byte[] repeatUnit = null;
        final ArrayList lengths = new ArrayList();

        for ( final Allele allele : vc.getAlternateAlleles() ) {
            Pair result = getNumTandemRepeatUnits(refAlleleBases, Arrays.copyOfRange(allele.getBases(), 1, allele.length()), refBasesStartingAtVCWithoutPad.getBytes());

            final int[] repetitionCount = result.first;
            // repetition count = 0 means allele is not a tandem expansion of context
            if (repetitionCount[0] == 0 || repetitionCount[1] == 0)
                return null;

            if (lengths.size() == 0) {
                lengths.add(repetitionCount[0]); // add ref allele length only once
            }
            lengths.add(repetitionCount[1]);  // add this alt allele's length

            repeatUnit = result.second;
            if (VERBOSE) {
                System.out.println("RefContext:"+refBasesStartingAtVCWithoutPad);
                System.out.println("Ref:"+refAllele.toString()+" Count:" + String.valueOf(repetitionCount[0]));
                System.out.println("Allele:"+allele.toString()+" Count:" + String.valueOf(repetitionCount[1]));
                System.out.println("RU:"+new String(repeatUnit));
            }
        }

        // NOTE(review): constructor generics garbled here too — likely new Pair<List<Integer>, byte[]>
        return new Pair, byte[]>(lengths,repeatUnit);
    }

    /**
     * Core repeat-counting routine operating on raw (unpadded) allele bases.
     *
     * @param refBases            ref allele bases without the padding base
     * @param altBases            alt allele bases without the padding base
     * @param remainingRefContext reference bases following the event
     * @return pair of (repetition counts for {ref, alt} beyond those within the ref allele itself; repeat unit)
     */
    public static Pair getNumTandemRepeatUnits(final byte[] refBases, final byte[] altBases, final byte[] remainingRefContext) {
        /* we can't exactly apply same logic as in basesAreRepeated() to compute tandem unit and number of repeated units.
           Consider case where ref =ATATAT and we have an insertion of ATAT. Natural description is (AT)3 -> (AT)2.
         */

        byte[] longB;
        // find first repeat unit based on either ref or alt, whichever is longer
        if (altBases.length > refBases.length)
            longB = altBases;
        else
            longB = refBases;

        // see if non-null allele (either ref or alt, whichever is longer) can be decomposed into several identical tandem units
        // for example, -*,CACA needs to first be decomposed into (CA)2
        final int repeatUnitLength = findRepeatedSubstring(longB);
        final byte[] repeatUnit = Arrays.copyOf(longB, repeatUnitLength);

        final int[] repetitionCount = new int[2];
        // look for repetitions forward on the ref bases (i.e. starting at beginning of ref bases)
        int repetitionsInRef = findNumberofRepetitions(repeatUnit,refBases, true);
        // the counts below exclude the repetitions already present in the ref allele itself
        repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef;
        repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef;

        return new Pair(repetitionCount, repeatUnit);
    }

    /**
     * Find out if a string can be represented as a tandem number of substrings.
     * For example ACTACT is a 2-tandem of ACT,
     * but ACTACA is not.
     *
     * @param bases String to be tested
     * @return Length of repeat unit, if string can be represented as tandem of substring (if it can't
     *         be represented as one, it will be just the length of the input string)
     */
    public static int findRepeatedSubstring(byte[] bases) {

        int repLength;
        // try each candidate unit length from shortest to longest; first full tiling wins
        for (repLength=1; repLength <=bases.length; repLength++) {
            final byte[] candidateRepeatUnit = Arrays.copyOf(bases,repLength);
            boolean allBasesMatch = true;
            for (int start = repLength; start < bases.length; start += repLength ) {
                // check that remaining of string is exactly equal to repeat unit
                final byte[] basePiece = Arrays.copyOfRange(bases,start,start+candidateRepeatUnit.length);
                if (!Arrays.equals(candidateRepeatUnit, basePiece)) {
                    allBasesMatch = false;
                    break;
                }
            }
            if (allBasesMatch)
                return repLength;
        }

        return repLength;
    }

    /**
     * Helper routine that finds number of repetitions a string consists of.
     * For example, for string ATAT and repeat unit AT, number of repetitions = 2
     * @param repeatUnit  Substring
     * @param testString  String to test
     * @param lookForward Look for repetitions forward (at beginning of string) or backward (at end of string)
     * @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's)
     */
    public static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString, boolean lookForward) {
        int numRepeats = 0;
        if (lookForward) {
            // look forward on the test string
            for (int start = 0; start < testString.length; start += repeatUnit.length) {
                int end = start + repeatUnit.length;
                byte[] unit = Arrays.copyOfRange(testString,start, end);
                if(Arrays.equals(unit,repeatUnit))
                    numRepeats++;
                else
                    break;
            }
            return numRepeats;
        }

        // look backward. For example, if repeatUnit = AT and testString = GATAT, number of repeat units is still 2
        for (int start = testString.length - repeatUnit.length; start >= 0; start -= repeatUnit.length) {
            int end = start + repeatUnit.length;
            byte[] unit = Arrays.copyOfRange(testString,start, end);
            if(Arrays.equals(unit,repeatUnit))
                numRepeats++;
            else
                break;
        }
        return numRepeats;
    }

    /**
     * Helper function for isTandemRepeat that checks that allele matches somewhere on the reference
     * @param ref the reference allele
     * @param alt the alternate allele
     * @param refBasesStartingAtVCWithoutPad reference context immediately after the event, unpadded
     * @return true if the inserted/deleted bases repeat within the upcoming reference
     */
    protected static boolean isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad) {
        if ( ! Allele.oneIsPrefixOfOther(ref, alt) )
            return false; // we require one allele be a prefix of another

        if ( ref.length() > alt.length() ) { // we are a deletion
            // a deletion must match the reference twice (the deleted copy plus the retained one)
            return basesAreRepeated(ref.getBaseString(), alt.getBaseString(), refBasesStartingAtVCWithoutPad, 2);
        } else { // we are an insertion
            return basesAreRepeated(alt.getBaseString(), ref.getBaseString(), refBasesStartingAtVCWithoutPad, 1);
        }
    }

    /**
     * Tests whether the suffix of l beyond s (the inserted/deleted bases) occurs
     * minNumberOfMatches times consecutively at the start of ref.
     */
    protected static boolean basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches) {
        final String potentialRepeat = l.substring(s.length()); // skip s bases

        for ( int i = 0; i < minNumberOfMatches; i++) {
            final int start = i * potentialRepeat.length();
            final int end = (i+1) * potentialRepeat.length();
            if ( ref.length() < end )
                return false; // we ran out of bases to test
            final String refSub = ref.substring(start, end);
            if ( ! refSub.equals(potentialRepeat) )
                return false; // repeat didn't match, fail
        }

        return true; // we passed all tests, we matched
    }

    /**
     * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs
     *
     * @param vc variant context with genotype likelihoods
     * @return genotypes context
     */
    public static GenotypesContext assignDiploidGenotypes(final VariantContext vc) {
        return subsetDiploidAlleles(vc, vc.getAlleles(), true);
    }

    /**
     * Split variant context into its biallelic components if there are more than 2 alleles
     *
     * For VC has A/B/C alleles, returns A/B and A/C contexts.
     * Genotypes are all no-calls now (it's not possible to fix them easily)
     * Alleles are right trimmed to satisfy VCF conventions
     *
     * If vc is biallelic or non-variant it is just returned
     *
     * Chromosome counts are updated (but they are by definition 0)
     *
     * @param vc a potentially multi-allelic variant context
     * @return a list of bi-allelic (or monomorphic) variant context
     */
    public static List splitVariantContextToBiallelics(final VariantContext vc) {
        if ( ! vc.isVariant() || vc.isBiallelic() )
            // non variant or biallelics already satisfy the contract
            return Collections.singletonList(vc);
        else {
            final List biallelics = new LinkedList();

            // one ref/alt context per alternate allele
            for ( final Allele alt : vc.getAlternateAlleles() ) {
                VariantContextBuilder builder = new VariantContextBuilder(vc);
                final List alleles = Arrays.asList(vc.getReference(), alt);
                builder.alleles(alleles);
                builder.genotypes(subsetDiploidAlleles(vc, alleles, false));
                calculateChromosomeCounts(builder, true);
                biallelics.add(reverseTrimAlleles(builder.make()));
            }

            return biallelics;
        }
    }

    /**
     * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately)
     *
     * @param vc variant context with genotype likelihoods
     * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC ***
     * @param assignGenotypes true if we should update the genotypes based on the (subsetted) PLs
     * @return genotypes
     */
    public static GenotypesContext subsetDiploidAlleles(final VariantContext vc,
                                                        final List allelesToUse,
                                                        final boolean assignGenotypes) {

        // the genotypes with PLs
        final GenotypesContext oldGTs = vc.getGenotypes();

        // samples
        final List sampleIndices = oldGTs.getSampleNamesOrderedByName();

        // the new genotypes to create
        final GenotypesContext newGTs = GenotypesContext.create();

        // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward
        final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
        final int numNewAltAlleles = allelesToUse.size() - 1;

        // which PLs should be carried forward?
        ArrayList likelihoodIndexesToUse = null;

        // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles,
        // then we can keep the PLs as is; otherwise, we determine which ones to keep
        if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) {
            likelihoodIndexesToUse = new ArrayList(30);

            final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles];
            for ( int i = 0; i < numOriginalAltAlleles; i++ ) {
                if ( allelesToUse.contains(vc.getAlternateAllele(i)) )
                    altAlleleIndexToUse[i] = true;
            }

            // numLikelihoods takes total # of alleles. Use default # of chromosomes (ploidy) = 2
            final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(1 + numOriginalAltAlleles, DEFAULT_PLOIDY);
            for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) {
                final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
                // consider this entry only if both of the alleles are good
                if ( (alleles.alleleIndex1 == 0 || altAlleleIndexToUse[alleles.alleleIndex1 - 1]) && (alleles.alleleIndex2 == 0 || altAlleleIndexToUse[alleles.alleleIndex2 - 1]) )
                    likelihoodIndexesToUse.add(PLindex);
            }
        }

        // create the new genotypes
        for ( int k = 0; k < oldGTs.size(); k++ ) {
            final Genotype g = oldGTs.get(sampleIndices.get(k));
            if ( !g.hasLikelihoods() ) {
                // no PLs at all: emit a diploid no-call
                newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
                continue;
            }

            // create the new likelihoods array from the alleles we are allowed to use
            final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
            double[] newLikelihoods;
            if ( likelihoodIndexesToUse == null ) {
                newLikelihoods = originalLikelihoods;
            } else {
                newLikelihoods = new double[likelihoodIndexesToUse.size()];
                int newIndex = 0;
                for ( int oldIndex : likelihoodIndexesToUse )
                    newLikelihoods[newIndex++] = originalLikelihoods[oldIndex];

                // might need to re-normalize
                newLikelihoods = GeneralUtils.normalizeFromLog10(newLikelihoods, false, true);
            }

            // if there is no mass on the (new) likelihoods, then just no-call the sample
            if ( GeneralUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) {
                newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
            }
            else {
                final GenotypeBuilder gb = new GenotypeBuilder(g);

                if ( numNewAltAlleles == 0 )
                    gb.noPL();
                else
                    gb.PL(newLikelihoods);

                // if we weren't asked to assign a genotype, then just no-call the sample
                if ( !assignGenotypes || GeneralUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) {
                    gb.alleles(NO_CALL_ALLELES);
                } else {
                    // find the genotype with maximum likelihoods
                    int PLindex = numNewAltAlleles == 0 ? 0 : GeneralUtils.maxElementIndex(newLikelihoods);
                    GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
                    gb.alleles(Arrays.asList(allelesToUse.get(alleles.alleleIndex1), allelesToUse.get(alleles.alleleIndex2)));
                    if ( numNewAltAlleles != 0 )
                        gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods));
                }
                newGTs.add(gb.make());
            }
        }

        return newGTs;
    }

    /**
     * Trims common reference bases from the right (end) of every allele, adjusting the stop
     * position and rewriting the genotypes onto the trimmed alleles.
     *
     * @param inputVC the variant context to trim
     * @return a right-trimmed copy, or inputVC itself when no trimming applies
     */
    public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) {
        // see whether we need to trim common reference base from all alleles
        final int trimExtent = computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, false);
        if ( trimExtent <= 0 || inputVC.getAlleles().size() <= 1 )
            return inputVC;

        final List alleles = new ArrayList();
        final GenotypesContext genotypes = GenotypesContext.create();
        final Map originalToTrimmedAlleleMap = new HashMap();

        for (final Allele a : inputVC.getAlleles()) {
            if (a.isSymbolic()) {
                // symbolic alleles have no bases to trim
                alleles.add(a);
                originalToTrimmedAlleleMap.put(a, a);
            } else {
                // get bases for current allele and create a new one with trimmed bases
                final byte[] newBases = Arrays.copyOfRange(a.getBases(), 0, a.length()-trimExtent);
                final Allele trimmedAllele = Allele.create(newBases, a.isReference());
                alleles.add(trimmedAllele);
                originalToTrimmedAlleleMap.put(a, trimmedAllele);
            }
        }

        // now we can recreate new genotypes with trimmed alleles
        for ( final Genotype genotype : inputVC.getGenotypes() ) {
            final List originalAlleles = genotype.getAlleles();
            final List trimmedAlleles = new ArrayList();
            for ( final Allele a : originalAlleles ) {
                if ( a.isCalled() )
                    trimmedAlleles.add(originalToTrimmedAlleleMap.get(a));
                else
                    trimmedAlleles.add(Allele.NO_CALL);
            }
            genotypes.add(new GenotypeBuilder(genotype).alleles(trimmedAlleles).make());
        }

        // the stop position shrinks by the number of trimmed bases (alleles.get(0) is the trimmed reference)
        return new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length() - 1).alleles(alleles).genotypes(genotypes).make();
    }

    /**
     * Computes how many trailing bases are shared by all alleles and the reference and can be
     * clipped off the right end.
     *
     * @param unclippedAlleles alleles to examine (symbolic alleles are ignored)
     * @param ref              reference bases for the event
     * @param forwardClipping  number of bases already clipped from the front
     * @param allowFullClip    whether an allele may be clipped down to zero length
     * @return number of clippable bases, or -1 when the reference runs out and full clipping is disallowed
     */
    public static int computeReverseClipping(final List unclippedAlleles, final byte[] ref, final int forwardClipping, final boolean allowFullClip) {
        int clipping = 0;
        boolean stillClipping = true;

        while ( stillClipping ) {
            for ( final Allele a : unclippedAlleles ) {
                if ( a.isSymbolic() )
                    continue;

                // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong
                // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine).
                if ( a.length() - clipping == 0 )
                    return clipping - (allowFullClip ? 0 : 1);

                if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) {
                    stillClipping = false;
                }
                else if ( ref.length == clipping ) {
                    if ( allowFullClip )
                        stillClipping = false;
                    else
                        return -1;
                }
                else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) {
                    stillClipping = false;
                }
            }
            if ( stillClipping )
                clipping++;
        }

        return clipping;
    }

    /**
     * A simple but common wrapper for matching VariantContext objects using JEXL expressions
     */
    public static class JexlVCMatchExp {
        public String name;
        public Expression exp;

        /**
         * Create a new matcher expression with name and JEXL expression exp
         * @param name name
         * @param exp  expression
         */
        public JexlVCMatchExp(String name, Expression exp) {
            this.name = name;
            this.exp = exp;
        }
    }

    /**
     * Method for creating JexlVCMatchExp from input walker arguments names and exps.  These two arrays contain
     * the name associated with each JEXL expression. initializeMatchExps will parse each expression and return
     * a list of JexlVCMatchExp, in order, that correspond to the names and exps.  These are suitable input to
     * match() below.
* * @param names names * @param exps expressions * @return list of matches */ public static List initializeMatchExps(String[] names, String[] exps) { if ( names == null || exps == null ) throw new IllegalArgumentException("BUG: neither names nor exps can be null: names " + Arrays.toString(names) + " exps=" + Arrays.toString(exps) ); if ( names.length != exps.length ) throw new IllegalArgumentException("Inconsistent number of provided filter names and expressions: names=" + Arrays.toString(names) + " exps=" + Arrays.toString(exps)); Map map = new HashMap(); for ( int i = 0; i < names.length; i++ ) { map.put(names[i], exps[i]); } return VariantContextUtils.initializeMatchExps(map); } public static List initializeMatchExps(ArrayList names, ArrayList exps) { String[] nameArray = new String[names.size()]; String[] expArray = new String[exps.size()]; return initializeMatchExps(names.toArray(nameArray), exps.toArray(expArray)); } /** * Method for creating JexlVCMatchExp from input walker arguments mapping from names to exps. These two arrays contain * the name associated with each JEXL expression. initializeMatchExps will parse each expression and return * a list of JexlVCMatchExp, in order, that correspond to the names and exps. These are suitable input to * match() below. * * @param names_and_exps mapping of names to expressions * @return list of matches */ public static List initializeMatchExps(Map names_and_exps) { List exps = new ArrayList(); for ( Map.Entry elt : names_and_exps.entrySet() ) { String name = elt.getKey(); String expStr = elt.getValue(); if ( name == null || expStr == null ) throw new IllegalArgumentException("Cannot create null expressions : " + name + " " + expStr); try { Expression exp = engine.createExpression(expStr); exps.add(new JexlVCMatchExp(name, exp)); } catch (Exception e) { throw new IllegalArgumentException("Argument " + name + "has a bad value. Invalid expression used (" + expStr + "). 
Please see the JEXL docs for correct syntax.") ; } } return exps; } /** * Returns true if exp match VC. See collection<> version for full docs. * @param vc variant context * @param exp expression * @return true if there is a match */ public static boolean match(VariantContext vc, JexlVCMatchExp exp) { return match(vc,Arrays.asList(exp)).get(exp); } /** * Matches each JexlVCMatchExp exp against the data contained in vc, and returns a map from these * expressions to true (if they matched) or false (if they didn't). This the best way to apply JEXL * expressions to VariantContext records. Use initializeMatchExps() to create the list of JexlVCMatchExp * expressions. * * @param vc variant context * @param exps expressions * @return true if there is a match */ public static Map match(VariantContext vc, Collection exps) { return new JEXLMap(exps,vc); } /** * Returns true if exp match VC/g. See collection<> version for full docs. * @param vc variant context * @param g genotype * @param exp expression * @return true if there is a match */ public static boolean match(VariantContext vc, Genotype g, JexlVCMatchExp exp) { return match(vc,g,Arrays.asList(exp)).get(exp); } /** * Matches each JexlVCMatchExp exp against the data contained in vc/g, and returns a map from these * expressions to true (if they matched) or false (if they didn't). This the best way to apply JEXL * expressions to VariantContext records/genotypes. Use initializeMatchExps() to create the list of JexlVCMatchExp * expressions. 
     *
     * @param vc   variant context
     * @param g    genotype
     * @param exps expressions
     * @return true if there is a match
     */
    public static Map match(VariantContext vc, Genotype g, Collection exps) {
        return new JEXLMap(exps,vc,g);
    }

    /**
     * Computes a Hardy-Weinberg p-value from vc's hom-ref / het / hom-var genotype counts.
     * Returns 0.0 when no chromosomes were called.
     */
    public static double computeHardyWeinbergPvalue(VariantContext vc) {
        if ( vc.getCalledChrCount() == 0 )
            return 0.0;
        return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount());
    }

    /**
     * Returns a newly allocated VC that is the same as VC, but without genotypes
     * @param vc variant context
     * @return new VC without genotypes
     */
    @Requires("vc != null")
    @Ensures("result != null")
    public static VariantContext sitesOnlyVariantContext(VariantContext vc) {
        return new VariantContextBuilder(vc).noGenotypes().make();
    }

    /**
     * Returns a newly allocated list of VC, where each VC is the same as the input VCs, but without genotypes
     * @param vcs collection of VCs
     * @return new VCs without genotypes
     */
    @Requires("vcs != null")
    @Ensures("result != null")
    public static Collection sitesOnlyVariantContexts(Collection vcs) {
        List r = new ArrayList();
        for ( VariantContext vc : vcs )
            r.add(sitesOnlyVariantContext(vc));
        return r;
    }

    // Copies into a fresh map only those attributes of igc whose keys appear in keysToPreserve
    private final static Map subsetAttributes(final CommonInfo igc, final Collection keysToPreserve) {
        Map attributes = new HashMap(keysToPreserve.size());
        for ( final String key : keysToPreserve ) {
            if ( igc.hasAttribute(key) )
                attributes.put(key, igc.getAttribute(key));
        }
        return attributes;
    }

    /**
     * @deprecated use variant context builder version instead
     * @param vc             the variant context
     * @param keysToPreserve the keys to preserve
     * @return a pruned version of the original variant context
     */
    @Deprecated
    public static VariantContext pruneVariantContext(final VariantContext vc, Collection keysToPreserve ) {
        return pruneVariantContext(new VariantContextBuilder(vc), keysToPreserve).make();
    }

    /**
     * Prunes the builder's variant context: keeps only the INFO attributes named in keysToPreserve
     * and, for each genotype, drops AD, DP, PL and all extended attributes (keeping GT and GQ).
     *
     * @param builder        the builder to prune; updated in place and returned for chaining
     * @param keysToPreserve INFO keys to keep; null is treated as the empty list
     * @return the same builder with pruned genotypes and attributes
     */
    public static VariantContextBuilder pruneVariantContext(final VariantContextBuilder builder, Collection keysToPreserve ) {
        final VariantContext vc = builder.make();
        if ( keysToPreserve == null ) keysToPreserve = Collections.emptyList();

        // VC info
        final Map attributes = subsetAttributes(vc.commonInfo, keysToPreserve);

        // Genotypes
        final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples());
        for ( final Genotype g : vc.getGenotypes() ) {
            final GenotypeBuilder gb = new GenotypeBuilder(g);
            // remove AD, DP, PL, and all extended attributes, keeping just GT and GQ
            gb.noAD().noDP().noPL().noAttributes();
            genotypes.add(gb.make());
        }

        return builder.genotypes(genotypes).attributes(attributes);
    }

    public enum GenotypeMergeType {
        /**
         * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD.
         */
        UNIQUIFY,
        /**
         * Take genotypes in priority order (see the priority argument).
         */
        PRIORITIZE,
        /**
         * Take the genotypes in any order.
         */
        UNSORTED,
        /**
         * Require that all samples/genotypes be unique between all inputs.
         */
        REQUIRE_UNIQUE
    }

    public enum FilteredRecordMergeType {
        /**
         * Union - leaves the record if any record is unfiltered.
         */
        KEEP_IF_ANY_UNFILTERED,
        /**
         * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this.
         */
        KEEP_IF_ALL_UNFILTERED,
        /**
         * If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset.
         */
        KEEP_UNCONDITIONAL
    }

    public enum MultipleAllelesMergeType {
        /**
         * Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record.
         */
        BY_TYPE,
        /**
         * Merge all allele types at the same start position into the same VCF record.
         */
        MIX_TYPES
    }

    /**
     * Merges VariantContexts into a single hybrid.  Takes genotypes for common samples in priority order, if provided.
* If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with * the sample name * * @param unsortedVCs collection of unsorted VCs * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs * @param filteredRecordMergeType merge type for filtered records * @param genotypeMergeOptions merge option for genotypes * @param annotateOrigin should we annotate the set it came from? * @param printMessages should we print messages? * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? * @return new VariantContext representing the merge of unsortedVCs */ public static VariantContext simpleMerge(final Collection unsortedVCs, final List priorityListOfVCs, final FilteredRecordMergeType filteredRecordMergeType, final GenotypeMergeType genotypeMergeOptions, final boolean annotateOrigin, final boolean printMessages, final String setKey, final boolean filteredAreUncalled, final boolean mergeInfoWithMaxAC ) { int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size(); return simpleMerge(unsortedVCs,priorityListOfVCs,originalNumOfVCs,filteredRecordMergeType,genotypeMergeOptions,annotateOrigin,printMessages,setKey,filteredAreUncalled,mergeInfoWithMaxAC); } /** * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with * the sample name. * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use * SampleUtils.verifyUniqueSamplesNames to check that before using sempleMerge. 
* * @param unsortedVCs collection of unsorted VCs * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs * @param filteredRecordMergeType merge type for filtered records * @param genotypeMergeOptions merge option for genotypes * @param annotateOrigin should we annotate the set it came from? * @param printMessages should we print messages? * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? * @return new VariantContext representing the merge of unsortedVCs */ public static VariantContext simpleMerge(final Collection unsortedVCs, final List priorityListOfVCs, final int originalNumOfVCs, final FilteredRecordMergeType filteredRecordMergeType, final GenotypeMergeType genotypeMergeOptions, final boolean annotateOrigin, final boolean printMessages, final String setKey, final boolean filteredAreUncalled, final boolean mergeInfoWithMaxAC ) { if ( unsortedVCs == null || unsortedVCs.size() == 0 ) return null; if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size()) throw new IllegalArgumentException("the number of the original VariantContexts must be the same as the number of VariantContexts in the priority list"); if ( annotateOrigin && priorityListOfVCs == null && originalNumOfVCs == 0) throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts or the number of original VariantContexts"); final List preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); // Make sure all variant contexts are padded with reference base in case of indels if necessary final List VCs = new ArrayList(); for (final VariantContext vc : preFilteredVCs) { if ( ! 
filteredAreUncalled || vc.isNotFiltered() ) VCs.add(vc); } if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled return null; // establish the baseline info from the first VC final VariantContext first = VCs.get(0); final String name = first.getSource(); final Allele refAllele = determineReferenceAllele(VCs); final Set alleles = new LinkedHashSet(); final Set filters = new HashSet(); final Map attributes = new LinkedHashMap(); final Set inconsistentAttributes = new HashSet(); final Set variantSources = new HashSet(); // contains the set of sources we found in our set of VCs that are variant final Set rsIDs = new LinkedHashSet(1); // most of the time there's one id VariantContext longestVC = first; int depth = 0; int maxAC = -1; final Map attributesWithMaxAC = new LinkedHashMap(); double log10PError = CommonInfo.NO_LOG10_PERROR; VariantContext vcWithMaxAC = null; GenotypesContext genotypes = GenotypesContext.create(); // counting the number of filtered and variant VCs int nFiltered = 0; boolean remapped = false; // cycle through and add info from the other VCs, making sure the loc/reference matches for ( final VariantContext vc : VCs ) { if ( longestVC.getStart() != vc.getStart() ) throw new IllegalStateException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); if ( getSize(vc) > getSize(longestVC) ) longestVC = vc; // get the longest location nFiltered += vc.isFiltered() ? 
1 : 0; if ( vc.isVariant() ) variantSources.add(vc.getSource()); AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); remapped = remapped || alleleMapping.needsRemapping(); alleles.addAll(alleleMapping.values()); mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); // We always take the QUAL of the first VC with a non-MISSING qual for the combined value if ( log10PError == CommonInfo.NO_LOG10_PERROR ) log10PError = vc.getLog10PError(); filters.addAll(vc.getFilters()); // // add attributes // // special case DP (add it up) and ID (just preserve it) // if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); if ( vc.hasID() ) rsIDs.add(vc.getID()); if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); // lets see if the string contains a , separator if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); for (String alleleCount : alleleCountArray) { final int ac = Integer.valueOf(alleleCount.trim()); if (ac > maxAC) { maxAC = ac; vcWithMaxAC = vc; } } } else { final int ac = Integer.valueOf(rawAlleleCounts); if (ac > maxAC) { maxAC = ac; vcWithMaxAC = vc; } } } for (final Map.Entry p : vc.getAttributes().entrySet()) { String key = p.getKey(); // if we don't like the key already, don't go anywhere if ( ! inconsistentAttributes.contains(key) ) { final boolean alreadyFound = attributes.containsKey(key); final Object boundValue = attributes.get(key); final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); if ( alreadyFound && ! boundValue.equals(p.getValue()) && ! 
boundIsMissingValue ) { // we found the value but we're inconsistent, put it in the exclude list //System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue, p.getValue()); inconsistentAttributes.add(key); attributes.remove(key); } else if ( ! alreadyFound || boundIsMissingValue ) { // no value //if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(), p.getValue()); attributes.put(key, p.getValue()); } } } } // if we have more alternate alleles in the merged VC than in one or more of the // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD for ( final VariantContext vc : VCs ) { if (vc.alleles.size() == 1) continue; if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) { if ( GeneralUtils.DEBUG_MODE_ENABLED && ! genotypes.isEmpty() ) { System.err.println(String.format("Stripping PLs at %s:%d-%d due to incompatible alleles merged=%s vs. single=%s", vc.getChr(), vc.getStart(), vc.getEnd(), alleles, vc.alleles)); } genotypes = stripPLsAndAD(genotypes); // this will remove stale AC,AF attributed from vc calculateChromosomeCounts(vc, attributes, true); break; } } // take the VC with the maxAC and pull the attributes into a modifiable map if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) { attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); } // if at least one record was unfiltered and we want a union, clear all of the filters if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL ) filters.clear(); if ( annotateOrigin ) { // we care about where the call came from String setValue; if ( nFiltered == 0 && variantSources.size() == originalNumOfVCs ) // nothing was unfiltered setValue = MERGE_INTERSECTION; else if ( nFiltered == VCs.size() ) // everything was filtered out setValue = MERGE_FILTER_IN_ALL; else if ( 
variantSources.isEmpty() ) // everyone was reference setValue = MERGE_REF_IN_ALL; else { final LinkedHashSet s = new LinkedHashSet(); for ( final VariantContext vc : VCs ) if ( vc.isVariant() ) s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() ); setValue = GeneralUtils.join("-", s); } if ( setKey != null ) { attributes.put(setKey, setValue); if( mergeInfoWithMaxAC && vcWithMaxAC != null ) { attributesWithMaxAC.put(setKey, setValue); } } } if ( depth > 0 ) attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : GeneralUtils.join(",", rsIDs); final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID); builder.loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd()); builder.alleles(alleles); builder.genotypes(genotypes); builder.log10PError(log10PError); builder.filters(filters.isEmpty() ? filters : new TreeSet(filters)); builder.attributes(new TreeMap(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); // Trim the padded bases of all alleles if necessary final VariantContext merged = builder.make(); if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); return merged; } private static final boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { final Iterator it1 = alleleSet1.iterator(); final Iterator it2 = alleleSet2.iterator(); while ( it1.hasNext() && it2.hasNext() ) { final Allele a1 = it1.next(); final Allele a2 = it2.next(); if ( ! a1.equals(a2) ) return true; } // by this point, at least one of the iterators is empty. All of the elements // we've compared are equal up until this point. But it's possible that the // sets aren't the same size, which is indicated by the test below. 
If they // are of the same size, though, the sets are compatible return it1.hasNext() || it2.hasNext(); } public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { // if all alleles of vc1 are a contained in alleles of vc2, return true if (!vc1.getReference().equals(vc2.getReference())) return false; for (Allele a :vc1.getAlternateAlleles()) { if (!vc2.getAlternateAlleles().contains(a)) return false; } return true; } public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) { GenotypesContext newGs = GenotypesContext.create(genotypes.size()); for ( final Genotype g : genotypes ) { newGs.add(removePLsAndAD(g)); } return newGs; } public static Map> separateVariantContextsByType(Collection VCs) { HashMap> mappedVCs = new HashMap>(); for ( VariantContext vc : VCs ) { // look at previous variant contexts of different type. If: // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list // b) vc has alleles which are subset of otherVC. 
Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list) // c) neither: do nothing, just add vc to its own list boolean addtoOwnList = true; for (VariantContext.Type type : VariantContext.Type.values()) { if (type.equals(vc.getType())) continue; if (!mappedVCs.containsKey(type)) continue; List vcList = mappedVCs.get(type); for (int k=0; k < vcList.size(); k++) { VariantContext otherVC = vcList.get(k); if (allelesAreSubset(otherVC,vc)) { // otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list vcList.remove(k); // avoid having empty lists if (vcList.size() == 0) mappedVCs.remove(type); if ( !mappedVCs.containsKey(vc.getType()) ) mappedVCs.put(vc.getType(), new ArrayList()); mappedVCs.get(vc.getType()).add(otherVC); break; } else if (allelesAreSubset(vc,otherVC)) { // vc has a type different than otherVC and its alleles are a subset of VC: add vc to otherVC's type list and don't add to its own mappedVCs.get(type).add(vc); addtoOwnList = false; break; } } } if (addtoOwnList) { if ( !mappedVCs.containsKey(vc.getType()) ) mappedVCs.put(vc.getType(), new ArrayList()); mappedVCs.get(vc.getType()).add(vc); } } return mappedVCs; } private static class AlleleMapper { private VariantContext vc = null; private Map map = null; public AlleleMapper(VariantContext vc) { this.vc = vc; } public AlleleMapper(Map map) { this.map = map; } public boolean needsRemapping() { return this.map != null; } public Collection values() { return map != null ? map.values() : vc.getAlleles(); } public Allele remap(Allele a) { return map != null && map.containsKey(a) ? 
map.get(a) : a; } public List remap(List as) { List newAs = new ArrayList(); for ( Allele a : as ) { //System.out.printf(" Remapping %s => %s%n", a, remap(a)); newAs.add(remap(a)); } return newAs; } } // TODO: remove that after testing // static private void verifyUniqueSampleNames(Collection unsortedVCs) { // Set names = new HashSet(); // for ( VariantContext vc : unsortedVCs ) { // for ( String name : vc.getSampleNames() ) { // //System.out.printf("Checking %s %b%n", name, names.contains(name)); // if ( names.contains(name) ) // throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered " + name); // } // // names.addAll(vc.getSampleNames()); // } // } static private Allele determineReferenceAllele(List VCs) { Allele ref = null; for ( VariantContext vc : VCs ) { Allele myRef = vc.getReference(); if ( ref == null || ref.length() < myRef.length() ) ref = myRef; else if ( ref.length() == myRef.length() && ! ref.equals(myRef) ) throw new TribbleException(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef)); } return ref; } static private AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set allAlleles) { if ( refAllele.equals(vc.getReference()) ) return new AlleleMapper(vc); else { // we really need to do some work. The refAllele is the longest reference allele seen at this // start site. 
So imagine it is: // // refAllele: ACGTGA // myRef: ACGT // myAlt: A // // We need to remap all of the alleles in vc to include the extra GA so that // myRef => refAllele and myAlt => AGA // Allele myRef = vc.getReference(); if ( refAllele.length() <= myRef.length() ) throw new IllegalStateException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele); byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); // System.out.printf("Remapping allele at %s%n", vc); // System.out.printf("ref %s%n", refAllele); // System.out.printf("myref %s%n", myRef ); // System.out.printf("extrabases %s%n", new String(extraBases)); Map map = new HashMap(); for ( Allele a : vc.getAlleles() ) { if ( a.isReference() ) map.put(a, refAllele); else { Allele extended = Allele.extend(a, extraBases); for ( Allele b : allAlleles ) if ( extended.equals(b) ) extended = b; // System.out.printf(" Extending %s => %s%n", a, extended); map.put(a, extended); } } // debugging // System.out.printf("mapping %s%n", map); return new AlleleMapper(map); } } static class CompareByPriority implements Comparator, Serializable { List priorityListOfVCs; public CompareByPriority(List priorityListOfVCs) { this.priorityListOfVCs = priorityListOfVCs; } private int getIndex(VariantContext vc) { int i = priorityListOfVCs.indexOf(vc.getSource()); if ( i == -1 ) throw new IllegalArgumentException("Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc.getSource()); return i; } public int compare(VariantContext vc1, VariantContext vc2) { return Integer.valueOf(getIndex(vc1)).compareTo(getIndex(vc2)); } } public static List sortVariantContextsByPriority(Collection unsortedVCs, List priorityListOfVCs, GenotypeMergeType mergeOption ) { if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null ) throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); if ( priorityListOfVCs == null || mergeOption 
== GenotypeMergeType.UNSORTED ) return new ArrayList(unsortedVCs); else { ArrayList sorted = new ArrayList(unsortedVCs); Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); return sorted; } } private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { //TODO: should we add a check for cases when the genotypeMergeOption is REQUIRE_UNIQUE for ( Genotype g : oneVC.getGenotypes() ) { String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniqifySamples); if ( ! mergedGenotypes.containsSample(name) ) { // only add if the name is new Genotype newG = g; if ( uniqifySamples || alleleMapping.needsRemapping() ) { final List alleles = alleleMapping.needsRemapping() ? alleleMapping.remap(g.getAlleles()) : g.getAlleles(); newG = new GenotypeBuilder(g).name(name).alleles(alleles).make(); } mergedGenotypes.add(newG); } } } public static String mergedSampleName(String trackName, String sampleName, boolean uniqify ) { return uniqify ? sampleName + "." + trackName : sampleName; } /** * Returns a context identical to this with the REF and ALT alleles reverse complemented. 
* * @param vc variant context * @return new vc */ public static VariantContext reverseComplement(VariantContext vc) { // create a mapping from original allele to reverse complemented allele HashMap alleleMap = new HashMap(vc.getAlleles().size()); for ( Allele originalAllele : vc.getAlleles() ) { Allele newAllele; if ( originalAllele.isNoCall() ) newAllele = originalAllele; else newAllele = Allele.create(BaseUtils.simpleReverseComplement(originalAllele.getBases()), originalAllele.isReference()); alleleMap.put(originalAllele, newAllele); } // create new Genotype objects GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); for ( final Genotype genotype : vc.getGenotypes() ) { List newAlleles = new ArrayList(); for ( Allele allele : genotype.getAlleles() ) { Allele newAllele = alleleMap.get(allele); if ( newAllele == null ) newAllele = Allele.NO_CALL; newAlleles.add(newAllele); } newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make()); } return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make(); } public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set allowedAttributes) { if ( allowedAttributes == null ) return vc; GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); for ( final Genotype genotype : vc.getGenotypes() ) { Map attrs = new HashMap(); for ( Map.Entry attr : genotype.getExtendedAttributes().entrySet() ) { if ( allowedAttributes.contains(attr.getKey()) ) attrs.put(attr.getKey(), attr.getValue()); } newGenotypes.add(new GenotypeBuilder(genotype).attributes(attrs).make()); } return new VariantContextBuilder(vc).genotypes(newGenotypes).make(); } public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(VariantContext context) { if (!context.isSNP() || !context.isBiallelic()) throw new IllegalStateException("Requested SNP substitution type for bialleic non-SNP " + context); return 
BaseUtils.SNPSubstitutionType(context.getReference().getBases()[0], context.getAlternateAllele(0).getBases()[0]); } /** * If this is a BiAlleic SNP, is it a transition? */ public static boolean isTransition(VariantContext context) { return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSITION; } /** * If this is a BiAlleic SNP, is it a transversion? */ public static boolean isTransversion(VariantContext context) { return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; } public static boolean isTransition(Allele ref, Allele alt) { return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSITION; } public static boolean isTransversion(Allele ref, Allele alt) { return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSVERSION; } public static int getSize( VariantContext vc ) { return vc.getEnd() - vc.getStart() + 1; } public static final Set genotypeNames(final Collection genotypes) { final Set names = new HashSet(genotypes.size()); for ( final Genotype g : genotypes ) names.add(g.getSampleName()); return names; } /** * Compute the end position for this VariantContext from the alleles themselves * * In the trivial case this is a single BP event and end = start (open intervals) * In general the end is start + ref length - 1, handling the case where ref length == 0 * However, if alleles contains a symbolic allele then we use endForSymbolicAllele in all cases * * @param alleles the list of alleles to consider. The reference allele must be the first one * @param start the known start position of this event * @param endForSymbolicAlleles the end position to use if any of the alleles is symbolic. Can be -1 * if no is expected but will throw an error if one is found * @return this builder */ @Requires({"! 
alleles.isEmpty()", "start > 0", "endForSymbolicAlleles == -1 || endForSymbolicAlleles > 0" }) public static int computeEndFromAlleles(final List alleles, final int start, final int endForSymbolicAlleles) { final Allele ref = alleles.get(0); if ( ref.isNonReference() ) throw new IllegalStateException("computeEndFromAlleles requires first allele to be reference"); if ( VariantContext.hasSymbolicAlleles(alleles) ) { if ( endForSymbolicAlleles == -1 ) throw new IllegalStateException("computeEndFromAlleles found a symbolic allele but endForSymbolicAlleles was provided"); return endForSymbolicAlleles; } else { return start + Math.max(ref.length() - 1, 0); } } public static boolean requiresPaddingBase(final List alleles) { // see whether one of the alleles would be null if trimmed through for ( final String allele : alleles ) { if ( allele.isEmpty() ) return true; } int clipping = 0; Character currentBase = null; while ( true ) { for ( final String allele : alleles ) { if ( allele.length() - clipping == 0 ) return true; char myBase = allele.charAt(clipping); if ( currentBase == null ) currentBase = myBase; else if ( currentBase != myBase ) return false; } clipping++; currentBase = null; } } }