2010-03-16 00:00:55 +08:00
/*
 * Copyright (c) 2012 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
2010-03-16 00:00:55 +08:00
2012-12-19 03:56:48 +08:00
package org.broadinstitute.variant.variantcontext ;
2010-01-26 21:53:29 +08:00
2011-07-18 08:29:58 +08:00
import com.google.java.contract.Ensures ;
import com.google.java.contract.Requires ;
import org.apache.commons.jexl2.Expression ;
import org.apache.commons.jexl2.JexlEngine ;
2013-01-09 03:45:50 +08:00
import org.apache.commons.lang.ArrayUtils ;
2012-12-19 03:56:48 +08:00
import org.broad.tribble.TribbleException ;
2010-08-12 01:01:38 +08:00
import org.broad.tribble.util.popgen.HardyWeinbergCalculation ;
2012-12-19 03:56:48 +08:00
import org.broadinstitute.variant.utils.BaseUtils ;
2013-01-09 03:45:50 +08:00
import org.broadinstitute.variant.utils.GeneralUtils ;
import org.broadinstitute.variant.utils.Pair ;
2012-12-19 03:56:48 +08:00
import org.broadinstitute.variant.vcf.* ;
2010-01-26 21:53:29 +08:00
2011-07-18 08:29:58 +08:00
import java.io.Serializable ;
import java.util.* ;
2010-01-26 21:53:29 +08:00
public class VariantContextUtils {
2011-09-23 05:05:12 +08:00
public final static String MERGE_INTERSECTION = "Intersection" ;
public final static String MERGE_FILTER_IN_ALL = "FilteredInAll" ;
public final static String MERGE_REF_IN_ALL = "ReferenceInAll" ;
public final static String MERGE_FILTER_PREFIX = "filterIn" ;
2013-01-09 03:45:50 +08:00
public static final int DEFAULT_PLOIDY = 2 ;
public static final double SUM_GL_THRESH_NOCALL = - 0.1 ; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
2012-06-29 10:36:26 +08:00
private static Set < String > MISSING_KEYS_WARNED_ABOUT = new HashSet < String > ( ) ;
2013-01-09 03:45:50 +08:00
private static final List < Allele > NO_CALL_ALLELES = Arrays . asList ( Allele . NO_CALL , Allele . NO_CALL ) ;
2011-09-23 05:05:12 +08:00
2010-07-19 00:29:59 +08:00
final public static JexlEngine engine = new JexlEngine ( ) ;
2012-05-23 04:27:13 +08:00
private final static boolean ASSUME_MISSING_FIELDS_ARE_STRINGS = false ;
2012-04-10 02:53:05 +08:00
2010-11-18 00:08:16 +08:00
static {
2012-06-22 03:14:06 +08:00
engine . setSilent ( false ) ; // will throw errors now for selects that don't evaluate properly
engine . setLenient ( false ) ;
2012-06-21 21:54:36 +08:00
engine . setDebug ( false ) ;
2010-11-18 00:08:16 +08:00
}
2010-06-07 08:33:02 +08:00
2010-10-08 12:37:54 +08:00
/ * *
2011-11-20 07:40:15 +08:00
* Update the attributes of the attributes map given the VariantContext to reflect the
* proper chromosome - based VCF tags
2010-10-08 12:37:54 +08:00
*
* @param vc the VariantContext
* @param attributes the attributes map to populate ; must not be null ; may contain old values
* @param removeStaleValues should we remove stale values from the mapping ?
2011-11-20 07:40:15 +08:00
* @return the attributes map provided as input , returned for programming convenience
2010-10-08 12:37:54 +08:00
* /
2011-11-20 07:40:15 +08:00
public static Map < String , Object > calculateChromosomeCounts ( VariantContext vc , Map < String , Object > attributes , boolean removeStaleValues ) {
2012-04-25 23:52:23 +08:00
return calculateChromosomeCounts ( vc , attributes , removeStaleValues , new HashSet < String > ( 0 ) ) ;
}
/ * *
* Update the attributes of the attributes map given the VariantContext to reflect the
* proper chromosome - based VCF tags
*
* @param vc the VariantContext
* @param attributes the attributes map to populate ; must not be null ; may contain old values
* @param removeStaleValues should we remove stale values from the mapping ?
* @param founderIds - Set of founders Ids to take into account . AF and FC will be calculated over the founders .
* If empty or null , counts are generated for all samples as unrelated individuals
* @return the attributes map provided as input , returned for programming convenience
* /
public static Map < String , Object > calculateChromosomeCounts ( VariantContext vc , Map < String , Object > attributes , boolean removeStaleValues , final Set < String > founderIds ) {
2012-02-28 03:56:10 +08:00
final int AN = vc . getCalledChrCount ( ) ;
2010-10-08 12:37:54 +08:00
// if everyone is a no-call, remove the old attributes if requested
2012-02-28 03:56:10 +08:00
if ( AN = = 0 & & removeStaleValues ) {
2010-10-08 12:37:54 +08:00
if ( attributes . containsKey ( VCFConstants . ALLELE_COUNT_KEY ) )
attributes . remove ( VCFConstants . ALLELE_COUNT_KEY ) ;
if ( attributes . containsKey ( VCFConstants . ALLELE_FREQUENCY_KEY ) )
attributes . remove ( VCFConstants . ALLELE_FREQUENCY_KEY ) ;
if ( attributes . containsKey ( VCFConstants . ALLELE_NUMBER_KEY ) )
attributes . remove ( VCFConstants . ALLELE_NUMBER_KEY ) ;
2011-11-20 07:40:15 +08:00
return attributes ;
2010-10-08 12:37:54 +08:00
}
2011-03-07 03:25:52 +08:00
if ( vc . hasGenotypes ( ) ) {
2012-02-28 03:56:10 +08:00
attributes . put ( VCFConstants . ALLELE_NUMBER_KEY , AN ) ;
2010-10-08 12:37:54 +08:00
2011-03-07 03:25:52 +08:00
// if there are alternate alleles, record the relevant tags
if ( vc . getAlternateAlleles ( ) . size ( ) > 0 ) {
2012-06-02 07:25:11 +08:00
ArrayList < Double > alleleFreqs = new ArrayList < Double > ( ) ;
2012-04-25 23:52:23 +08:00
ArrayList < Integer > alleleCounts = new ArrayList < Integer > ( ) ;
ArrayList < Integer > foundersAlleleCounts = new ArrayList < Integer > ( ) ;
double totalFoundersChromosomes = ( double ) vc . getCalledChrCount ( founderIds ) ;
int foundersAltChromosomes ;
2011-03-07 03:25:52 +08:00
for ( Allele allele : vc . getAlternateAlleles ( ) ) {
2012-04-25 23:52:23 +08:00
foundersAltChromosomes = vc . getCalledChrCount ( allele , founderIds ) ;
alleleCounts . add ( vc . getCalledChrCount ( allele ) ) ;
foundersAlleleCounts . add ( foundersAltChromosomes ) ;
2012-02-28 03:56:10 +08:00
if ( AN = = 0 ) {
2012-06-02 07:25:11 +08:00
alleleFreqs . add ( 0.0 ) ;
2012-02-28 03:56:10 +08:00
} else {
2012-06-10 20:55:23 +08:00
final Double freq = ( double ) foundersAltChromosomes / totalFoundersChromosomes ;
2012-02-28 03:56:10 +08:00
alleleFreqs . add ( freq ) ;
}
2011-03-07 03:25:52 +08:00
}
2010-10-08 12:37:54 +08:00
2011-03-07 03:25:52 +08:00
attributes . put ( VCFConstants . ALLELE_COUNT_KEY , alleleCounts . size ( ) = = 1 ? alleleCounts . get ( 0 ) : alleleCounts ) ;
attributes . put ( VCFConstants . ALLELE_FREQUENCY_KEY , alleleFreqs . size ( ) = = 1 ? alleleFreqs . get ( 0 ) : alleleFreqs ) ;
2012-06-30 23:22:27 +08:00
} else {
// if there's no alt AC and AF shouldn't be present
attributes . remove ( VCFConstants . ALLELE_COUNT_KEY ) ;
attributes . remove ( VCFConstants . ALLELE_FREQUENCY_KEY ) ;
2011-03-07 03:25:52 +08:00
}
2010-10-08 12:37:54 +08:00
}
2011-11-20 07:40:15 +08:00
return attributes ;
}
/ * *
* Update the attributes of the attributes map in the VariantContextBuilder to reflect the proper
* chromosome - based VCF tags based on the current VC produced by builder . make ( )
*
* @param builder the VariantContextBuilder we are updating
* @param removeStaleValues should we remove stale values from the mapping ?
* /
public static void calculateChromosomeCounts ( VariantContextBuilder builder , boolean removeStaleValues ) {
2012-04-25 23:52:23 +08:00
VariantContext vc = builder . make ( ) ;
builder . attributes ( calculateChromosomeCounts ( vc , new HashMap < String , Object > ( vc . getAttributes ( ) ) , removeStaleValues , new HashSet < String > ( 0 ) ) ) ;
}
/ * *
* Update the attributes of the attributes map in the VariantContextBuilder to reflect the proper
* chromosome - based VCF tags based on the current VC produced by builder . make ( )
*
* @param builder the VariantContextBuilder we are updating
* @param founderIds - Set of founders to take into account . AF and FC will be calculated over the founders only .
* If empty or null , counts are generated for all samples as unrelated individuals
* @param removeStaleValues should we remove stale values from the mapping ?
* /
public static void calculateChromosomeCounts ( VariantContextBuilder builder , boolean removeStaleValues , final Set < String > founderIds ) {
VariantContext vc = builder . make ( ) ;
builder . attributes ( calculateChromosomeCounts ( vc , new HashMap < String , Object > ( vc . getAttributes ( ) ) , removeStaleValues , founderIds ) ) ;
2010-10-08 12:37:54 +08:00
}
2012-10-07 11:02:36 +08:00
public static Genotype removePLsAndAD ( final Genotype g ) {
return ( g . hasLikelihoods ( ) | | g . hasAD ( ) ) ? new GenotypeBuilder ( g ) . noPL ( ) . noAD ( ) . make ( ) : g ;
2011-09-23 20:23:46 +08:00
}
2012-05-22 19:14:46 +08:00
public final static VCFCompoundHeaderLine getMetaDataForField ( final VCFHeader header , final String field ) {
VCFCompoundHeaderLine metaData = header . getFormatHeaderLine ( field ) ;
if ( metaData = = null ) metaData = header . getInfoHeaderLine ( field ) ;
if ( metaData = = null ) {
if ( ASSUME_MISSING_FIELDS_ARE_STRINGS ) {
if ( ! MISSING_KEYS_WARNED_ABOUT . contains ( field ) ) {
MISSING_KEYS_WARNED_ABOUT . add ( field ) ;
2013-01-09 03:45:50 +08:00
if ( GeneralUtils . DEBUG_MODE_ENABLED )
System . err . println ( "Field " + field + " missing from VCF header, assuming it is an unbounded string type" ) ;
2012-05-22 19:14:46 +08:00
}
return new VCFInfoHeaderLine ( field , VCFHeaderLineCount . UNBOUNDED , VCFHeaderLineType . String , "Auto-generated string header for " + field ) ;
}
else
2012-12-19 03:56:48 +08:00
throw new TribbleException ( "Fully decoding VariantContext requires header line for all fields, but none was found for " + field ) ;
2012-05-22 19:14:46 +08:00
}
return metaData ;
}
2013-01-09 03:45:50 +08:00
/ * *
* Returns true iff VC is an non - complex indel where every allele represents an expansion or
* contraction of a series of identical bases in the reference .
*
* For example , suppose the ref bases are CTCTCTGA , which includes a 3 x repeat of CTCTCT
*
* If VC = - / CT , then this function returns true because the CT insertion matches exactly the
* upcoming reference .
* If VC = - / CTA then this function returns false because the CTA isn ' t a perfect match
*
* Now consider deletions :
*
* If VC = CT / - then again the same logic applies and this returns true
* The case of CTA / - makes no sense because it doesn ' t actually match the reference bases .
*
* The logic of this function is pretty simple . Take all of the non - null alleles in VC . For
* each insertion allele of n bases , check if that allele matches the next n reference bases .
* For each deletion allele of n bases , check if this matches the reference bases at n - 2 n ,
* as it must necessarily match the first n bases . If this test returns true for all
* alleles you are a tandem repeat , otherwise you are not .
*
* @param vc
* @param refBasesStartingAtVCWithPad not this is assumed to include the PADDED reference
* @return
* /
@Requires ( { "vc != null" , "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0" } )
public static boolean isTandemRepeat ( final VariantContext vc , final byte [ ] refBasesStartingAtVCWithPad ) {
final String refBasesStartingAtVCWithoutPad = new String ( refBasesStartingAtVCWithPad ) . substring ( 1 ) ;
if ( ! vc . isIndel ( ) ) // only indels are tandem repeats
return false ;
final Allele ref = vc . getReference ( ) ;
for ( final Allele allele : vc . getAlternateAlleles ( ) ) {
if ( ! isRepeatAllele ( ref , allele , refBasesStartingAtVCWithoutPad ) )
return false ;
}
// we've passed all of the tests, so we are a repeat
return true ;
}
/ * *
*
* @param vc
* @param refBasesStartingAtVCWithPad
* @return
* /
@Requires ( { "vc != null" , "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0" } )
public static Pair < List < Integer > , byte [ ] > getNumTandemRepeatUnits ( final VariantContext vc , final byte [ ] refBasesStartingAtVCWithPad ) {
final boolean VERBOSE = false ;
final String refBasesStartingAtVCWithoutPad = new String ( refBasesStartingAtVCWithPad ) . substring ( 1 ) ;
if ( ! vc . isIndel ( ) ) // only indels are tandem repeats
return null ;
final Allele refAllele = vc . getReference ( ) ;
final byte [ ] refAlleleBases = Arrays . copyOfRange ( refAllele . getBases ( ) , 1 , refAllele . length ( ) ) ;
byte [ ] repeatUnit = null ;
final ArrayList < Integer > lengths = new ArrayList < Integer > ( ) ;
for ( final Allele allele : vc . getAlternateAlleles ( ) ) {
Pair < int [ ] , byte [ ] > result = getNumTandemRepeatUnits ( refAlleleBases , Arrays . copyOfRange ( allele . getBases ( ) , 1 , allele . length ( ) ) , refBasesStartingAtVCWithoutPad . getBytes ( ) ) ;
final int [ ] repetitionCount = result . first ;
// repetition count = 0 means allele is not a tandem expansion of context
if ( repetitionCount [ 0 ] = = 0 | | repetitionCount [ 1 ] = = 0 )
return null ;
if ( lengths . size ( ) = = 0 ) {
lengths . add ( repetitionCount [ 0 ] ) ; // add ref allele length only once
}
lengths . add ( repetitionCount [ 1 ] ) ; // add this alt allele's length
repeatUnit = result . second ;
if ( VERBOSE ) {
System . out . println ( "RefContext:" + refBasesStartingAtVCWithoutPad ) ;
System . out . println ( "Ref:" + refAllele . toString ( ) + " Count:" + String . valueOf ( repetitionCount [ 0 ] ) ) ;
System . out . println ( "Allele:" + allele . toString ( ) + " Count:" + String . valueOf ( repetitionCount [ 1 ] ) ) ;
System . out . println ( "RU:" + new String ( repeatUnit ) ) ;
}
}
return new Pair < List < Integer > , byte [ ] > ( lengths , repeatUnit ) ;
}
protected static Pair < int [ ] , byte [ ] > getNumTandemRepeatUnits ( final byte [ ] refBases , final byte [ ] altBases , final byte [ ] remainingRefContext ) {
/ * we can ' t exactly apply same logic as in basesAreRepeated ( ) to compute tandem unit and number of repeated units .
Consider case where ref = ATATAT and we have an insertion of ATAT . Natural description is ( AT ) 3 - > ( AT ) 5.
* /
byte [ ] longB ;
// find first repeat unit based on either ref or alt, whichever is longer
if ( altBases . length > refBases . length )
longB = altBases ;
else
longB = refBases ;
// see if non-null allele (either ref or alt, whichever is longer) can be decomposed into several identical tandem units
// for example, -*,CACA needs to first be decomposed into (CA)2
final int repeatUnitLength = findRepeatedSubstring ( longB ) ;
final byte [ ] repeatUnit = Arrays . copyOf ( longB , repeatUnitLength ) ;
final int [ ] repetitionCount = new int [ 2 ] ;
// repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext));
// repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext));
int repetitionsInRef = findNumberofRepetitions ( repeatUnit , refBases ) ;
repetitionCount [ 0 ] = findNumberofRepetitions ( repeatUnit , ArrayUtils . addAll ( refBases , remainingRefContext ) ) - repetitionsInRef ;
repetitionCount [ 1 ] = findNumberofRepetitions ( repeatUnit , ArrayUtils . addAll ( altBases , remainingRefContext ) ) - repetitionsInRef ;
return new Pair < int [ ] , byte [ ] > ( repetitionCount , repeatUnit ) ;
}
/ * *
* Find out if a string can be represented as a tandem number of substrings .
* For example ACTACT is a 2 - tandem of ACT ,
* but ACTACA is not .
*
* @param bases String to be tested
* @return Length of repeat unit , if string can be represented as tandem of substring ( if it can ' t
* be represented as one , it will be just the length of the input string )
* /
public static int findRepeatedSubstring ( byte [ ] bases ) {
int repLength ;
for ( repLength = 1 ; repLength < = bases . length ; repLength + + ) {
final byte [ ] candidateRepeatUnit = Arrays . copyOf ( bases , repLength ) ;
boolean allBasesMatch = true ;
for ( int start = repLength ; start < bases . length ; start + = repLength ) {
// check that remaining of string is exactly equal to repeat unit
final byte [ ] basePiece = Arrays . copyOfRange ( bases , start , start + candidateRepeatUnit . length ) ;
if ( ! Arrays . equals ( candidateRepeatUnit , basePiece ) ) {
allBasesMatch = false ;
break ;
}
}
if ( allBasesMatch )
return repLength ;
}
return repLength ;
}
/ * *
* Helper routine that finds number of repetitions a string consists of .
* For example , for string ATAT and repeat unit AT , number of repetitions = 2
* @param repeatUnit Substring
* @param testString String to test
* @return Number of repetitions ( 0 if testString is not a concatenation of n repeatUnit ' s
* /
public static int findNumberofRepetitions ( byte [ ] repeatUnit , byte [ ] testString ) {
int numRepeats = 0 ;
for ( int start = 0 ; start < testString . length ; start + = repeatUnit . length ) {
int end = start + repeatUnit . length ;
byte [ ] unit = Arrays . copyOfRange ( testString , start , end ) ;
if ( Arrays . equals ( unit , repeatUnit ) )
numRepeats + + ;
else
return numRepeats ;
}
return numRepeats ;
}
/ * *
* Helper function for isTandemRepeat that checks that allele matches somewhere on the reference
* @param ref
* @param alt
* @param refBasesStartingAtVCWithoutPad
* @return
* /
protected static boolean isRepeatAllele ( final Allele ref , final Allele alt , final String refBasesStartingAtVCWithoutPad ) {
if ( ! Allele . oneIsPrefixOfOther ( ref , alt ) )
return false ; // we require one allele be a prefix of another
if ( ref . length ( ) > alt . length ( ) ) { // we are a deletion
return basesAreRepeated ( ref . getBaseString ( ) , alt . getBaseString ( ) , refBasesStartingAtVCWithoutPad , 2 ) ;
} else { // we are an insertion
return basesAreRepeated ( alt . getBaseString ( ) , ref . getBaseString ( ) , refBasesStartingAtVCWithoutPad , 1 ) ;
}
}
protected static boolean basesAreRepeated ( final String l , final String s , final String ref , final int minNumberOfMatches ) {
final String potentialRepeat = l . substring ( s . length ( ) ) ; // skip s bases
for ( int i = 0 ; i < minNumberOfMatches ; i + + ) {
final int start = i * potentialRepeat . length ( ) ;
final int end = ( i + 1 ) * potentialRepeat . length ( ) ;
if ( ref . length ( ) < end )
return false ; // we ran out of bases to test
final String refSub = ref . substring ( start , end ) ;
if ( ! refSub . equals ( potentialRepeat ) )
return false ; // repeat didn't match, fail
}
return true ; // we passed all tests, we matched
}
/ * *
* Assign genotypes ( GTs ) to the samples in the Variant Context greedily based on the PLs
*
* @param vc variant context with genotype likelihoods
* @return genotypes context
* /
public static GenotypesContext assignDiploidGenotypes ( final VariantContext vc ) {
return subsetDiploidAlleles ( vc , vc . getAlleles ( ) , true ) ;
}
/ * *
* Split variant context into its biallelic components if there are more than 2 alleles
*
* For VC has A / B / C alleles , returns A / B and A / C contexts .
* Genotypes are all no - calls now ( it ' s not possible to fix them easily )
* Alleles are right trimmed to satisfy VCF conventions
*
* If vc is biallelic or non - variant it is just returned
*
* Chromosome counts are updated ( but they are by definition 0 )
*
* @param vc a potentially multi - allelic variant context
* @return a list of bi - allelic ( or monomorphic ) variant context
* /
public static List < VariantContext > splitVariantContextToBiallelics ( final VariantContext vc ) {
if ( ! vc . isVariant ( ) | | vc . isBiallelic ( ) )
// non variant or biallelics already satisfy the contract
return Collections . singletonList ( vc ) ;
else {
final List < VariantContext > biallelics = new LinkedList < VariantContext > ( ) ;
for ( final Allele alt : vc . getAlternateAlleles ( ) ) {
VariantContextBuilder builder = new VariantContextBuilder ( vc ) ;
final List < Allele > alleles = Arrays . asList ( vc . getReference ( ) , alt ) ;
builder . alleles ( alleles ) ;
builder . genotypes ( subsetDiploidAlleles ( vc , alleles , false ) ) ;
calculateChromosomeCounts ( builder , true ) ;
biallelics . add ( reverseTrimAlleles ( builder . make ( ) ) ) ;
}
return biallelics ;
}
}
/ * *
* subset the Variant Context to the specific set of alleles passed in ( pruning the PLs appropriately )
*
* @param vc variant context with genotype likelihoods
* @param allelesToUse which alleles from the vc are okay to use ; * * * must be in the same relative order as those in the original VC * * *
* @param assignGenotypes true if we should update the genotypes based on the ( subsetted ) PLs
* @return genotypes
* /
public static GenotypesContext subsetDiploidAlleles ( final VariantContext vc ,
final List < Allele > allelesToUse ,
final boolean assignGenotypes ) {
// the genotypes with PLs
final GenotypesContext oldGTs = vc . getGenotypes ( ) ;
// samples
final List < String > sampleIndices = oldGTs . getSampleNamesOrderedByName ( ) ;
// the new genotypes to create
final GenotypesContext newGTs = GenotypesContext . create ( ) ;
// we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward
final int numOriginalAltAlleles = vc . getAlternateAlleles ( ) . size ( ) ;
final int numNewAltAlleles = allelesToUse . size ( ) - 1 ;
// which PLs should be carried forward?
ArrayList < Integer > likelihoodIndexesToUse = null ;
// an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles,
// then we can keep the PLs as is; otherwise, we determine which ones to keep
if ( numNewAltAlleles ! = numOriginalAltAlleles & & numNewAltAlleles > 0 ) {
likelihoodIndexesToUse = new ArrayList < Integer > ( 30 ) ;
final boolean [ ] altAlleleIndexToUse = new boolean [ numOriginalAltAlleles ] ;
for ( int i = 0 ; i < numOriginalAltAlleles ; i + + ) {
if ( allelesToUse . contains ( vc . getAlternateAllele ( i ) ) )
altAlleleIndexToUse [ i ] = true ;
}
// numLikelihoods takes total # of alleles. Use default # of chromosomes (ploidy) = 2
final int numLikelihoods = GenotypeLikelihoods . numLikelihoods ( 1 + numOriginalAltAlleles , DEFAULT_PLOIDY ) ;
for ( int PLindex = 0 ; PLindex < numLikelihoods ; PLindex + + ) {
final GenotypeLikelihoods . GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods . getAllelePair ( PLindex ) ;
// consider this entry only if both of the alleles are good
if ( ( alleles . alleleIndex1 = = 0 | | altAlleleIndexToUse [ alleles . alleleIndex1 - 1 ] ) & & ( alleles . alleleIndex2 = = 0 | | altAlleleIndexToUse [ alleles . alleleIndex2 - 1 ] ) )
likelihoodIndexesToUse . add ( PLindex ) ;
}
}
// create the new genotypes
for ( int k = 0 ; k < oldGTs . size ( ) ; k + + ) {
final Genotype g = oldGTs . get ( sampleIndices . get ( k ) ) ;
if ( ! g . hasLikelihoods ( ) ) {
newGTs . add ( GenotypeBuilder . create ( g . getSampleName ( ) , NO_CALL_ALLELES ) ) ;
continue ;
}
// create the new likelihoods array from the alleles we are allowed to use
final double [ ] originalLikelihoods = g . getLikelihoods ( ) . getAsVector ( ) ;
double [ ] newLikelihoods ;
if ( likelihoodIndexesToUse = = null ) {
newLikelihoods = originalLikelihoods ;
} else {
newLikelihoods = new double [ likelihoodIndexesToUse . size ( ) ] ;
int newIndex = 0 ;
for ( int oldIndex : likelihoodIndexesToUse )
newLikelihoods [ newIndex + + ] = originalLikelihoods [ oldIndex ] ;
// might need to re-normalize
newLikelihoods = GeneralUtils . normalizeFromLog10 ( newLikelihoods , false , true ) ;
}
// if there is no mass on the (new) likelihoods, then just no-call the sample
if ( GeneralUtils . sum ( newLikelihoods ) > SUM_GL_THRESH_NOCALL ) {
newGTs . add ( GenotypeBuilder . create ( g . getSampleName ( ) , NO_CALL_ALLELES ) ) ;
}
else {
final GenotypeBuilder gb = new GenotypeBuilder ( g ) ;
if ( numNewAltAlleles = = 0 )
gb . noPL ( ) ;
else
gb . PL ( newLikelihoods ) ;
// if we weren't asked to assign a genotype, then just no-call the sample
if ( ! assignGenotypes | | GeneralUtils . sum ( newLikelihoods ) > SUM_GL_THRESH_NOCALL ) {
gb . alleles ( NO_CALL_ALLELES ) ;
}
else {
// find the genotype with maximum likelihoods
int PLindex = numNewAltAlleles = = 0 ? 0 : GeneralUtils . maxElementIndex ( newLikelihoods ) ;
GenotypeLikelihoods . GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods . getAllelePair ( PLindex ) ;
gb . alleles ( Arrays . asList ( allelesToUse . get ( alleles . alleleIndex1 ) , allelesToUse . get ( alleles . alleleIndex2 ) ) ) ;
if ( numNewAltAlleles ! = 0 ) gb . log10PError ( GenotypeLikelihoods . getGQLog10FromLikelihoods ( PLindex , newLikelihoods ) ) ;
}
newGTs . add ( gb . make ( ) ) ;
}
}
return newGTs ;
}
public static VariantContext reverseTrimAlleles ( final VariantContext inputVC ) {
// see whether we need to trim common reference base from all alleles
final int trimExtent = computeReverseClipping ( inputVC . getAlleles ( ) , inputVC . getReference ( ) . getDisplayString ( ) . getBytes ( ) , 0 , false ) ;
if ( trimExtent < = 0 | | inputVC . getAlleles ( ) . size ( ) < = 1 )
return inputVC ;
final List < Allele > alleles = new ArrayList < Allele > ( ) ;
final GenotypesContext genotypes = GenotypesContext . create ( ) ;
final Map < Allele , Allele > originalToTrimmedAlleleMap = new HashMap < Allele , Allele > ( ) ;
for ( final Allele a : inputVC . getAlleles ( ) ) {
if ( a . isSymbolic ( ) ) {
alleles . add ( a ) ;
originalToTrimmedAlleleMap . put ( a , a ) ;
} else {
// get bases for current allele and create a new one with trimmed bases
final byte [ ] newBases = Arrays . copyOfRange ( a . getBases ( ) , 0 , a . length ( ) - trimExtent ) ;
final Allele trimmedAllele = Allele . create ( newBases , a . isReference ( ) ) ;
alleles . add ( trimmedAllele ) ;
originalToTrimmedAlleleMap . put ( a , trimmedAllele ) ;
}
}
// now we can recreate new genotypes with trimmed alleles
for ( final Genotype genotype : inputVC . getGenotypes ( ) ) {
final List < Allele > originalAlleles = genotype . getAlleles ( ) ;
final List < Allele > trimmedAlleles = new ArrayList < Allele > ( ) ;
for ( final Allele a : originalAlleles ) {
if ( a . isCalled ( ) )
trimmedAlleles . add ( originalToTrimmedAlleleMap . get ( a ) ) ;
else
trimmedAlleles . add ( Allele . NO_CALL ) ;
}
genotypes . add ( new GenotypeBuilder ( genotype ) . alleles ( trimmedAlleles ) . make ( ) ) ;
}
return new VariantContextBuilder ( inputVC ) . stop ( inputVC . getStart ( ) + alleles . get ( 0 ) . length ( ) - 1 ) . alleles ( alleles ) . genotypes ( genotypes ) . make ( ) ;
}
public static int computeReverseClipping ( final List < Allele > unclippedAlleles ,
final byte [ ] ref ,
final int forwardClipping ,
final boolean allowFullClip ) {
int clipping = 0 ;
boolean stillClipping = true ;
while ( stillClipping ) {
for ( final Allele a : unclippedAlleles ) {
if ( a . isSymbolic ( ) )
continue ;
// we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong
// position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine).
if ( a . length ( ) - clipping = = 0 )
return clipping - ( allowFullClip ? 0 : 1 ) ;
if ( a . length ( ) - clipping < = forwardClipping | | a . length ( ) - forwardClipping = = 0 ) {
stillClipping = false ;
}
else if ( ref . length = = clipping ) {
if ( allowFullClip )
stillClipping = false ;
else
return - 1 ;
}
else if ( a . getBases ( ) [ a . length ( ) - clipping - 1 ] ! = ref [ ref . length - clipping - 1 ] ) {
stillClipping = false ;
}
}
if ( stillClipping )
clipping + + ;
}
return clipping ;
}
2010-10-08 12:37:54 +08:00
/ * *
2010-02-05 21:33:27 +08:00
* A simple but common wrapper for matching VariantContext objects using JEXL expressions
* /
public static class JexlVCMatchExp {
2010-02-02 22:18:46 +08:00
public String name ;
public Expression exp ;
2010-01-31 04:51:24 +08:00
2010-02-05 21:33:27 +08:00
/ * *
* Create a new matcher expression with name and JEXL expression exp
2010-03-05 05:28:17 +08:00
* @param name name
* @param exp expression
2010-02-05 21:33:27 +08:00
* /
public JexlVCMatchExp ( String name , Expression exp ) {
2010-01-31 04:51:24 +08:00
this . name = name ;
this . exp = exp ;
}
}
2010-02-05 21:33:27 +08:00
/ * *
* Method for creating JexlVCMatchExp from input walker arguments names and exps . These two arrays contain
* the name associated with each JEXL expression . initializeMatchExps will parse each expression and return
* a list of JexlVCMatchExp , in order , that correspond to the names and exps . These are suitable input to
* match ( ) below .
*
2010-03-05 05:28:17 +08:00
* @param names names
* @param exps expressions
* @return list of matches
2010-02-05 21:33:27 +08:00
* /
public static List < JexlVCMatchExp > initializeMatchExps ( String [ ] names , String [ ] exps ) {
2010-02-02 22:18:46 +08:00
if ( names = = null | | exps = = null )
2012-12-19 03:56:48 +08:00
throw new IllegalArgumentException ( "BUG: neither names nor exps can be null: names " + Arrays . toString ( names ) + " exps=" + Arrays . toString ( exps ) ) ;
2010-02-02 22:18:46 +08:00
if ( names . length ! = exps . length )
2012-12-19 03:56:48 +08:00
throw new IllegalArgumentException ( "Inconsistent number of provided filter names and expressions: names=" + Arrays . toString ( names ) + " exps=" + Arrays . toString ( exps ) ) ;
2010-02-02 22:18:46 +08:00
Map < String , String > map = new HashMap < String , String > ( ) ;
for ( int i = 0 ; i < names . length ; i + + ) { map . put ( names [ i ] , exps [ i ] ) ; }
return VariantContextUtils . initializeMatchExps ( map ) ;
}
2010-07-30 22:16:43 +08:00
public static List < JexlVCMatchExp > initializeMatchExps ( ArrayList < String > names , ArrayList < String > exps ) {
String [ ] nameArray = new String [ names . size ( ) ] ;
String [ ] expArray = new String [ exps . size ( ) ] ;
return initializeMatchExps ( names . toArray ( nameArray ) , exps . toArray ( expArray ) ) ;
}
2010-02-05 21:33:27 +08:00
/ * *
* Method for creating JexlVCMatchExp from input walker arguments mapping from names to exps . These two arrays contain
* the name associated with each JEXL expression . initializeMatchExps will parse each expression and return
* a list of JexlVCMatchExp , in order , that correspond to the names and exps . These are suitable input to
* match ( ) below .
*
2010-03-05 05:28:17 +08:00
* @param names_and_exps mapping of names to expressions
* @return list of matches
2010-02-05 21:33:27 +08:00
* /
public static List < JexlVCMatchExp > initializeMatchExps ( Map < String , String > names_and_exps ) {
List < JexlVCMatchExp > exps = new ArrayList < JexlVCMatchExp > ( ) ;
2010-01-31 04:51:24 +08:00
for ( Map . Entry < String , String > elt : names_and_exps . entrySet ( ) ) {
String name = elt . getKey ( ) ;
String expStr = elt . getValue ( ) ;
if ( name = = null | | expStr = = null ) throw new IllegalArgumentException ( "Cannot create null expressions : " + name + " " + expStr ) ;
try {
2010-06-07 08:33:02 +08:00
Expression exp = engine . createExpression ( expStr ) ;
2010-02-05 21:33:27 +08:00
exps . add ( new JexlVCMatchExp ( name , exp ) ) ;
2010-01-31 04:51:24 +08:00
} catch ( Exception e ) {
2012-12-19 03:56:48 +08:00
throw new IllegalArgumentException ( "Argument " + name + "has a bad value. Invalid expression used (" + expStr + "). Please see the JEXL docs for correct syntax." ) ;
2010-01-31 04:51:24 +08:00
}
}
return exps ;
}
2010-02-05 21:33:27 +08:00
/ * *
* Returns true if exp match VC . See collection < > version for full docs .
2010-03-05 05:28:17 +08:00
* @param vc variant context
* @param exp expression
* @return true if there is a match
2010-02-05 21:33:27 +08:00
* /
2010-11-18 00:08:16 +08:00
public static boolean match ( VariantContext vc , JexlVCMatchExp exp ) {
return match ( vc , Arrays . asList ( exp ) ) . get ( exp ) ;
2010-02-02 22:18:46 +08:00
}
2010-02-05 21:33:27 +08:00
/ * *
* Matches each JexlVCMatchExp exp against the data contained in vc , and returns a map from these
* expressions to true ( if they matched ) or false ( if they didn ' t ) . This the best way to apply JEXL
* expressions to VariantContext records . Use initializeMatchExps ( ) to create the list of JexlVCMatchExp
* expressions .
*
2010-03-05 05:28:17 +08:00
* @param vc variant context
* @param exps expressions
* @return true if there is a match
2010-02-05 21:33:27 +08:00
* /
2010-11-18 00:08:16 +08:00
public static Map < JexlVCMatchExp , Boolean > match ( VariantContext vc , Collection < JexlVCMatchExp > exps ) {
return new JEXLMap ( exps , vc ) ;
2010-02-02 22:18:46 +08:00
}
2010-03-23 04:45:11 +08:00
/ * *
2010-07-11 12:30:15 +08:00
* Returns true if exp match VC / g . See collection < > version for full docs .
* @param vc variant context
2010-03-23 04:45:11 +08:00
* @param g genotype
* @param exp expression
* @return true if there is a match
* /
2010-11-18 00:08:16 +08:00
public static boolean match ( VariantContext vc , Genotype g , JexlVCMatchExp exp ) {
return match ( vc , g , Arrays . asList ( exp ) ) . get ( exp ) ;
2010-03-23 04:45:11 +08:00
}
/ * *
2010-07-11 12:30:15 +08:00
* Matches each JexlVCMatchExp exp against the data contained in vc / g , and returns a map from these
2010-03-23 04:45:11 +08:00
* expressions to true ( if they matched ) or false ( if they didn ' t ) . This the best way to apply JEXL
2010-07-11 12:30:15 +08:00
* expressions to VariantContext records / genotypes . Use initializeMatchExps ( ) to create the list of JexlVCMatchExp
2010-03-23 04:45:11 +08:00
* expressions .
*
2010-07-11 12:30:15 +08:00
* @param vc variant context
2010-03-23 04:45:11 +08:00
* @param g genotype
* @param exps expressions
* @return true if there is a match
* /
2010-11-18 00:08:16 +08:00
public static Map < JexlVCMatchExp , Boolean > match ( VariantContext vc , Genotype g , Collection < JexlVCMatchExp > exps ) {
return new JEXLMap ( exps , vc , g ) ;
2010-02-02 22:18:46 +08:00
}
2010-06-05 07:03:55 +08:00
public static double computeHardyWeinbergPvalue ( VariantContext vc ) {
2011-11-19 01:39:10 +08:00
if ( vc . getCalledChrCount ( ) = = 0 )
2010-06-05 07:03:55 +08:00
return 0.0 ;
return HardyWeinbergCalculation . hwCalculate ( vc . getHomRefCount ( ) , vc . getHetCount ( ) , vc . getHomVarCount ( ) ) ;
}
2010-01-26 21:53:29 +08:00
2011-06-08 22:31:46 +08:00
/ * *
* Returns a newly allocated VC that is the same as VC , but without genotypes
2011-07-19 00:25:54 +08:00
* @param vc variant context
* @return new VC without genotypes
2011-06-08 22:31:46 +08:00
* /
@Requires ( "vc != null" )
@Ensures ( "result != null" )
public static VariantContext sitesOnlyVariantContext ( VariantContext vc ) {
2011-11-19 00:06:15 +08:00
return new VariantContextBuilder ( vc ) . noGenotypes ( ) . make ( ) ;
2011-06-08 22:31:46 +08:00
}
/ * *
* Returns a newly allocated list of VC , where each VC is the same as the input VCs , but without genotypes
2011-07-19 00:25:54 +08:00
* @param vcs collection of VCs
* @return new VCs without genotypes
2011-06-08 22:31:46 +08:00
* /
@Requires ( "vcs != null" )
@Ensures ( "result != null" )
public static Collection < VariantContext > sitesOnlyVariantContexts ( Collection < VariantContext > vcs ) {
List < VariantContext > r = new ArrayList < VariantContext > ( ) ;
for ( VariantContext vc : vcs )
r . add ( sitesOnlyVariantContext ( vc ) ) ;
return r ;
}
2011-11-15 03:28:52 +08:00
private final static Map < String , Object > subsetAttributes ( final CommonInfo igc , final Collection < String > keysToPreserve ) {
2011-11-15 02:16:36 +08:00
Map < String , Object > attributes = new HashMap < String , Object > ( keysToPreserve . size ( ) ) ;
for ( final String key : keysToPreserve ) {
if ( igc . hasAttribute ( key ) )
attributes . put ( key , igc . getAttribute ( key ) ) ;
2010-08-06 06:39:51 +08:00
}
2011-11-15 02:16:36 +08:00
return attributes ;
}
2011-11-20 07:40:15 +08:00
/ * *
* @deprecated use variant context builder version instead
2012-07-26 13:50:39 +08:00
* @param vc the variant context
* @param keysToPreserve the keys to preserve
* @return a pruned version of the original variant context
2011-11-20 07:40:15 +08:00
* /
@Deprecated
2011-11-15 02:16:36 +08:00
public static VariantContext pruneVariantContext ( final VariantContext vc , Collection < String > keysToPreserve ) {
2011-11-20 07:40:15 +08:00
return pruneVariantContext ( new VariantContextBuilder ( vc ) , keysToPreserve ) . make ( ) ;
}
public static VariantContextBuilder pruneVariantContext ( final VariantContextBuilder builder , Collection < String > keysToPreserve ) {
final VariantContext vc = builder . make ( ) ;
2011-11-15 02:16:36 +08:00
if ( keysToPreserve = = null ) keysToPreserve = Collections . emptyList ( ) ;
// VC info
final Map < String , Object > attributes = subsetAttributes ( vc . commonInfo , keysToPreserve ) ;
2010-08-06 06:39:51 +08:00
2011-11-15 02:16:36 +08:00
// Genotypes
2011-11-17 05:24:05 +08:00
final GenotypesContext genotypes = GenotypesContext . create ( vc . getNSamples ( ) ) ;
2011-11-15 06:42:55 +08:00
for ( final Genotype g : vc . getGenotypes ( ) ) {
2012-06-10 22:53:51 +08:00
final GenotypeBuilder gb = new GenotypeBuilder ( g ) ;
// remove AD, DP, PL, and all extended attributes, keeping just GT and GQ
gb . noAD ( ) . noDP ( ) . noPL ( ) . noAttributes ( ) ;
genotypes . add ( gb . make ( ) ) ;
2010-08-06 06:39:51 +08:00
}
2011-11-20 07:40:15 +08:00
return builder . genotypes ( genotypes ) . attributes ( attributes ) ;
2010-08-06 06:39:51 +08:00
}
2010-07-09 04:09:48 +08:00
/**
 * Strategies for resolving genotypes of samples that appear in more than one input call set.
 */
public enum GenotypeMergeType {
    /**
     * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD.
     */
    UNIQUIFY,
    /**
     * Take genotypes in priority order (see the priority argument).
     */
    PRIORITIZE,
    /**
     * Take the genotypes in any order.
     */
    UNSORTED,
    /**
     * Require that all samples/genotypes be unique between all inputs.
     */
    REQUIRE_UNIQUE
}
2011-05-24 09:54:29 +08:00
/**
 * Strategies for deciding whether a merged record survives when some inputs are filtered.
 */
public enum FilteredRecordMergeType {
    /**
     * Union - leaves the record if any record is unfiltered.
     */
    KEEP_IF_ANY_UNFILTERED,
    /**
     * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this.
     */
    KEEP_IF_ALL_UNFILTERED,
    /**
     * If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset.
     */
    KEEP_UNCONDITIONAL
}
2012-01-26 05:10:59 +08:00
/**
 * Strategies for combining records with multiple allele types at the same start position.
 */
public enum MultipleAllelesMergeType {
    /**
     * Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record.
     */
    BY_TYPE,
    /**
     * Merge all allele types at the same start position into the same VCF record.
     */
    MIX_TYPES
}
2011-07-19 00:25:54 +08:00
/ * *
* Merges VariantContexts into a single hybrid . Takes genotypes for common samples in priority order , if provided .
2012-03-15 00:05:05 +08:00
* If uniquifySamples is true , the priority order is ignored and names are created by concatenating the VC name with
2011-07-19 00:25:54 +08:00
* the sample name
*
* @param unsortedVCs collection of unsorted VCs
* @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
* @param filteredRecordMergeType merge type for filtered records
* @param genotypeMergeOptions merge option for genotypes
* @param annotateOrigin should we annotate the set it came from ?
* @param printMessages should we print messages ?
* @param setKey the key name of the set
* @param filteredAreUncalled are filtered records uncalled ?
* @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count ?
2011-09-21 22:15:05 +08:00
* @return new VariantContext representing the merge of unsortedVCs
2011-07-19 00:25:54 +08:00
* /
2012-12-19 03:56:48 +08:00
public static VariantContext simpleMerge ( final Collection < VariantContext > unsortedVCs ,
2011-09-21 22:15:05 +08:00
final List < String > priorityListOfVCs ,
final FilteredRecordMergeType filteredRecordMergeType ,
final GenotypeMergeType genotypeMergeOptions ,
final boolean annotateOrigin ,
final boolean printMessages ,
final String setKey ,
final boolean filteredAreUncalled ,
final boolean mergeInfoWithMaxAC ) {
2012-12-11 11:23:58 +08:00
int originalNumOfVCs = priorityListOfVCs = = null ? 0 : priorityListOfVCs . size ( ) ;
2012-12-19 03:56:48 +08:00
return simpleMerge ( unsortedVCs , priorityListOfVCs , originalNumOfVCs , filteredRecordMergeType , genotypeMergeOptions , annotateOrigin , printMessages , setKey , filteredAreUncalled , mergeInfoWithMaxAC ) ;
2012-12-11 11:23:58 +08:00
}
/**
 * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority
 * order, if provided. If uniquifySamples is true, the priority order is ignored and names are
 * created by concatenating the VC name with the sample name.
 * simpleMerge does not verify uniqueness of sample names EVEN if
 * genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use
 * SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge.
 *
 * @param unsortedVCs             collection of unsorted VCs
 * @param priorityListOfVCs       priority list detailing the order in which we should grab the VCs
 * @param originalNumOfVCs        number of original VariantContexts (must equal the priority list size when that list is given)
 * @param filteredRecordMergeType merge type for filtered records
 * @param genotypeMergeOptions    merge option for genotypes
 * @param annotateOrigin          should we annotate the set it came from?
 * @param printMessages           should we print messages?
 * @param setKey                  the key name of the set
 * @param filteredAreUncalled     are filtered records uncalled?
 * @param mergeInfoWithMaxAC      should we merge in info from the VC with maximum allele count?
 * @return new VariantContext representing the merge of unsortedVCs, or null if there is nothing to merge
 */
public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs,
                                         final List<String> priorityListOfVCs,
                                         final int originalNumOfVCs,
                                         final FilteredRecordMergeType filteredRecordMergeType,
                                         final GenotypeMergeType genotypeMergeOptions,
                                         final boolean annotateOrigin,
                                         final boolean printMessages,
                                         final String setKey,
                                         final boolean filteredAreUncalled,
                                         final boolean mergeInfoWithMaxAC) {
    if ( unsortedVCs == null || unsortedVCs.size() == 0 )
        return null;

    if ( priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size() )
        throw new IllegalArgumentException("the number of the original VariantContexts must be the same as the number of VariantContexts in the priority list");

    if ( annotateOrigin && priorityListOfVCs == null && originalNumOfVCs == 0 )
        throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts or the number of original VariantContexts");

    final List<VariantContext> preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions);
    // Make sure all variant contexts are padded with reference base in case of indels if necessary
    final List<VariantContext> VCs = new ArrayList<VariantContext>();

    // Optionally drop filtered records entirely before merging.
    for ( final VariantContext vc : preFilteredVCs ) {
        if ( ! filteredAreUncalled || vc.isNotFiltered() )
            VCs.add(vc);
    }
    if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled
        return null;

    // establish the baseline info from the first VC
    final VariantContext first = VCs.get(0);
    final String name = first.getSource();
    final Allele refAllele = determineReferenceAllele(VCs);

    // LinkedHash* containers preserve encounter order so the merged record is deterministic.
    final Set<Allele> alleles = new LinkedHashSet<Allele>();
    final Set<String> filters = new HashSet<String>();
    final Map<String, Object> attributes = new LinkedHashMap<String, Object>();
    final Set<String> inconsistentAttributes = new HashSet<String>();
    final Set<String> variantSources = new HashSet<String>(); // contains the set of sources we found in our set of VCs that are variant
    final Set<String> rsIDs = new LinkedHashSet<String>(1); // most of the time there's one id

    VariantContext longestVC = first;
    int depth = 0;                 // running sum of per-VC DP values
    int maxAC = -1;                // largest AC seen so far (only tracked when mergeInfoWithMaxAC)
    final Map<String, Object> attributesWithMaxAC = new LinkedHashMap<String, Object>();
    double log10PError = CommonInfo.NO_LOG10_PERROR;
    VariantContext vcWithMaxAC = null;
    GenotypesContext genotypes = GenotypesContext.create();

    // counting the number of filtered and variant VCs
    int nFiltered = 0;

    boolean remapped = false;

    // cycle through and add info from the other VCs, making sure the loc/reference matches
    for ( final VariantContext vc : VCs ) {
        if ( longestVC.getStart() != vc.getStart() )
            throw new IllegalStateException("BUG: attempting to merge VariantContexts with different start sites: first=" + first.toString() + " second=" + vc.toString());

        if ( getSize(vc) > getSize(longestVC) )
            longestVC = vc; // get the longest location

        nFiltered += vc.isFiltered() ? 1 : 0;
        if ( vc.isVariant() ) variantSources.add(vc.getSource());

        // Rewrite this VC's alleles onto the common (longest) reference allele when needed.
        AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles);
        remapped = remapped || alleleMapping.needsRemapping();
        alleles.addAll(alleleMapping.values());

        mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY);

        // We always take the QUAL of the first VC with a non-MISSING qual for the combined value
        if ( log10PError == CommonInfo.NO_LOG10_PERROR )
            log10PError = vc.getLog10PError();

        filters.addAll(vc.getFilters());

        //
        // add attributes
        //
        // special case DP (add it up) and ID (just preserve it)
        //
        if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) )
            depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0);
        if ( vc.hasID() ) rsIDs.add(vc.getID());
        if ( mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) {
            String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null);
            // lets see if the string contains a , separator
            if ( rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR) ) {
                // Multi-allelic AC: strip the surrounding brackets and examine every entry.
                List<String> alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR));
                for ( String alleleCount : alleleCountArray ) {
                    final int ac = Integer.valueOf(alleleCount.trim());
                    if ( ac > maxAC ) {
                        maxAC = ac;
                        vcWithMaxAC = vc;
                    }
                }
            } else {
                final int ac = Integer.valueOf(rawAlleleCounts);
                if ( ac > maxAC ) {
                    maxAC = ac;
                    vcWithMaxAC = vc;
                }
            }
        }

        // Merge the remaining INFO attributes; keys with conflicting values across VCs
        // are dropped entirely and blacklisted for the rest of the merge.
        for ( final Map.Entry<String, Object> p : vc.getAttributes().entrySet() ) {
            String key = p.getKey();
            // if we don't like the key already, don't go anywhere
            if ( ! inconsistentAttributes.contains(key) ) {
                final boolean alreadyFound = attributes.containsKey(key);
                final Object boundValue = attributes.get(key);
                final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4);

                if ( alreadyFound && ! boundValue.equals(p.getValue()) && ! boundIsMissingValue ) {
                    // we found the value but we're inconsistent, put it in the exclude list
                    //System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue, p.getValue());
                    inconsistentAttributes.add(key);
                    attributes.remove(key);
                } else if ( ! alreadyFound || boundIsMissingValue ) { // no value
                    //if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(), p.getValue());
                    attributes.put(key, p.getValue());
                }
            }
        }
    }

    // if we have more alternate alleles in the merged VC than in one or more of the
    // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD
    for ( final VariantContext vc : VCs ) {
        if ( vc.alleles.size() == 1 )
            continue;
        if ( hasPLIncompatibleAlleles(alleles, vc.alleles) ) {
            if ( GeneralUtils.DEBUG_MODE_ENABLED && ! genotypes.isEmpty() ) {
                System.err.println(String.format("Stripping PLs at %s:%d-%d due to incompatible alleles merged=%s vs. single=%s",
                        vc.getChr(), vc.getStart(), vc.getEnd(), alleles, vc.alleles));
            }
            genotypes = stripPLsAndAD(genotypes);
            // this will remove stale AC,AF attributed from vc
            calculateChromosomeCounts(vc, attributes, true);
            break;
        }
    }

    // take the VC with the maxAC and pull the attributes into a modifiable map
    if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) {
        attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes());
    }

    // if at least one record was unfiltered and we want a union, clear all of the filters
    if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL )
        filters.clear();

    if ( annotateOrigin ) { // we care about where the call came from
        String setValue;
        if ( nFiltered == 0 && variantSources.size() == originalNumOfVCs ) // every input was variant and unfiltered => intersection
            setValue = MERGE_INTERSECTION;
        else if ( nFiltered == VCs.size() ) // everything was filtered out
            setValue = MERGE_FILTER_IN_ALL;
        else if ( variantSources.isEmpty() ) // everyone was reference
            setValue = MERGE_REF_IN_ALL;
        else {
            // Mixed case: list the variant sources, prefixing filtered ones.
            final LinkedHashSet<String> s = new LinkedHashSet<String>();
            for ( final VariantContext vc : VCs )
                if ( vc.isVariant() )
                    s.add(vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource());
            setValue = GeneralUtils.join("-", s);
        }

        if ( setKey != null ) {
            attributes.put(setKey, setValue);
            if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) {
                attributesWithMaxAC.put(setKey, setValue);
            }
        }
    }

    if ( depth > 0 )
        attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth));

    final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : GeneralUtils.join(",", rsIDs);

    final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID);
    builder.loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd());
    builder.alleles(alleles);
    builder.genotypes(genotypes);
    builder.log10PError(log10PError);
    builder.filters(filters.isEmpty() ? filters : new TreeSet<String>(filters));
    builder.attributes(new TreeMap<String, Object>(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes));

    // Trim the padded bases of all alleles if necessary
    final VariantContext merged = builder.make();
    if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged);
    return merged;
}
2010-07-01 04:13:03 +08:00
2011-10-10 02:45:55 +08:00
private static final boolean hasPLIncompatibleAlleles ( final Collection < Allele > alleleSet1 , final Collection < Allele > alleleSet2 ) {
2011-10-09 08:39:53 +08:00
final Iterator < Allele > it1 = alleleSet1 . iterator ( ) ;
final Iterator < Allele > it2 = alleleSet2 . iterator ( ) ;
while ( it1 . hasNext ( ) & & it2 . hasNext ( ) ) {
final Allele a1 = it1 . next ( ) ;
final Allele a2 = it2 . next ( ) ;
if ( ! a1 . equals ( a2 ) )
return true ;
}
// by this point, at least one of the iterators is empty. All of the elements
// we've compared are equal up until this point. But it's possible that the
// sets aren't the same size, which is indicated by the test below. If they
// are of the same size, though, the sets are compatible
return it1 . hasNext ( ) | | it2 . hasNext ( ) ;
}
2011-09-15 22:22:28 +08:00
/**
 * Tests whether the alleles of {@code vc1} are a subset of the alleles of {@code vc2}:
 * the reference alleles must match and every alternate allele of {@code vc1} must
 * appear among {@code vc2}'s alternates.
 *
 * @param vc1 candidate subset VC
 * @param vc2 candidate superset VC
 * @return true when vc1's alleles are contained in vc2's
 */
public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) {
    if ( ! vc1.getReference().equals(vc2.getReference()) )
        return false;
    return vc2.getAlternateAlleles().containsAll(vc1.getAlternateAlleles());
}
2011-11-19 10:07:30 +08:00
2012-10-07 11:02:36 +08:00
public static GenotypesContext stripPLsAndAD ( GenotypesContext genotypes ) {
2011-11-17 05:24:05 +08:00
GenotypesContext newGs = GenotypesContext . create ( genotypes . size ( ) ) ;
2011-08-09 04:25:35 +08:00
2011-11-15 06:42:55 +08:00
for ( final Genotype g : genotypes ) {
2012-10-07 11:02:36 +08:00
newGs . add ( removePLsAndAD ( g ) ) ;
2011-08-09 04:25:35 +08:00
}
return newGs ;
}
2011-07-19 01:42:45 +08:00
/**
 * Partitions the input VCs by VariantContext.Type, with a twist: when the alleles of a VC of
 * one type are a subset of a VC of another type, the two are grouped together under the
 * superset VC's type rather than kept apart.
 *
 * @param VCs variant contexts to partition (all assumed to be at the same site)
 * @return map from type to the list of VCs grouped under that type
 */
public static Map<VariantContext.Type, List<VariantContext>> separateVariantContextsByType(Collection<VariantContext> VCs) {
    HashMap<VariantContext.Type, List<VariantContext>> mappedVCs = new HashMap<VariantContext.Type, List<VariantContext>>();
    for (VariantContext vc : VCs) {

        // look at previous variant contexts of different type. If:
        // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list
        // b) vc has alleles which are subset of otherVC. Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list)
        // c) neither: do nothing, just add vc to its own list
        boolean addtoOwnList = true;
        for (VariantContext.Type type : VariantContext.Type.values()) {
            if (type.equals(vc.getType()))
                continue;

            if (!mappedVCs.containsKey(type))
                continue;

            List<VariantContext> vcList = mappedVCs.get(type);
            for (int k = 0; k < vcList.size(); k++) {
                VariantContext otherVC = vcList.get(k);
                if (allelesAreSubset(otherVC, vc)) {
                    // otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list
                    // (safe to remove by index here because we break out of the loop immediately below)
                    vcList.remove(k);
                    // avoid having empty lists
                    if (vcList.size() == 0)
                        mappedVCs.remove(type);
                    if ( !mappedVCs.containsKey(vc.getType()) )
                        mappedVCs.put(vc.getType(), new ArrayList<VariantContext>());
                    mappedVCs.get(vc.getType()).add(otherVC);
                    break;
                }
                else if (allelesAreSubset(vc, otherVC)) {
                    // vc has a type different than otherVC and its alleles are a subset of VC: add vc to otherVC's type list and don't add to its own
                    mappedVCs.get(type).add(vc);
                    addtoOwnList = false;
                    break;
                }
            }
        }
        // Default case: vc starts (or joins) the list for its own type.
        if (addtoOwnList) {
            if ( !mappedVCs.containsKey(vc.getType()) )
                mappedVCs.put(vc.getType(), new ArrayList<VariantContext>());
            mappedVCs.get(vc.getType()).add(vc);
        }
    }
    return mappedVCs;
}
2010-07-02 21:32:33 +08:00
private static class AlleleMapper {
private VariantContext vc = null ;
private Map < Allele , Allele > map = null ;
public AlleleMapper ( VariantContext vc ) { this . vc = vc ; }
public AlleleMapper ( Map < Allele , Allele > map ) { this . map = map ; }
public boolean needsRemapping ( ) { return this . map ! = null ; }
public Collection < Allele > values ( ) { return map ! = null ? map . values ( ) : vc . getAlleles ( ) ; }
public Allele remap ( Allele a ) { return map ! = null & & map . containsKey ( a ) ? map . get ( a ) : a ; }
public List < Allele > remap ( List < Allele > as ) {
List < Allele > newAs = new ArrayList < Allele > ( ) ;
for ( Allele a : as ) {
//System.out.printf(" Remapping %s => %s%n", a, remap(a));
newAs . add ( remap ( a ) ) ;
}
return newAs ;
}
}
2013-01-26 04:49:51 +08:00
// TODO: remove that after testing
// static private void verifyUniqueSampleNames(Collection<VariantContext> unsortedVCs) {
// Set<String> names = new HashSet<String>();
// for ( VariantContext vc : unsortedVCs ) {
// for ( String name : vc.getSampleNames() ) {
// //System.out.printf("Checking %s %b%n", name, names.contains(name));
// if ( names.contains(name) )
// throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered " + name);
// }
//
// names.addAll(vc.getSampleNames());
// }
// }
2010-07-10 04:18:37 +08:00
2010-07-02 21:32:33 +08:00
static private Allele determineReferenceAllele ( List < VariantContext > VCs ) {
Allele ref = null ;
for ( VariantContext vc : VCs ) {
Allele myRef = vc . getReference ( ) ;
if ( ref = = null | | ref . length ( ) < myRef . length ( ) )
ref = myRef ;
else if ( ref . length ( ) = = myRef . length ( ) & & ! ref . equals ( myRef ) )
2012-12-19 03:56:48 +08:00
throw new TribbleException ( String . format ( "The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s" , vc . getChr ( ) , vc . getStart ( ) , ref , myRef ) ) ;
2010-07-02 21:32:33 +08:00
}
return ref ;
}
/**
 * Builds an AlleleMapper translating vc's alleles onto the common reference allele.
 * When vc already uses {@code refAllele} the mapper is an identity mapping; otherwise
 * every allele of vc is extended with the extra reference bases so that its reference
 * becomes {@code refAllele}.
 *
 * @param refAllele  the longest reference allele seen at this start site
 * @param vc         the VC whose alleles may need extension
 * @param allAlleles alleles already accumulated by the merge; extended alleles equal to
 *                   one of these are canonicalized to the existing instance
 * @return a mapper suitable for rewriting vc's alleles and genotypes
 */
static private AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set<Allele> allAlleles) {
    if ( refAllele.equals(vc.getReference()) )
        return new AlleleMapper(vc);
    else {
        // we really need to do some work.  The refAllele is the longest reference allele seen at this
        // start site.  So imagine it is:
        //
        // refAllele: ACGTGA
        // myRef:     ACGT
        // myAlt:     A
        //
        // We need to remap all of the alleles in vc to include the extra GA so that
        // myRef => refAllele and myAlt => AGA
        //
        Allele myRef = vc.getReference();
        // Sanity check: refAllele must be strictly longer than myRef here (equality would have
        // taken the branch above only if the alleles were equal; NOTE(review): the message below
        // also fires for the equal-length-but-different-bases case — confirm that is intended).
        if ( refAllele.length() <= myRef.length() ) throw new IllegalStateException("BUG: myRef=" + myRef + " is longer than refAllele=" + refAllele);
        // The reference bases that myRef is missing relative to refAllele.
        byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length());

//        System.out.printf("Remapping allele at %s%n", vc);
//        System.out.printf("ref   %s%n", refAllele);
//        System.out.printf("myref %s%n", myRef );
//        System.out.printf("extrabases %s%n", new String(extraBases));

        Map<Allele, Allele> map = new HashMap<Allele, Allele>();
        for ( Allele a : vc.getAlleles() ) {
            if ( a.isReference() )
                map.put(a, refAllele);
            else {
                Allele extended = Allele.extend(a, extraBases);
                // Canonicalize to an already-seen equal allele so identity comparisons hold downstream.
                for ( Allele b : allAlleles )
                    if ( extended.equals(b) )
                        extended = b;
//                System.out.printf("  Extending %s => %s%n", a, extended);
                map.put(a, extended);
            }
        }

        // debugging
//        System.out.printf("mapping %s%n", map);

        return new AlleleMapper(map);
    }
}
2010-07-19 00:29:59 +08:00
static class CompareByPriority implements Comparator < VariantContext > , Serializable {
2010-07-01 04:13:03 +08:00
List < String > priorityListOfVCs ;
public CompareByPriority ( List < String > priorityListOfVCs ) {
this . priorityListOfVCs = priorityListOfVCs ;
}
private int getIndex ( VariantContext vc ) {
2010-10-27 10:21:41 +08:00
int i = priorityListOfVCs . indexOf ( vc . getSource ( ) ) ;
2012-12-19 03:56:48 +08:00
if ( i = = - 1 ) throw new IllegalArgumentException ( "Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc . getSource ( ) ) ;
2010-07-01 04:13:03 +08:00
return i ;
}
public int compare ( VariantContext vc1 , VariantContext vc2 ) {
2010-07-19 00:29:59 +08:00
return Integer . valueOf ( getIndex ( vc1 ) ) . compareTo ( getIndex ( vc2 ) ) ;
2010-07-01 04:13:03 +08:00
}
}
2010-07-09 04:09:48 +08:00
public static List < VariantContext > sortVariantContextsByPriority ( Collection < VariantContext > unsortedVCs , List < String > priorityListOfVCs , GenotypeMergeType mergeOption ) {
if ( mergeOption = = GenotypeMergeType . PRIORITIZE & & priorityListOfVCs = = null )
2010-07-01 04:13:03 +08:00
throw new IllegalArgumentException ( "Cannot merge calls by priority with a null priority list" ) ;
2012-12-10 12:40:03 +08:00
if ( priorityListOfVCs = = null | | mergeOption = = GenotypeMergeType . UNSORTED )
2010-07-01 04:13:03 +08:00
return new ArrayList < VariantContext > ( unsortedVCs ) ;
else {
ArrayList < VariantContext > sorted = new ArrayList < VariantContext > ( unsortedVCs ) ;
Collections . sort ( sorted , new CompareByPriority ( priorityListOfVCs ) ) ;
return sorted ;
}
}
2011-11-22 21:40:48 +08:00
private static void mergeGenotypes ( GenotypesContext mergedGenotypes , VariantContext oneVC , AlleleMapper alleleMapping , boolean uniqifySamples ) {
2013-01-26 00:47:30 +08:00
//TODO: should we add a check for cases when the genotypeMergeOption is REQUIRE_UNIQUE
2011-11-15 06:42:55 +08:00
for ( Genotype g : oneVC . getGenotypes ( ) ) {
2010-10-27 10:21:41 +08:00
String name = mergedSampleName ( oneVC . getSource ( ) , g . getSampleName ( ) , uniqifySamples ) ;
2011-11-22 23:16:36 +08:00
if ( ! mergedGenotypes . containsSample ( name ) ) {
2010-07-01 04:13:03 +08:00
// only add if the name is new
2010-07-02 21:32:33 +08:00
Genotype newG = g ;
if ( uniqifySamples | | alleleMapping . needsRemapping ( ) ) {
2011-11-15 02:16:36 +08:00
final List < Allele > alleles = alleleMapping . needsRemapping ( ) ? alleleMapping . remap ( g . getAlleles ( ) ) : g . getAlleles ( ) ;
2012-06-02 07:25:11 +08:00
newG = new GenotypeBuilder ( g ) . name ( name ) . alleles ( alleles ) . make ( ) ;
2010-07-02 21:32:33 +08:00
}
2011-11-15 06:42:55 +08:00
mergedGenotypes . add ( newG ) ;
2010-07-01 04:13:03 +08:00
}
}
}
public static String mergedSampleName ( String trackName , String sampleName , boolean uniqify ) {
return uniqify ? sampleName + "." + trackName : sampleName ;
}
2010-07-12 21:57:39 +08:00
2010-10-26 04:03:03 +08:00
/ * *
* Returns a context identical to this with the REF and ALT alleles reverse complemented .
*
* @param vc variant context
* @return new vc
* /
public static VariantContext reverseComplement ( VariantContext vc ) {
// create a mapping from original allele to reverse complemented allele
HashMap < Allele , Allele > alleleMap = new HashMap < Allele , Allele > ( vc . getAlleles ( ) . size ( ) ) ;
for ( Allele originalAllele : vc . getAlleles ( ) ) {
Allele newAllele ;
2012-07-26 13:50:39 +08:00
if ( originalAllele . isNoCall ( ) )
2010-10-26 04:03:03 +08:00
newAllele = originalAllele ;
else
newAllele = Allele . create ( BaseUtils . simpleReverseComplement ( originalAllele . getBases ( ) ) , originalAllele . isReference ( ) ) ;
alleleMap . put ( originalAllele , newAllele ) ;
}
// create new Genotype objects
2011-11-17 05:24:05 +08:00
GenotypesContext newGenotypes = GenotypesContext . create ( vc . getNSamples ( ) ) ;
2011-11-15 06:42:55 +08:00
for ( final Genotype genotype : vc . getGenotypes ( ) ) {
2010-10-26 04:03:03 +08:00
List < Allele > newAlleles = new ArrayList < Allele > ( ) ;
2011-11-15 06:42:55 +08:00
for ( Allele allele : genotype . getAlleles ( ) ) {
2010-10-26 04:03:03 +08:00
Allele newAllele = alleleMap . get ( allele ) ;
if ( newAllele = = null )
newAllele = Allele . NO_CALL ;
newAlleles . add ( newAllele ) ;
}
2012-06-02 07:25:11 +08:00
newGenotypes . add ( new GenotypeBuilder ( genotype ) . alleles ( newAlleles ) . make ( ) ) ;
2010-10-26 04:03:03 +08:00
}
2011-11-19 10:07:30 +08:00
return new VariantContextBuilder ( vc ) . alleles ( alleleMap . values ( ) ) . genotypes ( newGenotypes ) . make ( ) ;
2010-10-26 04:03:03 +08:00
}
2010-07-14 12:56:58 +08:00
public static VariantContext purgeUnallowedGenotypeAttributes ( VariantContext vc , Set < String > allowedAttributes ) {
if ( allowedAttributes = = null )
return vc ;
2011-11-17 05:24:05 +08:00
GenotypesContext newGenotypes = GenotypesContext . create ( vc . getNSamples ( ) ) ;
2011-11-15 06:42:55 +08:00
for ( final Genotype genotype : vc . getGenotypes ( ) ) {
2010-07-14 12:56:58 +08:00
Map < String , Object > attrs = new HashMap < String , Object > ( ) ;
2012-06-02 07:25:11 +08:00
for ( Map . Entry < String , Object > attr : genotype . getExtendedAttributes ( ) . entrySet ( ) ) {
2010-07-14 12:56:58 +08:00
if ( allowedAttributes . contains ( attr . getKey ( ) ) )
attrs . put ( attr . getKey ( ) , attr . getValue ( ) ) ;
}
2012-06-02 07:25:11 +08:00
newGenotypes . add ( new GenotypeBuilder ( genotype ) . attributes ( attrs ) . make ( ) ) ;
2010-07-14 12:56:58 +08:00
}
2011-11-19 00:06:15 +08:00
return new VariantContextBuilder ( vc ) . genotypes ( newGenotypes ) . make ( ) ;
2010-07-14 12:56:58 +08:00
}
2010-08-06 02:47:53 +08:00
public static BaseUtils . BaseSubstitutionType getSNPSubstitutionType ( VariantContext context ) {
if ( ! context . isSNP ( ) | | ! context . isBiallelic ( ) )
throw new IllegalStateException ( "Requested SNP substitution type for bialleic non-SNP " + context ) ;
return BaseUtils . SNPSubstitutionType ( context . getReference ( ) . getBases ( ) [ 0 ] , context . getAlternateAllele ( 0 ) . getBases ( ) [ 0 ] ) ;
}
/ * *
* If this is a BiAlleic SNP , is it a transition ?
* /
public static boolean isTransition ( VariantContext context ) {
return getSNPSubstitutionType ( context ) = = BaseUtils . BaseSubstitutionType . TRANSITION ;
}
/ * *
* If this is a BiAlleic SNP , is it a transversion ?
* /
public static boolean isTransversion ( VariantContext context ) {
return getSNPSubstitutionType ( context ) = = BaseUtils . BaseSubstitutionType . TRANSVERSION ;
}
2012-01-26 08:42:55 +08:00
public static boolean isTransition ( Allele ref , Allele alt ) {
return BaseUtils . SNPSubstitutionType ( ref . getBases ( ) [ 0 ] , alt . getBases ( ) [ 0 ] ) = = BaseUtils . BaseSubstitutionType . TRANSITION ;
}
public static boolean isTransversion ( Allele ref , Allele alt ) {
return BaseUtils . SNPSubstitutionType ( ref . getBases ( ) [ 0 ] , alt . getBases ( ) [ 0 ] ) = = BaseUtils . BaseSubstitutionType . TRANSVERSION ;
}
2012-12-19 03:56:48 +08:00
public static int getSize ( VariantContext vc ) {
return vc . getEnd ( ) - vc . getStart ( ) + 1 ;
2010-08-06 02:47:53 +08:00
}
2011-11-21 07:26:56 +08:00
public static final Set < String > genotypeNames ( final Collection < Genotype > genotypes ) {
final Set < String > names = new HashSet < String > ( genotypes . size ( ) ) ;
for ( final Genotype g : genotypes )
names . add ( g . getSampleName ( ) ) ;
return names ;
}
2012-03-29 23:07:37 +08:00
2012-06-30 01:55:52 +08:00
/ * *
* Compute the end position for this VariantContext from the alleles themselves
*
* In the trivial case this is a single BP event and end = start ( open intervals )
* In general the end is start + ref length - 1 , handling the case where ref length = = 0
* However , if alleles contains a symbolic allele then we use endForSymbolicAllele in all cases
*
* @param alleles the list of alleles to consider . The reference allele must be the first one
* @param start the known start position of this event
* @param endForSymbolicAlleles the end position to use if any of the alleles is symbolic . Can be - 1
* if no is expected but will throw an error if one is found
* @return this builder
* /
@Requires ( { "! alleles.isEmpty()" , "start > 0" , "endForSymbolicAlleles == -1 || endForSymbolicAlleles > 0" } )
public static int computeEndFromAlleles ( final List < Allele > alleles , final int start , final int endForSymbolicAlleles ) {
final Allele ref = alleles . get ( 0 ) ;
if ( ref . isNonReference ( ) )
2012-12-19 03:56:48 +08:00
throw new IllegalStateException ( "computeEndFromAlleles requires first allele to be reference" ) ;
2012-06-30 01:55:52 +08:00
if ( VariantContext . hasSymbolicAlleles ( alleles ) ) {
if ( endForSymbolicAlleles = = - 1 )
2012-12-19 03:56:48 +08:00
throw new IllegalStateException ( "computeEndFromAlleles found a symbolic allele but endForSymbolicAlleles was provided" ) ;
2012-06-30 01:55:52 +08:00
return endForSymbolicAlleles ;
} else {
return start + Math . max ( ref . length ( ) - 1 , 0 ) ;
}
}
2012-07-27 12:47:15 +08:00
2012-07-28 03:48:40 +08:00
public static boolean requiresPaddingBase ( final List < String > alleles ) {
// see whether one of the alleles would be null if trimmed through
for ( final String allele : alleles ) {
if ( allele . isEmpty ( ) )
return true ;
}
int clipping = 0 ;
Character currentBase = null ;
while ( true ) {
for ( final String allele : alleles ) {
if ( allele . length ( ) - clipping = = 0 )
return true ;
char myBase = allele . charAt ( clipping ) ;
if ( currentBase = = null )
currentBase = myBase ;
else if ( currentBase ! = myBase )
return false ;
}
clipping + + ;
currentBase = null ;
}
}
2011-11-16 05:14:50 +08:00
}