extended SimulateReadsForVariants to optionally use the AF field to indicate allele fraction of the simulated event, useful in cancer and other variable ploidy use cases

This commit is contained in:
Kristian Cibulskis 2014-04-16 15:45:36 -04:00
parent 34ece31f4a
commit 7115cadbd8
4 changed files with 56 additions and 5 deletions

View File

@ -124,6 +124,11 @@ public class SimulateReadsForVariants extends RodWalker<Integer, Integer> {
*/
@Argument(fullName="readLength", shortName="RL", doc="Read lengths (bp)", required=false, minValue = 1, maxValue = Integer.MAX_VALUE)
public int readLength = 101;
/**
* Use this argument to simulate events at a non-50/50 allele fraction represented in the VCF by AF (used for somatic event simulation)
*/
@Argument(fullName="useAFAsAlleleFraction", shortName="AF", doc="Use AF in VCF as event allele fraction ", required=false)
public boolean useAFAsAlleleFraction = false;
/**
* The corresponding platform identifier will be specified in the simulated read group PL tag. This setting does not
* affect the properties of the simulated reads.
@ -237,7 +242,7 @@ public class SimulateReadsForVariants extends RodWalker<Integer, Integer> {
if ( verbose ) logger.info(String.format("Generating reads for %s", vc));
generateReadsForVariant(vc, ref);
generateReadsForVariant(vc, ref, useAFAsAlleleFraction);
return 1;
}
@ -289,19 +294,21 @@ public class SimulateReadsForVariants extends RodWalker<Integer, Integer> {
*
* @param vc the (bi-allelic) variant context for which to generate artificial reads
* @param ref the original reference context
* @param useAFAsAlleleFraction use AF tag to indicate allele fraction
*/
private void generateReadsForVariant(final VariantContext vc, final ReferenceContext ref) {
private void generateReadsForVariant(final VariantContext vc, final ReferenceContext ref, boolean useAFAsAlleleFraction) {
final int refLength = vc.getReference().getBases().length;
final ArtificialHaplotype refHap = constructHaplotype(vc.getReference(), refLength, ref);
final ArtificialHaplotype altHap = constructHaplotype(vc.getAlternateAllele(0), refLength, ref);
int gi = 0;
double refAlleleFraction = (useAFAsAlleleFraction)?1-vc.getAttributeAsDouble(VCFConstants.ALLELE_FREQUENCY_KEY, 0.5):0.5f;
for ( final Genotype g : vc.getGenotypes() ) {
final int myDepth = sampleDepth();
for ( int d = 0; d < myDepth; d++ ) {
final ArtificialHaplotype haplotype = chooseRefHaplotype(g) ? refHap : altHap;
final ArtificialHaplotype haplotype = chooseRefHaplotype(g, refAlleleFraction) ? refHap : altHap;
final byte[] readBases = Arrays.copyOf(haplotype.bases, readLength);
addMachineErrors(readBases, errorRate);
@ -314,12 +321,14 @@ public class SimulateReadsForVariants extends RodWalker<Integer, Integer> {
* Decides whether or not to choose the reference haplotype, depending on the given genotype
*
* @param g the genotype of the given sample
* @param pReferenceGivenHet probability of choosing reference for hets
*
* @return true if one should use the reference haplotype, false otherwise
*/
private boolean chooseRefHaplotype(final Genotype g) {
private boolean chooseRefHaplotype(final Genotype g, final double pReferenceGivenHet) {
final double refP;
if ( g.isHomRef() ) refP = 1;
else if ( g.isHet() ) refP = 0.5;
else if ( g.isHet() ) refP = pReferenceGivenHet;
else refP = 0.0;
return ran.nextDouble() < refP;

View File

@ -92,4 +92,17 @@ public class SimulateReadsForVariantsIntegrationTest extends WalkerTest {
Arrays.asList("26db391f223ead74d786006a502029d8"));
executeTest("testPlatformTag", spec);
}
@Test
public void testAlleleFraction() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SimulateReadsForVariants --no_pg_tag --useAFAsAlleleFraction -DP 100 -R " + b37KGReference + " -V " + publicTestDir + "forAlleleFractionSimulation.vcf -o %s",
1,
Arrays.asList("3425c807525dff71310d1517e00a4f7e"));
executeTest("testAlleleFraction", spec);
}
}

View File

@ -0,0 +1,29 @@
##fileformat=VCFv4.1
##phasing=none
##INDIVIDUAL=TRUTH
##SAMPLE=<ID=TRUTH,Individual="TRUTH",Description="bamsurgeon spike-in">
##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Somatic mutation in primary">
##INFO=<ID=AF,Number=1,Type=Float,Description="Variant Allele Frequency">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT TUMOR
20 100000 . A T 100 PASS SOMATIC;AF=0.500 GT 0/1
20 101000 . T C 100 PASS SOMATIC;AF=0.400 GT 0/1
20 102000 . A C 100 PASS SOMATIC;AF=0.300 GT 0/1
20 103000 . C T 100 PASS SOMATIC;AF=0.200 GT 0/1
20 104000 . A C 100 PASS SOMATIC;AF=0.100 GT 0/1
20 105000 . A C 100 PASS SOMATIC;AF=0.050 GT 0/1
20 105500 . A G 100 PASS SOMATIC;AF=0.020 GT 0/1
20 106000 . CAT C 100 PASS SOMATIC;AF=0.500 GT 0/1
20 107000 . ACT A 100 PASS SOMATIC;AF=0.400 GT 0/1
20 107994 . AAG A 100 PASS SOMATIC;AF=0.300 GT 0/1
20 108999 . GCA G 100 PASS SOMATIC;AF=0.200 GT 0/1
20 110009 . CAT C 100 PASS SOMATIC;AF=0.100 GT 0/1
20 112000 . ACC A 100 PASS SOMATIC;AF=0.050 GT 0/1
20 112500 . AGC A 100 PASS SOMATIC;AF=0.020 GT 0/1
20 113000 . G GAAT 100 PASS SOMATIC;AF=0.500 GT 0/1
20 113999 . G GTGA 100 PASS SOMATIC;AF=0.400 GT 0/1
20 115000 . T TCCA 100 PASS SOMATIC;AF=0.300 GT 0/1
20 116000 . G GTTT 100 PASS SOMATIC;AF=0.200 GT 0/1
20 117000 . T TCCA 100 PASS SOMATIC;AF=0.100 GT 0/1
20 118000 . C CTGT 100 PASS SOMATIC;AF=0.050 GT 0/1
20 119000 . C CTGT 100 PASS SOMATIC;AF=0.020 GT 0/1