diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java index 7054d78cd..2ddadc683 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java @@ -124,6 +124,11 @@ public class SimulateReadsForVariants extends RodWalker { */ @Argument(fullName="readLength", shortName="RL", doc="Read lengths (bp)", required=false, minValue = 1, maxValue = Integer.MAX_VALUE) public int readLength = 101; + /** + * Use this argument to simulate events at a non-50/50 allele fraction represented in the VCF by AF (used for somatic event simulation) + */ + @Argument(fullName="useAFAsAlleleFraction", shortName="AF", doc="Use AF in VCF as event allele fraction ", required=false) + public boolean useAFAsAlleleFraction = false; /** * The corresponding platform identifier will be specified in the simulated read group PL tag. This setting does not * affect the properties of the simulated reads. @@ -237,7 +242,7 @@ public class SimulateReadsForVariants extends RodWalker { if ( verbose ) logger.info(String.format("Generating reads for %s", vc)); - generateReadsForVariant(vc, ref); + generateReadsForVariant(vc, ref, useAFAsAlleleFraction); return 1; } @@ -289,19 +294,21 @@ public class SimulateReadsForVariants extends RodWalker { * * @param vc the (bi-allelic) variant context for which to generate artificial reads * @param ref the original reference context + * @param useAFAsAlleleFraction use AF tag to indicate allele fraction */ - private void generateReadsForVariant(final VariantContext vc, final ReferenceContext ref) { + private void generateReadsForVariant(final VariantContext vc, final ReferenceContext ref, boolean useAFAsAlleleFraction) { final int refLength = vc.getReference().getBases().length; final ArtificialHaplotype refHap = constructHaplotype(vc.getReference(), refLength, ref); final ArtificialHaplotype altHap = constructHaplotype(vc.getAlternateAllele(0), refLength, ref); int gi = 0; + double refAlleleFraction = (useAFAsAlleleFraction)?1-vc.getAttributeAsDouble(VCFConstants.ALLELE_FREQUENCY_KEY, 0.5):0.5f; for ( final Genotype g : vc.getGenotypes() ) { final int myDepth = sampleDepth(); for ( int d = 0; d < myDepth; d++ ) { - final ArtificialHaplotype haplotype = chooseRefHaplotype(g) ? refHap : altHap; + final ArtificialHaplotype haplotype = chooseRefHaplotype(g, refAlleleFraction) ? refHap : altHap; final byte[] readBases = Arrays.copyOf(haplotype.bases, readLength); addMachineErrors(readBases, errorRate); @@ -314,12 +321,14 @@ public class SimulateReadsForVariants extends RodWalker { * Decides whether or not to choose the reference haplotype, depending on the given genotype * * @param g the genotype of the given sample + * @param pReferenceGivenHet probability of choosing reference for hets + * * @return true if one should use the reference haplotype, false otherwise */ - private boolean chooseRefHaplotype(final Genotype g) { + private boolean chooseRefHaplotype(final Genotype g, final double pReferenceGivenHet) { final double refP; if ( g.isHomRef() ) refP = 1; - else if ( g.isHet() ) refP = 0.5; + else if ( g.isHet() ) refP = pReferenceGivenHet; else refP = 0.0; return ran.nextDouble() < refP; diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java index 2ae904e65..2d0550dfb 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java @@ -92,4 +92,17 @@ public class SimulateReadsForVariantsIntegrationTest extends WalkerTest { Arrays.asList("26db391f223ead74d786006a502029d8")); executeTest("testPlatformTag", spec); } + + + @Test + public void testAlleleFraction() { + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SimulateReadsForVariants --no_pg_tag --useAFAsAlleleFraction -DP 100 -R " + b37KGReference + " -V " + publicTestDir + "forAlleleFractionSimulation.vcf -o %s", + 1, + Arrays.asList("3425c807525dff71310d1517e00a4f7e")); + executeTest("testAlleleFraction", spec); + + } + } diff --git a/public/gatk-framework/src/test/resources/forAlleleFractionSimulation.vcf b/public/gatk-framework/src/test/resources/forAlleleFractionSimulation.vcf new file mode 100644 index 000000000..e29e49df5 --- /dev/null +++ b/public/gatk-framework/src/test/resources/forAlleleFractionSimulation.vcf @@ -0,0 +1,29 @@ +##fileformat=VCFv4.1 +##phasing=none +##INDIVIDUAL=TRUTH +##SAMPLE= +##INFO= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT TUMOR +20 100000 . A T 100 PASS SOMATIC;AF=0.500 GT 0/1 +20 101000 . T C 100 PASS SOMATIC;AF=0.400 GT 0/1 +20 102000 . A C 100 PASS SOMATIC;AF=0.300 GT 0/1 +20 103000 . C T 100 PASS SOMATIC;AF=0.200 GT 0/1 +20 104000 . A C 100 PASS SOMATIC;AF=0.100 GT 0/1 +20 105000 . A C 100 PASS SOMATIC;AF=0.050 GT 0/1 +20 105500 . A G 100 PASS SOMATIC;AF=0.020 GT 0/1 +20 106000 . CAT C 100 PASS SOMATIC;AF=0.500 GT 0/1 +20 107000 . ACT A 100 PASS SOMATIC;AF=0.400 GT 0/1 +20 107994 . AAG A 100 PASS SOMATIC;AF=0.300 GT 0/1 +20 108999 . GCA G 100 PASS SOMATIC;AF=0.200 GT 0/1 +20 110009 . CAT C 100 PASS SOMATIC;AF=0.100 GT 0/1 +20 112000 . ACC A 100 PASS SOMATIC;AF=0.050 GT 0/1 +20 112500 . AGC A 100 PASS SOMATIC;AF=0.020 GT 0/1 +20 113000 . G GAAT 100 PASS SOMATIC;AF=0.500 GT 0/1 +20 113999 . G GTGA 100 PASS SOMATIC;AF=0.400 GT 0/1 +20 115000 . T TCCA 100 PASS SOMATIC;AF=0.300 GT 0/1 +20 116000 . G GTTT 100 PASS SOMATIC;AF=0.200 GT 0/1 +20 117000 . T TCCA 100 PASS SOMATIC;AF=0.100 GT 0/1 +20 118000 . C CTGT 100 PASS SOMATIC;AF=0.050 GT 0/1 +20 119000 . C CTGT 100 PASS SOMATIC;AF=0.020 GT 0/1 \ No newline at end of file diff --git a/public/gatk-framework/src/test/resources/forAlleleFractionSimulation.vcf.idx b/public/gatk-framework/src/test/resources/forAlleleFractionSimulation.vcf.idx new file mode 100644 index 000000000..8b82f26ab Binary files /dev/null and b/public/gatk-framework/src/test/resources/forAlleleFractionSimulation.vcf.idx differ