From 7115cadbd8f30d424fb89cd404aaba8c012c4f83 Mon Sep 17 00:00:00 2001 From: Kristian Cibulskis Date: Wed, 16 Apr 2014 15:45:36 -0400 Subject: [PATCH] extended SimulateReadsForVariants to optionally use the AF field to indicate allele fraction of the simulated event, useful in cancer and other variable ploidy use cases --- .../SimulateReadsForVariants.java | 19 +++++++++--- ...mulateReadsForVariantsIntegrationTest.java | 13 ++++++++ .../resources/forAlleleFractionSimulation.vcf | 29 ++++++++++++++++++ .../forAlleleFractionSimulation.vcf.idx | Bin 0 -> 2074 bytes 4 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 public/gatk-framework/src/test/resources/forAlleleFractionSimulation.vcf create mode 100644 public/gatk-framework/src/test/resources/forAlleleFractionSimulation.vcf.idx diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java index 7054d78cd..2ddadc683 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariants.java @@ -124,6 +124,11 @@ public class SimulateReadsForVariants extends RodWalker { */ @Argument(fullName="readLength", shortName="RL", doc="Read lengths (bp)", required=false, minValue = 1, maxValue = Integer.MAX_VALUE) public int readLength = 101; + /** + * Use this argument to simulate events at a non-50/50 allele fraction represented in the VCF by AF (used for somatic event simulation) + */ + @Argument(fullName="useAFAsAlleleFraction", shortName="AF", doc="Use AF in VCF as event allele fraction ", required=false) + public boolean useAFAsAlleleFraction = false; /** * The corresponding platform identifier will be specified in the simulated read group PL tag. This setting does not * affect the properties of the simulated reads. @@ -237,7 +242,7 @@ public class SimulateReadsForVariants extends RodWalker { if ( verbose ) logger.info(String.format("Generating reads for %s", vc)); - generateReadsForVariant(vc, ref); + generateReadsForVariant(vc, ref, useAFAsAlleleFraction); return 1; } @@ -289,19 +294,21 @@ public class SimulateReadsForVariants extends RodWalker { * * @param vc the (bi-allelic) variant context for which to generate artificial reads * @param ref the original reference context + * @param useAFAsAlleleFraction use AF tag to indicate allele fraction */ - private void generateReadsForVariant(final VariantContext vc, final ReferenceContext ref) { + private void generateReadsForVariant(final VariantContext vc, final ReferenceContext ref, boolean useAFAsAlleleFraction) { final int refLength = vc.getReference().getBases().length; final ArtificialHaplotype refHap = constructHaplotype(vc.getReference(), refLength, ref); final ArtificialHaplotype altHap = constructHaplotype(vc.getAlternateAllele(0), refLength, ref); int gi = 0; + double refAlleleFraction = (useAFAsAlleleFraction)?1-vc.getAttributeAsDouble(VCFConstants.ALLELE_FREQUENCY_KEY, 0.5):0.5f; for ( final Genotype g : vc.getGenotypes() ) { final int myDepth = sampleDepth(); for ( int d = 0; d < myDepth; d++ ) { - final ArtificialHaplotype haplotype = chooseRefHaplotype(g) ? refHap : altHap; + final ArtificialHaplotype haplotype = chooseRefHaplotype(g, refAlleleFraction) ? refHap : altHap; final byte[] readBases = Arrays.copyOf(haplotype.bases, readLength); addMachineErrors(readBases, errorRate); @@ -314,12 +321,14 @@ public class SimulateReadsForVariants extends RodWalker { * Decides whether or not to choose the reference haplotype, depending on the given genotype * * @param g the genotype of the given sample + * @param pReferenceGivenHet probability of choosing reference for hets + * * @return true if one should use the reference haplotype, false otherwise */ - private boolean chooseRefHaplotype(final Genotype g) { + private boolean chooseRefHaplotype(final Genotype g, final double pReferenceGivenHet) { final double refP; if ( g.isHomRef() ) refP = 1; - else if ( g.isHet() ) refP = 0.5; + else if ( g.isHet() ) refP = pReferenceGivenHet; else refP = 0.0; return ran.nextDouble() < refP; diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java index 2ae904e65..2d0550dfb 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java @@ -92,4 +92,17 @@ public class SimulateReadsForVariantsIntegrationTest extends WalkerTest { Arrays.asList("26db391f223ead74d786006a502029d8")); executeTest("testPlatformTag", spec); } + + + @Test + public void testAlleleFraction() { + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SimulateReadsForVariants --no_pg_tag --useAFAsAlleleFraction -DP 100 -R " + b37KGReference + " -V " + publicTestDir + "forAlleleFractionSimulation.vcf -o %s", + 1, + Arrays.asList("3425c807525dff71310d1517e00a4f7e")); + executeTest("testAlleleFraction", spec); + + } + } diff --git a/public/gatk-framework/src/test/resources/forAlleleFractionSimulation.vcf b/public/gatk-framework/src/test/resources/forAlleleFractionSimulation.vcf new file mode 100644 index 000000000..e29e49df5 --- /dev/null +++ b/public/gatk-framework/src/test/resources/forAlleleFractionSimulation.vcf @@ -0,0 +1,29 @@ +##fileformat=VCFv4.1 +##phasing=none +##INDIVIDUAL=TRUTH +##SAMPLE= +##INFO= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT TUMOR +20 100000 . A T 100 PASS SOMATIC;AF=0.500 GT 0/1 +20 101000 . T C 100 PASS SOMATIC;AF=0.400 GT 0/1 +20 102000 . A C 100 PASS SOMATIC;AF=0.300 GT 0/1 +20 103000 . C T 100 PASS SOMATIC;AF=0.200 GT 0/1 +20 104000 . A C 100 PASS SOMATIC;AF=0.100 GT 0/1 +20 105000 . A C 100 PASS SOMATIC;AF=0.050 GT 0/1 +20 105500 . A G 100 PASS SOMATIC;AF=0.020 GT 0/1 +20 106000 . CAT C 100 PASS SOMATIC;AF=0.500 GT 0/1 +20 107000 . ACT A 100 PASS SOMATIC;AF=0.400 GT 0/1 +20 107994 . AAG A 100 PASS SOMATIC;AF=0.300 GT 0/1 +20 108999 . GCA G 100 PASS SOMATIC;AF=0.200 GT 0/1 +20 110009 . CAT C 100 PASS SOMATIC;AF=0.100 GT 0/1 +20 112000 . ACC A 100 PASS SOMATIC;AF=0.050 GT 0/1 +20 112500 . AGC A 100 PASS SOMATIC;AF=0.020 GT 0/1 +20 113000 . G GAAT 100 PASS SOMATIC;AF=0.500 GT 0/1 +20 113999 . G GTGA 100 PASS SOMATIC;AF=0.400 GT 0/1 +20 115000 . T TCCA 100 PASS SOMATIC;AF=0.300 GT 0/1 +20 116000 . G GTTT 100 PASS SOMATIC;AF=0.200 GT 0/1 +20 117000 . T TCCA 100 PASS SOMATIC;AF=0.100 GT 0/1 +20 118000 . C CTGT 100 PASS SOMATIC;AF=0.050 GT 0/1 +20 119000 . C CTGT 100 PASS SOMATIC;AF=0.020 GT 0/1 \ No newline at end of file diff --git a/public/gatk-framework/src/test/resources/forAlleleFractionSimulation.vcf.idx b/public/gatk-framework/src/test/resources/forAlleleFractionSimulation.vcf.idx new file mode 100644 index 0000000000000000000000000000000000000000..8b82f26ab38a104d4532b5cb3d5289399810bd6f GIT binary patch literal 2074 zcmZ8iTW%Xi5S+xw5poQZ{g_whPY@YLfXx67W!PVYqbPx3I}Y^7nc{C>B!F+@uT;;> zIy+v7;-Xkv(_Q`8)*C_H}4+azP`VC|L5lKtKY-N?;wcl-%lU@+4%u}y~5|m{n_#I z`Tq2De|U2I>GW)Wb_lZ3>5}|4?U&>3wA;T3w6UDYxKyQLVmY|LX%T&K_Wa`PaK3lm zq^SQZp8tG#I0oUcyEs4oz(Jx(qPzeCK-o--c7_JFN&=KR5CJf!C?$6w1+IuGNgj+G zxG*S`OJ|h8$yjU7L$n5tS#yMLXBYq$R-D}droft##uA4Jvm-<~@i>?Th}Oi_(s&Bd zGJdyO>Oc@K6^)kSh>j62Nm5oDheOXuQ!HtO$0bx|OM)o6O*NECrAS<4Eg&PQR>vbk zAuD7PZrUR-G*%Y-5*t!YtQ1oo;lG3{NeZRKw^z`F?_UVdj)8N^)&-wDB_g4+fw1He zdIu!~gx@rx8>l&IFn%qd3{(n=u|}C1n(L&it^kK6uDO~Vji}5TCD+x=i$gtz=;5N! zOPvSW6f!EgQebP`!etb*b4gvHM4NJpp0QT6;YtCm+|WepEZ92Nc;;5;TA6)_v7s2P zwXSxq>4cJi8;NQ5*A*h6zb1vw&1oH(kVh7oXyQh%MPy9yq|mvha~AG;6b#>*8{M<0 z8fV=WgC?1E+i99@;%aNNRF?_vwa})UY}O^uxUw$8(zsX}Kn*Q9SF@p#>)J`|6L6U_ zLSCmK9t4IA)t_M3z+~|5wV%=qXRg_+1C!oRHx** z9a#+3m>g_Y70WjJ-O{O8QEP4NK|iu$+084%+lgzgY1UrgJT(+G2W!EeT}6T>Zz34O zb@FU2Kuqjh9q~v6ua@QIG>r8lvA*LH;Nkp4R5D@HqZ`og{su<3gwG*-i(7x&`}KYQ Kn*J|u?*9Pl#jW!I literal 0 HcmV?d00001