From 239151ac7b8506c2cb897cca115e2fa99e1fd4c1 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Tue, 14 Oct 2014 17:08:48 -0400 Subject: [PATCH] Do not process a variant if it is too large (> readLength), and log an error remove final keyword before refMap and altMap, constructHaplotype() changes their values return ArtificialHaplotype from constructHaplotype instaed of passing as an argument Add logic so arraycopy does not throw an IndexOutOfBoundsException, add test for a long insert --- .../SimulateReadsForVariants.java | 73 +++++++++++++++--- ...mulateReadsForVariantsIntegrationTest.java | 20 +++++ .../src/test/resources/forLongInsert.vcf | 16 ++++ .../src/test/resources/forLongInsert.vcf.idx | Bin 0 -> 2019 bytes 4 files changed, 97 insertions(+), 12 deletions(-) create mode 100644 public/gatk-engine/src/test/resources/forLongInsert.vcf create mode 100644 public/gatk-engine/src/test/resources/forLongInsert.vcf.idx diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariants.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariants.java index ccae8669e..af08f7fbb 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariants.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariants.java @@ -51,6 +51,7 @@ package org.broadinstitute.gatk.tools.walkers.simulatereads; +import org.apache.log4j.Logger; import cern.jet.random.Poisson; import cern.jet.random.engine.MersenneTwister; import htsjdk.samtools.SAMFileHeader; @@ -108,6 +109,7 @@ import java.util.*; @Reference(window=@Window(start=-200,stop=200)) public class SimulateReadsForVariants extends RodWalker { + private static Logger logger = Logger.getLogger(SimulateReadsForVariants.class); @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); /** @@ -248,20 +250,21 @@ public class SimulateReadsForVariants extends RodWalker { if ( vc == null || !vc.isBiallelic() ) return 0; - if ( verbose ) logger.info(String.format("Generating reads for %s", vc)); + if ( !generateReadsForVariant(vc, ref, useAFAsAlleleFraction) ) + return 0; - generateReadsForVariant(vc, ref, useAFAsAlleleFraction); + if ( verbose ) logger.info(String.format("Generating reads for %s", vc)); return 1; } /** - * Contstructs an artifical haplotype given an allele and original reference context + * Constructs an artificial haplotype given an allele and original reference context * * @param allele the allele to model (can be reference) * @param refLength the length of the reference allele * @param ref the original reference context - * @return a non-null ArtificialHaplotype + * @return the artificial haplotype or null if the readLength parameter is too small to hold the allele and reference */ private ArtificialHaplotype constructHaplotype(final Allele allele, final int refLength, final ReferenceContext ref) { @@ -269,24 +272,63 @@ public class SimulateReadsForVariants extends RodWalker { final int alleleLength = allele.getBases().length; final int halfAlleleLength = (alleleLength + 1) / 2; + final int refContextLength = ref.getBases().length; // this is how far back to move from the event to start copying bases final int offset = halfReadLength - halfAlleleLength; + // number of bases copied to the haplotype + int copiedCount = 0; + // copy bases before the event final int locusPosOnRefContext = (int)(ref.getLocus().getStart() - ref.getWindow().getStart()); int posOnRefContext = locusPosOnRefContext - offset; - System.arraycopy(ref.getBases(), posOnRefContext, haplotype, 0, offset); - int copiedCount = offset; + if ( offset >= 0 && posOnRefContext >= 0 && posOnRefContext + offset <= refContextLength ) + { + System.arraycopy(ref.getBases(), posOnRefContext, haplotype, 0, offset); + copiedCount = offset; + } + else + { + String msg = new String("Can not copy reference bases to haplotype: "); + if ( offset < 0 ) + msg += "Read length(" + readLength + ") < Allele length(" + alleleLength + ")"; + else + msg += "Reference position(" + posOnRefContext + ") < 0"; + logger.info(msg); + return null; + } // copy the event bases - System.arraycopy(allele.getBases(), 0, haplotype, copiedCount, alleleLength); - copiedCount += alleleLength; + if ( copiedCount + alleleLength <= readLength ) + { + System.arraycopy(allele.getBases(), 0, haplotype, copiedCount, alleleLength); + copiedCount += alleleLength; + } + else + { + String msg = new String("Can not copy allele bases to haplotype: "); + msg += "Read length(" + readLength + ") < Allele length(" + alleleLength + ") + copied count(" + copiedCount + ")"; + logger.info(msg); + return null; + } + // copy bases after the event posOnRefContext = locusPosOnRefContext + refLength; final int remainder = readLength - copiedCount; - System.arraycopy(ref.getBases(), posOnRefContext, haplotype, copiedCount, remainder); + if ( remainder > 0 && posOnRefContext + remainder <= refContextLength ) + { + System.arraycopy(ref.getBases(), posOnRefContext, haplotype, copiedCount, remainder); + copiedCount += remainder; + } + else + { + String msg = new String("Can not copy remaining reference bases to haplotype: "); + msg += "Read length(" + readLength + ") <= Copied count(" + copiedCount + ")"; + logger.info(msg); + return null; + } final String cigar; if ( refLength == alleleLength ) @@ -303,12 +345,17 @@ public class SimulateReadsForVariants extends RodWalker { * @param vc the (bi-allelic) variant context for which to generate artificial reads * @param ref the original reference context * @param useAFAsAlleleFraction use AF tag to indicate allele fraction + * @return true if successful generation of artificial reads for the variant, false otherwise */ - private void generateReadsForVariant(final VariantContext vc, final ReferenceContext ref, final boolean useAFAsAlleleFraction) { + private boolean generateReadsForVariant(final VariantContext vc, final ReferenceContext ref, final boolean useAFAsAlleleFraction) { final int refLength = vc.getReference().getBases().length; - final ArtificialHaplotype refHap = constructHaplotype(vc.getReference(), refLength, ref); - final ArtificialHaplotype altHap = constructHaplotype(vc.getAlternateAllele(0), refLength, ref); + final ArtificialHaplotype refHap = constructHaplotype(vc.getReference(), refLength, ref); + if ( refHap == null ) + return false; + final ArtificialHaplotype altHap = constructHaplotype(vc.getAlternateAllele(0), refLength, ref); + if ( altHap == null ) + return false; final double refAlleleFraction = (useAFAsAlleleFraction)?1-vc.getAttributeAsDouble(VCFConstants.ALLELE_FREQUENCY_KEY, 0.5):0.5; @@ -328,6 +375,8 @@ public class SimulateReadsForVariants extends RodWalker { writeRead(readBases, vc.getChr(), vc.getStart() - haplotype.offset, haplotype.cigar, g.getSampleName(), gi++ % 2 == 0); } } + + return true; } /** diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java index 628491595..d4eb17c29 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java @@ -110,4 +110,24 @@ public class SimulateReadsForVariantsIntegrationTest extends WalkerTest { } + @Test + public void testLongInsertFailure() { + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forLongInsert.vcf -o %s", + 1, + Arrays.asList("bb412c1fc8f95523dd2fc623d53dbeec")); + executeTest("testLongInsertFailure", spec); + } + + @Test + public void testLongInsertSuccess() { + + WalkerTestSpec spec = new WalkerTestSpec( + "-RL 269 -T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forLongInsert.vcf -o %s", + 1, + Arrays.asList("9236320c470cd8d6759c21b79206f63f")); + executeTest("testLongInsertSuccess", spec); + } + } diff --git a/public/gatk-engine/src/test/resources/forLongInsert.vcf b/public/gatk-engine/src/test/resources/forLongInsert.vcf new file mode 100644 index 000000000..3566b880a --- /dev/null +++ b/public/gatk-engine/src/test/resources/forLongInsert.vcf @@ -0,0 +1,16 @@ +##fileformat=VCFv4.1 +##fileDate=20140820 +##source=mutatrix population genome simulator +##seed=100 +##reference=ref.fasta +##phasing=true +##commandline=mutatrix ref.fasta --file-prefix mutations/ --sample-prefix 0 -n 1 --ploidy 2 --random-seed 100 --snp-rate 0.001 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 01 +20 10000000 . T TGTCTTAGTGTCGTATGGGCTTCCATATTCAACTTTCTTCTATAAGTAGAATATCTACACGTGATGCTCTGGTCTTTCTACTACACGTCTATTGTAGCTTAATCTTTTCCTCGGGGGGGAAGAGGCTGTTTAGAATCATACCTCCAACCGACATTAACCCTGTTGGATTATAACTAGGGGCAAATCCGGATATTGGTATACGGCCCTATTATTCTATGGGTCTACGCACCTAGTAGGCACCCACGCAGTCTCTGGCCCGACCCCA 99 . AC=2;LEN=264;NA=1;NS=1;TYPE=ins GT 1|1 diff --git a/public/gatk-engine/src/test/resources/forLongInsert.vcf.idx b/public/gatk-engine/src/test/resources/forLongInsert.vcf.idx new file mode 100644 index 0000000000000000000000000000000000000000..a22e93200ff1b6fb1cf57807998585200e0d4e1b GIT binary patch literal 2019 zcmZ8i*>2oM5S$zzk{__x$6TE^LE1PA$O_=B4dqFY1zJ&}tb*N@k(c~eKF>7=1x^U; zfnu|!y87U5dwKJXbM8BQrr)3cnx~ij<3H2kY5wxKpQnf8-J6&F@pSj;X`cRm`SkR7 zKRw)?{(Lje;%^pXT+wSGAGj5PQZx;#!UGg}a-Xf|b+GNmMbp*4q$BwOx} z2)U@B4fs1hJl&Mt=)~C-m1^nYP#%*2;v6GS5UqtS`0jY&!C`Skp0!;<^_M-}*FP zJoBo4u^SnBj4}xN=A4W$UzP zDJF5mo*R7T;s>L<$Hw4#B-(qK^zKMKvh1cI9j;j>VZZi7xYV1Ih>g)@bN~lG`r2S0 z@{l~&Ba@*TgM-zoWYI>yYdRS#YR!#3SdPqCcH_z@?Zi3eFmf+&92$zMgE?W(u9Tq3 zD+$JMojh9=h>0DmBSnee)v~;ty0PU*tZxs2<@JfaaPD9CExwnx>GBKc^;%xLUjdw4 BqjmrQ literal 0 HcmV?d00001