diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariants.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariants.java index ccae8669e..af08f7fbb 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariants.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariants.java @@ -51,6 +51,7 @@ package org.broadinstitute.gatk.tools.walkers.simulatereads; +import org.apache.log4j.Logger; import cern.jet.random.Poisson; import cern.jet.random.engine.MersenneTwister; import htsjdk.samtools.SAMFileHeader; @@ -108,6 +109,7 @@ import java.util.*; @Reference(window=@Window(start=-200,stop=200)) public class SimulateReadsForVariants extends RodWalker { + private static Logger logger = Logger.getLogger(SimulateReadsForVariants.class); @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); /** @@ -248,20 +250,21 @@ public class SimulateReadsForVariants extends RodWalker { if ( vc == null || !vc.isBiallelic() ) return 0; - if ( verbose ) logger.info(String.format("Generating reads for %s", vc)); + if ( !generateReadsForVariant(vc, ref, useAFAsAlleleFraction) ) + return 0; - generateReadsForVariant(vc, ref, useAFAsAlleleFraction); + if ( verbose ) logger.info(String.format("Generating reads for %s", vc)); return 1; } /** - * Contstructs an artifical haplotype given an allele and original reference context + * Constructs an artificial haplotype given an allele and original reference context * * @param allele the allele to model (can be reference) * @param refLength the length of the reference allele * @param ref the original reference context - * @return a non-null ArtificialHaplotype + * @return the artificial haplotype or null if the readLength parameter is too small to hold the allele and reference */ private ArtificialHaplotype constructHaplotype(final Allele allele, final int refLength, final ReferenceContext ref) { @@ -269,24 +272,63 @@ public class SimulateReadsForVariants extends RodWalker { final int alleleLength = allele.getBases().length; final int halfAlleleLength = (alleleLength + 1) / 2; + final int refContextLength = ref.getBases().length; // this is how far back to move from the event to start copying bases final int offset = halfReadLength - halfAlleleLength; + // number of bases copied to the haplotype + int copiedCount = 0; + // copy bases before the event final int locusPosOnRefContext = (int)(ref.getLocus().getStart() - ref.getWindow().getStart()); int posOnRefContext = locusPosOnRefContext - offset; - System.arraycopy(ref.getBases(), posOnRefContext, haplotype, 0, offset); - int copiedCount = offset; + if ( offset >= 0 && posOnRefContext >= 0 && posOnRefContext + offset <= refContextLength ) + { + System.arraycopy(ref.getBases(), posOnRefContext, haplotype, 0, offset); + copiedCount = offset; + } + else + { + String msg = new String("Can not copy reference bases to haplotype: "); + if ( offset < 0 ) + msg += "Read length(" + readLength + ") < Allele length(" + alleleLength + ")"; + else + msg += "Reference position(" + posOnRefContext + ") < 0"; + logger.info(msg); + return null; + } // copy the event bases - System.arraycopy(allele.getBases(), 0, haplotype, copiedCount, alleleLength); - copiedCount += alleleLength; + if ( copiedCount + alleleLength <= readLength ) + { + System.arraycopy(allele.getBases(), 0, haplotype, copiedCount, alleleLength); + copiedCount += alleleLength; + } + else + { + String msg = new String("Can not copy allele bases to haplotype: "); + msg += "Read length(" + readLength + ") < Allele length(" + alleleLength + ") + copied count(" + copiedCount + ")"; + logger.info(msg); + return null; + } + // copy bases after the event posOnRefContext = locusPosOnRefContext + refLength; final int remainder = readLength - copiedCount; - System.arraycopy(ref.getBases(), posOnRefContext, haplotype, copiedCount, remainder); + if ( remainder > 0 && posOnRefContext + remainder <= refContextLength ) + { + System.arraycopy(ref.getBases(), posOnRefContext, haplotype, copiedCount, remainder); + copiedCount += remainder; + } + else + { + String msg = new String("Can not copy remaining reference bases to haplotype: "); + msg += "Read length(" + readLength + ") <= Copied count(" + copiedCount + ")"; + logger.info(msg); + return null; + } final String cigar; if ( refLength == alleleLength ) @@ -303,12 +345,17 @@ public class SimulateReadsForVariants extends RodWalker { * @param vc the (bi-allelic) variant context for which to generate artificial reads * @param ref the original reference context * @param useAFAsAlleleFraction use AF tag to indicate allele fraction + * @return true if successful generation of artificial reads for the variant, false otherwise */ - private void generateReadsForVariant(final VariantContext vc, final ReferenceContext ref, final boolean useAFAsAlleleFraction) { + private boolean generateReadsForVariant(final VariantContext vc, final ReferenceContext ref, final boolean useAFAsAlleleFraction) { final int refLength = vc.getReference().getBases().length; - final ArtificialHaplotype refHap = constructHaplotype(vc.getReference(), refLength, ref); - final ArtificialHaplotype altHap = constructHaplotype(vc.getAlternateAllele(0), refLength, ref); + final ArtificialHaplotype refHap = constructHaplotype(vc.getReference(), refLength, ref); + if ( refHap == null ) + return false; + final ArtificialHaplotype altHap = constructHaplotype(vc.getAlternateAllele(0), refLength, ref); + if ( altHap == null ) + return false; final double refAlleleFraction = (useAFAsAlleleFraction)?1-vc.getAttributeAsDouble(VCFConstants.ALLELE_FREQUENCY_KEY, 0.5):0.5; @@ -328,6 +375,8 @@ public class SimulateReadsForVariants extends RodWalker { writeRead(readBases, vc.getChr(), vc.getStart() - haplotype.offset, haplotype.cigar, g.getSampleName(), gi++ % 2 == 0); } } + + return true; } /** diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java index 628491595..d4eb17c29 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/simulatereads/SimulateReadsForVariantsIntegrationTest.java @@ -110,4 +110,24 @@ public class SimulateReadsForVariantsIntegrationTest extends WalkerTest { } + @Test + public void testLongInsertFailure() { + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forLongInsert.vcf -o %s", + 1, + Arrays.asList("bb412c1fc8f95523dd2fc623d53dbeec")); + executeTest("testLongInsertFailure", spec); + } + + @Test + public void testLongInsertSuccess() { + + WalkerTestSpec spec = new WalkerTestSpec( + "-RL 269 -T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forLongInsert.vcf -o %s", + 1, + Arrays.asList("9236320c470cd8d6759c21b79206f63f")); + executeTest("testLongInsertSuccess", spec); + } + } diff --git a/public/gatk-engine/src/test/resources/forLongInsert.vcf b/public/gatk-engine/src/test/resources/forLongInsert.vcf new file mode 100644 index 000000000..3566b880a --- /dev/null +++ b/public/gatk-engine/src/test/resources/forLongInsert.vcf @@ -0,0 +1,16 @@ +##fileformat=VCFv4.1 +##fileDate=20140820 +##source=mutatrix population genome simulator +##seed=100 +##reference=ref.fasta +##phasing=true +##commandline=mutatrix ref.fasta --file-prefix mutations/ --sample-prefix 0 -n 1 --ploidy 2 --random-seed 100 --snp-rate 0.001 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 01 +20 10000000 . T TGTCTTAGTGTCGTATGGGCTTCCATATTCAACTTTCTTCTATAAGTAGAATATCTACACGTGATGCTCTGGTCTTTCTACTACACGTCTATTGTAGCTTAATCTTTTCCTCGGGGGGGAAGAGGCTGTTTAGAATCATACCTCCAACCGACATTAACCCTGTTGGATTATAACTAGGGGCAAATCCGGATATTGGTATACGGCCCTATTATTCTATGGGTCTACGCACCTAGTAGGCACCCACGCAGTCTCTGGCCCGACCCCA 99 . AC=2;LEN=264;NA=1;NS=1;TYPE=ins GT 1|1 diff --git a/public/gatk-engine/src/test/resources/forLongInsert.vcf.idx b/public/gatk-engine/src/test/resources/forLongInsert.vcf.idx new file mode 100644 index 000000000..a22e93200 Binary files /dev/null and b/public/gatk-engine/src/test/resources/forLongInsert.vcf.idx differ