Do not process a variant if it is too large (> readLength), and log an error

remove final keyword before refMap and altMap, constructHaplotype() changes their values

return ArtificialHaplotype from constructHaplotype instaed of passing as an argument

Add logic so arraycopy does not throw an IndexOutOfBoundsException, add test for a long insert
This commit is contained in:
Ron Levine 2014-10-14 17:08:48 -04:00
parent 5fa5724e4a
commit 239151ac7b
4 changed files with 97 additions and 12 deletions

View File

@ -51,6 +51,7 @@
package org.broadinstitute.gatk.tools.walkers.simulatereads;
import org.apache.log4j.Logger;
import cern.jet.random.Poisson;
import cern.jet.random.engine.MersenneTwister;
import htsjdk.samtools.SAMFileHeader;
@ -108,6 +109,7 @@ import java.util.*;
@Reference(window=@Window(start=-200,stop=200))
public class SimulateReadsForVariants extends RodWalker<Integer, Integer> {
private static Logger logger = Logger.getLogger(SimulateReadsForVariants.class);
@ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
/**
@ -248,20 +250,21 @@ public class SimulateReadsForVariants extends RodWalker<Integer, Integer> {
if ( vc == null || !vc.isBiallelic() )
return 0;
if ( verbose ) logger.info(String.format("Generating reads for %s", vc));
if ( !generateReadsForVariant(vc, ref, useAFAsAlleleFraction) )
return 0;
generateReadsForVariant(vc, ref, useAFAsAlleleFraction);
if ( verbose ) logger.info(String.format("Generating reads for %s", vc));
return 1;
}
/**
* Contstructs an artifical haplotype given an allele and original reference context
* Constructs an artificial haplotype given an allele and original reference context
*
* @param allele the allele to model (can be reference)
* @param refLength the length of the reference allele
* @param ref the original reference context
* @return a non-null ArtificialHaplotype
* @return the artificial haplotype or null if the readLength parameter is too small to hold the allele and reference
*/
private ArtificialHaplotype constructHaplotype(final Allele allele, final int refLength, final ReferenceContext ref) {
@ -269,24 +272,63 @@ public class SimulateReadsForVariants extends RodWalker<Integer, Integer> {
final int alleleLength = allele.getBases().length;
final int halfAlleleLength = (alleleLength + 1) / 2;
final int refContextLength = ref.getBases().length;
// this is how far back to move from the event to start copying bases
final int offset = halfReadLength - halfAlleleLength;
// number of bases copied to the haplotype
int copiedCount = 0;
// copy bases before the event
final int locusPosOnRefContext = (int)(ref.getLocus().getStart() - ref.getWindow().getStart());
int posOnRefContext = locusPosOnRefContext - offset;
System.arraycopy(ref.getBases(), posOnRefContext, haplotype, 0, offset);
int copiedCount = offset;
if ( offset >= 0 && posOnRefContext >= 0 && posOnRefContext + offset <= refContextLength )
{
System.arraycopy(ref.getBases(), posOnRefContext, haplotype, 0, offset);
copiedCount = offset;
}
else
{
String msg = new String("Can not copy reference bases to haplotype: ");
if ( offset < 0 )
msg += "Read length(" + readLength + ") < Allele length(" + alleleLength + ")";
else
msg += "Reference position(" + posOnRefContext + ") < 0";
logger.info(msg);
return null;
}
// copy the event bases
System.arraycopy(allele.getBases(), 0, haplotype, copiedCount, alleleLength);
copiedCount += alleleLength;
if ( copiedCount + alleleLength <= readLength )
{
System.arraycopy(allele.getBases(), 0, haplotype, copiedCount, alleleLength);
copiedCount += alleleLength;
}
else
{
String msg = new String("Can not copy allele bases to haplotype: ");
msg += "Read length(" + readLength + ") < Allele length(" + alleleLength + ") + copied count(" + copiedCount + ")";
logger.info(msg);
return null;
}
// copy bases after the event
posOnRefContext = locusPosOnRefContext + refLength;
final int remainder = readLength - copiedCount;
System.arraycopy(ref.getBases(), posOnRefContext, haplotype, copiedCount, remainder);
if ( remainder > 0 && posOnRefContext + remainder <= refContextLength )
{
System.arraycopy(ref.getBases(), posOnRefContext, haplotype, copiedCount, remainder);
copiedCount += remainder;
}
else
{
String msg = new String("Can not copy remaining reference bases to haplotype: ");
msg += "Read length(" + readLength + ") <= Copied count(" + copiedCount + ")";
logger.info(msg);
return null;
}
final String cigar;
if ( refLength == alleleLength )
@ -303,12 +345,17 @@ public class SimulateReadsForVariants extends RodWalker<Integer, Integer> {
* @param vc the (bi-allelic) variant context for which to generate artificial reads
* @param ref the original reference context
* @param useAFAsAlleleFraction use AF tag to indicate allele fraction
* @return true if successful generation of artificial reads for the variant, false otherwise
*/
private void generateReadsForVariant(final VariantContext vc, final ReferenceContext ref, final boolean useAFAsAlleleFraction) {
private boolean generateReadsForVariant(final VariantContext vc, final ReferenceContext ref, final boolean useAFAsAlleleFraction) {
final int refLength = vc.getReference().getBases().length;
final ArtificialHaplotype refHap = constructHaplotype(vc.getReference(), refLength, ref);
final ArtificialHaplotype altHap = constructHaplotype(vc.getAlternateAllele(0), refLength, ref);
final ArtificialHaplotype refHap = constructHaplotype(vc.getReference(), refLength, ref);
if ( refHap == null )
return false;
final ArtificialHaplotype altHap = constructHaplotype(vc.getAlternateAllele(0), refLength, ref);
if ( altHap == null )
return false;
final double refAlleleFraction = (useAFAsAlleleFraction)?1-vc.getAttributeAsDouble(VCFConstants.ALLELE_FREQUENCY_KEY, 0.5):0.5;
@ -328,6 +375,8 @@ public class SimulateReadsForVariants extends RodWalker<Integer, Integer> {
writeRead(readBases, vc.getChr(), vc.getStart() - haplotype.offset, haplotype.cigar, g.getSampleName(), gi++ % 2 == 0);
}
}
return true;
}
/**

View File

@ -110,4 +110,24 @@ public class SimulateReadsForVariantsIntegrationTest extends WalkerTest {
}
@Test
public void testLongInsertFailure() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forLongInsert.vcf -o %s",
1,
Arrays.asList("bb412c1fc8f95523dd2fc623d53dbeec"));
executeTest("testLongInsertFailure", spec);
}
@Test
public void testLongInsertSuccess() {
WalkerTestSpec spec = new WalkerTestSpec(
"-RL 269 -T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forLongInsert.vcf -o %s",
1,
Arrays.asList("9236320c470cd8d6759c21b79206f63f"));
executeTest("testLongInsertSuccess", spec);
}
}

View File

@ -0,0 +1,16 @@
##fileformat=VCFv4.1
##fileDate=20140820
##source=mutatrix population genome simulator
##seed=100
##reference=ref.fasta
##phasing=true
##commandline=mutatrix ref.fasta --file-prefix mutations/ --sample-prefix 0 -n 1 --ploidy 2 --random-seed 100 --snp-rate 0.001
##INFO=<ID=AC,Number=A,Type=Integer,Description="Alternate allele count">
##INFO=<ID=TYPE,Number=A,Type=String,Description="Type of each allele (snp, ins, del, mnp, complex)">
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples at the site">
##INFO=<ID=NA,Number=1,Type=Integer,Description="Number of alternate alleles">
##INFO=<ID=LEN,Number=A,Type=Integer,Description="Length of each alternate allele">
##INFO=<ID=MICROSAT,Number=0,Type=Flag,Description="Generated at a sequence repeat loci">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 01
20 10000000 . T TGTCTTAGTGTCGTATGGGCTTCCATATTCAACTTTCTTCTATAAGTAGAATATCTACACGTGATGCTCTGGTCTTTCTACTACACGTCTATTGTAGCTTAATCTTTTCCTCGGGGGGGAAGAGGCTGTTTAGAATCATACCTCCAACCGACATTAACCCTGTTGGATTATAACTAGGGGCAAATCCGGATATTGGTATACGGCCCTATTATTCTATGGGTCTACGCACCTAGTAGGCACCCACGCAGTCTCTGGCCCGACCCCA 99 . AC=2;LEN=264;NA=1;NS=1;TYPE=ins GT 1|1