Merge pull request #754 from broadinstitute/rhl_variant_array_exception
Do not process a variant if it is too large (> readLength), and log an e...
This commit is contained in:
commit
c4fcd70a88
|
|
@ -51,6 +51,7 @@
|
|||
|
||||
package org.broadinstitute.gatk.tools.walkers.simulatereads;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import cern.jet.random.Poisson;
|
||||
import cern.jet.random.engine.MersenneTwister;
|
||||
import htsjdk.samtools.SAMFileHeader;
|
||||
|
|
@ -108,6 +109,7 @@ import java.util.*;
|
|||
|
||||
@Reference(window=@Window(start=-200,stop=200))
|
||||
public class SimulateReadsForVariants extends RodWalker<Integer, Integer> {
|
||||
private static Logger logger = Logger.getLogger(SimulateReadsForVariants.class);
|
||||
|
||||
@ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
/**
|
||||
|
|
@ -248,20 +250,21 @@ public class SimulateReadsForVariants extends RodWalker<Integer, Integer> {
|
|||
if ( vc == null || !vc.isBiallelic() )
|
||||
return 0;
|
||||
|
||||
if ( verbose ) logger.info(String.format("Generating reads for %s", vc));
|
||||
if ( !generateReadsForVariant(vc, ref, useAFAsAlleleFraction) )
|
||||
return 0;
|
||||
|
||||
generateReadsForVariant(vc, ref, useAFAsAlleleFraction);
|
||||
if ( verbose ) logger.info(String.format("Generating reads for %s", vc));
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Contstructs an artifical haplotype given an allele and original reference context
|
||||
* Constructs an artificial haplotype given an allele and original reference context
|
||||
*
|
||||
* @param allele the allele to model (can be reference)
|
||||
* @param refLength the length of the reference allele
|
||||
* @param ref the original reference context
|
||||
* @return a non-null ArtificialHaplotype
|
||||
* @return the artificial haplotype or null if the readLength parameter is too small to hold the allele and reference
|
||||
*/
|
||||
private ArtificialHaplotype constructHaplotype(final Allele allele, final int refLength, final ReferenceContext ref) {
|
||||
|
||||
|
|
@ -269,24 +272,63 @@ public class SimulateReadsForVariants extends RodWalker<Integer, Integer> {
|
|||
|
||||
final int alleleLength = allele.getBases().length;
|
||||
final int halfAlleleLength = (alleleLength + 1) / 2;
|
||||
final int refContextLength = ref.getBases().length;
|
||||
|
||||
// this is how far back to move from the event to start copying bases
|
||||
final int offset = halfReadLength - halfAlleleLength;
|
||||
|
||||
// number of bases copied to the haplotype
|
||||
int copiedCount = 0;
|
||||
|
||||
// copy bases before the event
|
||||
final int locusPosOnRefContext = (int)(ref.getLocus().getStart() - ref.getWindow().getStart());
|
||||
int posOnRefContext = locusPosOnRefContext - offset;
|
||||
System.arraycopy(ref.getBases(), posOnRefContext, haplotype, 0, offset);
|
||||
int copiedCount = offset;
|
||||
if ( offset >= 0 && posOnRefContext >= 0 && posOnRefContext + offset <= refContextLength )
|
||||
{
|
||||
System.arraycopy(ref.getBases(), posOnRefContext, haplotype, 0, offset);
|
||||
copiedCount = offset;
|
||||
}
|
||||
else
|
||||
{
|
||||
String msg = new String("Can not copy reference bases to haplotype: ");
|
||||
if ( offset < 0 )
|
||||
msg += "Read length(" + readLength + ") < Allele length(" + alleleLength + ")";
|
||||
else
|
||||
msg += "Reference position(" + posOnRefContext + ") < 0";
|
||||
logger.info(msg);
|
||||
return null;
|
||||
}
|
||||
|
||||
// copy the event bases
|
||||
System.arraycopy(allele.getBases(), 0, haplotype, copiedCount, alleleLength);
|
||||
copiedCount += alleleLength;
|
||||
if ( copiedCount + alleleLength <= readLength )
|
||||
{
|
||||
System.arraycopy(allele.getBases(), 0, haplotype, copiedCount, alleleLength);
|
||||
copiedCount += alleleLength;
|
||||
}
|
||||
else
|
||||
{
|
||||
String msg = new String("Can not copy allele bases to haplotype: ");
|
||||
msg += "Read length(" + readLength + ") < Allele length(" + alleleLength + ") + copied count(" + copiedCount + ")";
|
||||
logger.info(msg);
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
// copy bases after the event
|
||||
posOnRefContext = locusPosOnRefContext + refLength;
|
||||
final int remainder = readLength - copiedCount;
|
||||
System.arraycopy(ref.getBases(), posOnRefContext, haplotype, copiedCount, remainder);
|
||||
if ( remainder > 0 && posOnRefContext + remainder <= refContextLength )
|
||||
{
|
||||
System.arraycopy(ref.getBases(), posOnRefContext, haplotype, copiedCount, remainder);
|
||||
copiedCount += remainder;
|
||||
}
|
||||
else
|
||||
{
|
||||
String msg = new String("Can not copy remaining reference bases to haplotype: ");
|
||||
msg += "Read length(" + readLength + ") <= Copied count(" + copiedCount + ")";
|
||||
logger.info(msg);
|
||||
return null;
|
||||
}
|
||||
|
||||
final String cigar;
|
||||
if ( refLength == alleleLength )
|
||||
|
|
@ -303,12 +345,17 @@ public class SimulateReadsForVariants extends RodWalker<Integer, Integer> {
|
|||
* @param vc the (bi-allelic) variant context for which to generate artificial reads
|
||||
* @param ref the original reference context
|
||||
* @param useAFAsAlleleFraction use AF tag to indicate allele fraction
|
||||
* @return true if successful generation of artificial reads for the variant, false otherwise
|
||||
*/
|
||||
private void generateReadsForVariant(final VariantContext vc, final ReferenceContext ref, final boolean useAFAsAlleleFraction) {
|
||||
private boolean generateReadsForVariant(final VariantContext vc, final ReferenceContext ref, final boolean useAFAsAlleleFraction) {
|
||||
|
||||
final int refLength = vc.getReference().getBases().length;
|
||||
final ArtificialHaplotype refHap = constructHaplotype(vc.getReference(), refLength, ref);
|
||||
final ArtificialHaplotype altHap = constructHaplotype(vc.getAlternateAllele(0), refLength, ref);
|
||||
final ArtificialHaplotype refHap = constructHaplotype(vc.getReference(), refLength, ref);
|
||||
if ( refHap == null )
|
||||
return false;
|
||||
final ArtificialHaplotype altHap = constructHaplotype(vc.getAlternateAllele(0), refLength, ref);
|
||||
if ( altHap == null )
|
||||
return false;
|
||||
|
||||
final double refAlleleFraction = (useAFAsAlleleFraction)?1-vc.getAttributeAsDouble(VCFConstants.ALLELE_FREQUENCY_KEY, 0.5):0.5;
|
||||
|
||||
|
|
@ -328,6 +375,8 @@ public class SimulateReadsForVariants extends RodWalker<Integer, Integer> {
|
|||
writeRead(readBases, vc.getChr(), vc.getStart() - haplotype.offset, haplotype.cigar, g.getSampleName(), gi++ % 2 == 0);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -110,4 +110,24 @@ public class SimulateReadsForVariantsIntegrationTest extends WalkerTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLongInsertFailure() {
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forLongInsert.vcf -o %s",
|
||||
1,
|
||||
Arrays.asList("bb412c1fc8f95523dd2fc623d53dbeec"));
|
||||
executeTest("testLongInsertFailure", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLongInsertSuccess() {
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-RL 269 -T SimulateReadsForVariants --no_pg_tag -R " + b37KGReference + " -V " + publicTestDir + "forLongInsert.vcf -o %s",
|
||||
1,
|
||||
Arrays.asList("9236320c470cd8d6759c21b79206f63f"));
|
||||
executeTest("testLongInsertSuccess", spec);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,16 @@
|
|||
##fileformat=VCFv4.1
|
||||
##fileDate=20140820
|
||||
##source=mutatrix population genome simulator
|
||||
##seed=100
|
||||
##reference=ref.fasta
|
||||
##phasing=true
|
||||
##commandline=mutatrix ref.fasta --file-prefix mutations/ --sample-prefix 0 -n 1 --ploidy 2 --random-seed 100 --snp-rate 0.001
|
||||
##INFO=<ID=AC,Number=A,Type=Integer,Description="Alternate allele count">
|
||||
##INFO=<ID=TYPE,Number=A,Type=String,Description="Type of each allele (snp, ins, del, mnp, complex)">
|
||||
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples at the site">
|
||||
##INFO=<ID=NA,Number=1,Type=Integer,Description="Number of alternate alleles">
|
||||
##INFO=<ID=LEN,Number=A,Type=Integer,Description="Length of each alternate allele">
|
||||
##INFO=<ID=MICROSAT,Number=0,Type=Flag,Description="Generated at a sequence repeat loci">
|
||||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
|
||||
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 01
|
||||
20 10000000 . T TGTCTTAGTGTCGTATGGGCTTCCATATTCAACTTTCTTCTATAAGTAGAATATCTACACGTGATGCTCTGGTCTTTCTACTACACGTCTATTGTAGCTTAATCTTTTCCTCGGGGGGGAAGAGGCTGTTTAGAATCATACCTCCAACCGACATTAACCCTGTTGGATTATAACTAGGGGCAAATCCGGATATTGGTATACGGCCCTATTATTCTATGGGTCTACGCACCTAGTAGGCACCCACGCAGTCTCTGGCCCGACCCCA 99 . AC=2;LEN=264;NA=1;NS=1;TYPE=ins GT 1|1
|
||||
Binary file not shown.
Loading…
Reference in New Issue