diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java index a650e0f6f..58ff0a7b3 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java @@ -47,38 +47,63 @@ package org.broadinstitute.sting.gatk.walkers.fasta; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; import java.util.Arrays; public class FastaAlternateReferenceIntegrationTest extends WalkerTest { + @Test - public void testIntervals() { + public void testReferenceOnly() { - String md5_1 = "328d2d52cedfdc52da7d1abff487633d"; - - WalkerTestSpec spec1a = new WalkerTestSpec( - "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500 -L 1:10,100,000-10,101,000 -L 1:10,900,000-10,900,001 -o %s", - 1, - Arrays.asList(md5_1)); - executeTest("testFastaReference", spec1a); - - WalkerTestSpec spec1b = new WalkerTestSpec( + WalkerTestSpec spec = new WalkerTestSpec( "-T FastaReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500 -L 1:10,100,000-10,101,000 -L 1:10,900,000-10,900,001 -o %s", 1, - Arrays.asList(md5_1)); - executeTest("testFastaReference", spec1b); + Arrays.asList("328d2d52cedfdc52da7d1abff487633d")); + executeTest("test FastaReference", spec); + } - WalkerTestSpec spec2 = new WalkerTestSpec( + @Test + public void testIndelsAndSnpMask() { + + WalkerTestSpec spec = new WalkerTestSpec( "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 --snpmask:vcf " + b36dbSNP129 + " -L 1:10,075,000-10,075,380 -L 1:10,093,447-10,093,847 -L 1:10,271,252-10,271,452 -o %s", 1, Arrays.asList("ef481be9962e21d09847b8a1d4a4ff65")); - executeTest("testFastaAlternateReferenceIndels", spec2); + executeTest("test indels", spec); + } - WalkerTestSpec spec3 = new WalkerTestSpec( + @Test + public void testSnps() { + + WalkerTestSpec spec = new WalkerTestSpec( "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + GATKDataLocation + "dbsnp_129_b36.vcf -L 1:10,023,400-10,023,500 -L 1:10,029,200-10,029,500 -o %s", 1, Arrays.asList("8b6cd2e20c381f9819aab2d270f5e641")); - executeTest("testFastaAlternateReferenceSnps", spec3); + executeTest("test SNPs", spec); + } + + @Test + public void testBadIupacInput() { + + // cannot use 'expectedExceptions = UserException.BadInput.class' because it technically gets thrown as a RuntimeException by the engine + try { + WalkerTestSpec spec = new WalkerTestSpec( + "-T FastaAlternateReferenceMaker -R " + b36KGReference + " --useIUPAC -V " + GATKDataLocation + "dbsnp_129_b36.vcf -L 1:10,023,400-10,023,500 -L 1:10,029,200-10,029,500 -o %s", + 1, + Arrays.asList("FAILFAILFAILFAILFAILFAILFAILFAIL")); + executeTest("test bad input", spec); + } catch (Exception e) {} // do nothing + } + + @Test + public void testIupac() { + + WalkerTestSpec spec = new WalkerTestSpec( + "-T FastaAlternateReferenceMaker -R " + b37KGReference + " --useIUPAC -V " + privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf -L 20:61050-66380 -o %s", + 1, + Arrays.asList("5feb2a576ff2ed1745a007eaa36448b3")); + executeTest("test iupac", spec); } } diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java index d2f2e32b3..2ff9ade5f 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java @@ -25,21 +25,29 @@ package org.broadinstitute.sting.gatk.walkers.fasta; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.variant.variantcontext.Genotype; import org.broadinstitute.variant.variantcontext.VariantContext; -import java.util.Collections; +import java.util.Arrays; import java.util.List; +import java.util.Set; /** @@ -88,53 +96,94 @@ import java.util.List; public class FastaAlternateReferenceMaker extends FastaReferenceMaker { /** - * Variants from these input files are used by this tool to construct an alternate reference. + * Variants from this input file are used by this tool to construct an alternate reference. */ - @Input(fullName = "variant", shortName = "V", doc="variants to model", required=false) - public List> variants = Collections.emptyList(); + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); /** - * Snps from this file are used as a mask when constructing the alternate reference. + * Snps from this file are used as a mask (inserting N's in the sequence) when constructing the alternate reference + * (regardless of whether they overlap a variant site). */ @Input(fullName="snpmask", shortName = "snpmask", doc="SNP mask VCF file", required=false) - public RodBinding snpmask; + protected RodBinding snpmask; + + /** + * This option works only for VCFs with genotypes for exactly one sample; anything else will generate an error. + * Non-diploid (or non-called) genotypes are ignored. + */ + @Argument(fullName="useIUPAC", shortName="useIUPAC", doc = "If specified, heterozygous SNP sites will be output using IUPAC codes", required=false) + protected boolean useIUPACcodes = false; + private String iupacSample = null; private int deletionBasesRemaining = 0; - public Pair map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + @Override + public void initialize() { + super.initialize(); + if ( useIUPACcodes ) { + final List rodName = Arrays.asList(variantCollection.variants.getName()); + final Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName); + if ( samples.size() != 1 ) + throw new UserException.BadInput("the --useIUPAC option works only on VCF files with genotypes for exactly one sample, but the input file has " + samples.size() + " samples"); + iupacSample = samples.iterator().next(); + } + } + + @Override + public Pair map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { if (deletionBasesRemaining > 0) { deletionBasesRemaining--; - return new Pair(context.getLocation(), ""); + return new Pair<>(context.getLocation(), ""); } - String refBase = String.valueOf((char)ref.getBase()); + final String refBase = String.valueOf((char)ref.getBase()); // Check to see if we have a called snp - for ( VariantContext vc : tracker.getValues(variants, ref.getLocus()) ) { + for ( final VariantContext vc : tracker.getValues(variantCollection.variants, ref.getLocus()) ) { if ( vc.isFiltered() ) continue; if ( vc.isSimpleDeletion()) { deletionBasesRemaining = vc.getReference().length() - 1; // delete the next n bases, not this one - return new Pair(context.getLocation(), refBase); + return new Pair<>(context.getLocation(), refBase); } else if ( vc.isSimpleInsertion()) { - return new Pair(context.getLocation(), vc.getAlternateAllele(0).toString()); + return new Pair<>(context.getLocation(), vc.getAlternateAllele(0).toString()); } else if (vc.isSNP()) { - return new Pair(context.getLocation(), vc.getAlternateAllele(0).toString()); + final String base = useIUPACcodes ? getIUPACbase(vc.getGenotype(iupacSample), refBase) : vc.getAlternateAllele(0).toString(); + return new Pair<>(context.getLocation(), base); } } // if we don't have a called site, and we have a mask at this site, mask it - for ( VariantContext vc : tracker.getValues(snpmask) ) { + for ( final VariantContext vc : tracker.getValues(snpmask) ) { if ( vc.isSNP()) { - return new Pair(context.getLocation(), "N"); + return new Pair<>(context.getLocation(), "N"); } } - // if we got here then we're just ref - return new Pair(context.getLocation(), refBase); + return new Pair<>(context.getLocation(), refBase); + } + + /** + * Returns the IUPAC encoding for the given genotype or the reference base if not possible + * + * @param genotype the genotype to encode + * @param ref the reference base + * @return non-null, non-empty String + */ + private String getIUPACbase(final Genotype genotype, final String ref) { + if ( genotype == null ) + throw new IllegalStateException("The genotype is null for sample " + iupacSample); + + if ( !genotype.isHet() ) + return genotype.isHom() ? genotype.getAllele(0).getBaseString() : ref; + + final byte allele1 = genotype.getAllele(0).getBases()[0]; + final byte allele2 = genotype.getAllele(1).getBases()[0]; + return new String(new byte[] {BaseUtils.basesToIUPAC(allele1, allele2)}); } } \ No newline at end of file diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/BaseUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/BaseUtils.java index 1bbf481b1..c61e9fdfe 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/BaseUtils.java @@ -284,6 +284,36 @@ public class BaseUtils { return bases; } + /** + * Converts a pair of bases to their IUPAC ambiguity code + * + * @param base1 1st base + * @param base2 2nd base + * @return byte + */ + static public byte basesToIUPAC(final byte base1, final byte base2) { + // ensure that the bases come in order + if ( base2 < base1 ) + return basesToIUPAC(base2, base1); + + // ensure that the bases are regular ones + if ( !isRegularBase(base1) || !isRegularBase(base2) ) + return Base.N.base; + + // IUPAC codes are not needed if the bases are identical + if ( basesAreEqual(base1, base2) ) + return base1; + + if ( base1 == Base.A.base ) + return (byte)(base2 == Base.C.base ? 'M' : (base2 == Base.G.base ? 'R' : 'W')); + + if ( base1 == Base.C.base ) + return (byte)(base2 == Base.G.base ? 'S' : 'Y'); + + // the only possibility left is G/T + return 'K'; + } + /** * Converts a simple base to a base index * diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java index 6f645b34d..e38d472eb 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java @@ -64,6 +64,24 @@ public class BaseUtilsUnitTest extends BaseTest { Assert.assertEquals(b1[i], b2[i]); } + @Test + public void testConvertBasesToIUPAC() { + + for ( final BaseUtils.Base b : BaseUtils.Base.values() ) { + if ( BaseUtils.isRegularBase(b.base) ) + Assert.assertEquals(BaseUtils.basesToIUPAC(b.base, b.base), b.base, "testing same base"); + } + + Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'A', (byte)'X'), 'N', "testing non-standard base"); + Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'X', (byte)'A'), 'N', "testing non-standard base"); + Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'X', (byte)'X'), 'N', "testing non-standard base"); + + Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'A', (byte)'T'), 'W', "testing A/T=W"); + Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'T', (byte)'A'), 'W', "testing T/A=W"); + Assert.assertEquals(BaseUtils.basesToIUPAC((byte) 'G', (byte) 'T'), 'K', "testing G/T=K"); + Assert.assertEquals(BaseUtils.basesToIUPAC((byte) 'T', (byte) 'G'), 'K', "testing T/G=K"); + } + @Test public void testTransitionTransversion() { logger.warn("Executing testTransitionTransversion");