Merge pull request #556 from broadinstitute/eb_use_iupac_in_FARM

Added new functionality to the FastaAlternateReferenceMaker to have it o...
This commit is contained in:
Eric Banks 2014-03-12 14:33:06 -04:00
commit d3a4b57491
4 changed files with 155 additions and 33 deletions

View File

@ -47,38 +47,63 @@
package org.broadinstitute.sting.gatk.walkers.fasta;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.Test;
import java.util.Arrays;
public class FastaAlternateReferenceIntegrationTest extends WalkerTest {
@Test
public void testIntervals() {
public void testReferenceOnly() {
String md5_1 = "328d2d52cedfdc52da7d1abff487633d";
WalkerTestSpec spec1a = new WalkerTestSpec(
"-T FastaAlternateReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500 -L 1:10,100,000-10,101,000 -L 1:10,900,000-10,900,001 -o %s",
1,
Arrays.asList(md5_1));
executeTest("testFastaReference", spec1a);
WalkerTestSpec spec1b = new WalkerTestSpec(
WalkerTestSpec spec = new WalkerTestSpec(
"-T FastaReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500 -L 1:10,100,000-10,101,000 -L 1:10,900,000-10,900,001 -o %s",
1,
Arrays.asList(md5_1));
executeTest("testFastaReference", spec1b);
Arrays.asList("328d2d52cedfdc52da7d1abff487633d"));
executeTest("test FastaReference", spec);
}
WalkerTestSpec spec2 = new WalkerTestSpec(
@Test
public void testIndelsAndSnpMask() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 --snpmask:vcf " + b36dbSNP129 + " -L 1:10,075,000-10,075,380 -L 1:10,093,447-10,093,847 -L 1:10,271,252-10,271,452 -o %s",
1,
Arrays.asList("ef481be9962e21d09847b8a1d4a4ff65"));
executeTest("testFastaAlternateReferenceIndels", spec2);
executeTest("test indels", spec);
}
WalkerTestSpec spec3 = new WalkerTestSpec(
@Test
public void testSnps() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + GATKDataLocation + "dbsnp_129_b36.vcf -L 1:10,023,400-10,023,500 -L 1:10,029,200-10,029,500 -o %s",
1,
Arrays.asList("8b6cd2e20c381f9819aab2d270f5e641"));
executeTest("testFastaAlternateReferenceSnps", spec3);
executeTest("test SNPs", spec);
}
@Test
public void testBadIupacInput() {
// cannot use 'expectedExceptions = UserException.BadInput.class' because it technically gets thrown as a RuntimeException by the engine
try {
WalkerTestSpec spec = new WalkerTestSpec(
"-T FastaAlternateReferenceMaker -R " + b36KGReference + " --useIUPAC -V " + GATKDataLocation + "dbsnp_129_b36.vcf -L 1:10,023,400-10,023,500 -L 1:10,029,200-10,029,500 -o %s",
1,
Arrays.asList("FAILFAILFAILFAILFAILFAILFAILFAIL"));
executeTest("test bad input", spec);
} catch (Exception e) {} // do nothing
}
@Test
public void testIupac() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T FastaAlternateReferenceMaker -R " + b37KGReference + " --useIUPAC -V " + privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf -L 20:61050-66380 -o %s",
1,
Arrays.asList("5feb2a576ff2ed1745a007eaa36448b3"));
executeTest("test iupac", spec);
}
}

View File

@ -25,21 +25,29 @@
package org.broadinstitute.sting.gatk.walkers.fasta;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.variant.variantcontext.Genotype;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.util.Collections;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
/**
@ -88,53 +96,94 @@ import java.util.List;
public class FastaAlternateReferenceMaker extends FastaReferenceMaker {
/**
* Variants from these input files are used by this tool to construct an alternate reference.
* Variants from this input file are used by this tool to construct an alternate reference.
*/
@Input(fullName = "variant", shortName = "V", doc="variants to model", required=false)
public List<RodBinding<VariantContext>> variants = Collections.emptyList();
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
/**
* Snps from this file are used as a mask when constructing the alternate reference.
* Snps from this file are used as a mask (inserting N's in the sequence) when constructing the alternate reference
* (regardless of whether they overlap a variant site).
*/
@Input(fullName="snpmask", shortName = "snpmask", doc="SNP mask VCF file", required=false)
public RodBinding<VariantContext> snpmask;
protected RodBinding<VariantContext> snpmask;
/**
* This option works only for VCFs with genotypes for exactly one sample; anything else will generate an error.
* Non-diploid (or non-called) genotypes are ignored.
*/
@Argument(fullName="useIUPAC", shortName="useIUPAC", doc = "If specified, heterozygous SNP sites will be output using IUPAC codes", required=false)
protected boolean useIUPACcodes = false;
private String iupacSample = null;
private int deletionBasesRemaining = 0;
public Pair<GenomeLoc, String> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
@Override
public void initialize() {
super.initialize();
if ( useIUPACcodes ) {
final List<String> rodName = Arrays.asList(variantCollection.variants.getName());
final Set<String> samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName);
if ( samples.size() != 1 )
throw new UserException.BadInput("the --useIUPAC option works only on VCF files with genotypes for exactly one sample, but the input file has " + samples.size() + " samples");
iupacSample = samples.iterator().next();
}
}
@Override
public Pair<GenomeLoc, String> map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
if (deletionBasesRemaining > 0) {
deletionBasesRemaining--;
return new Pair<GenomeLoc, String>(context.getLocation(), "");
return new Pair<>(context.getLocation(), "");
}
String refBase = String.valueOf((char)ref.getBase());
final String refBase = String.valueOf((char)ref.getBase());
// Check to see if we have a called snp
for ( VariantContext vc : tracker.getValues(variants, ref.getLocus()) ) {
for ( final VariantContext vc : tracker.getValues(variantCollection.variants, ref.getLocus()) ) {
if ( vc.isFiltered() )
continue;
if ( vc.isSimpleDeletion()) {
deletionBasesRemaining = vc.getReference().length() - 1;
// delete the next n bases, not this one
return new Pair<GenomeLoc, String>(context.getLocation(), refBase);
return new Pair<>(context.getLocation(), refBase);
} else if ( vc.isSimpleInsertion()) {
return new Pair<GenomeLoc, String>(context.getLocation(), vc.getAlternateAllele(0).toString());
return new Pair<>(context.getLocation(), vc.getAlternateAllele(0).toString());
} else if (vc.isSNP()) {
return new Pair<GenomeLoc, String>(context.getLocation(), vc.getAlternateAllele(0).toString());
final String base = useIUPACcodes ? getIUPACbase(vc.getGenotype(iupacSample), refBase) : vc.getAlternateAllele(0).toString();
return new Pair<>(context.getLocation(), base);
}
}
// if we don't have a called site, and we have a mask at this site, mask it
for ( VariantContext vc : tracker.getValues(snpmask) ) {
for ( final VariantContext vc : tracker.getValues(snpmask) ) {
if ( vc.isSNP()) {
return new Pair<GenomeLoc, String>(context.getLocation(), "N");
return new Pair<>(context.getLocation(), "N");
}
}
// if we got here then we're just ref
return new Pair<GenomeLoc, String>(context.getLocation(), refBase);
return new Pair<>(context.getLocation(), refBase);
}
/**
* Returns the IUPAC encoding for the given genotype or the reference base if not possible
*
* @param genotype the genotype to encode
* @param ref the reference base
* @return non-null, non-empty String
*/
private String getIUPACbase(final Genotype genotype, final String ref) {
if ( genotype == null )
throw new IllegalStateException("The genotype is null for sample " + iupacSample);
if ( !genotype.isHet() )
return genotype.isHom() ? genotype.getAllele(0).getBaseString() : ref;
final byte allele1 = genotype.getAllele(0).getBases()[0];
final byte allele2 = genotype.getAllele(1).getBases()[0];
return new String(new byte[] {BaseUtils.basesToIUPAC(allele1, allele2)});
}
}

View File

@ -284,6 +284,36 @@ public class BaseUtils {
return bases;
}
/**
* Converts a pair of bases to their IUPAC ambiguity code
*
* @param base1 1st base
* @param base2 2nd base
* @return byte
*/
static public byte basesToIUPAC(final byte base1, final byte base2) {
// ensure that the bases come in order
if ( base2 < base1 )
return basesToIUPAC(base2, base1);
// ensure that the bases are regular ones
if ( !isRegularBase(base1) || !isRegularBase(base2) )
return Base.N.base;
// IUPAC codes are not needed if the bases are identical
if ( basesAreEqual(base1, base2) )
return base1;
if ( base1 == Base.A.base )
return (byte)(base2 == Base.C.base ? 'M' : (base2 == Base.G.base ? 'R' : 'W'));
if ( base1 == Base.C.base )
return (byte)(base2 == Base.G.base ? 'S' : 'Y');
// the only possibility left is G/T
return 'K';
}
/**
* Converts a simple base to a base index
*

View File

@ -64,6 +64,24 @@ public class BaseUtilsUnitTest extends BaseTest {
Assert.assertEquals(b1[i], b2[i]);
}
@Test
public void testConvertBasesToIUPAC() {
for ( final BaseUtils.Base b : BaseUtils.Base.values() ) {
if ( BaseUtils.isRegularBase(b.base) )
Assert.assertEquals(BaseUtils.basesToIUPAC(b.base, b.base), b.base, "testing same base");
}
Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'A', (byte)'X'), 'N', "testing non-standard base");
Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'X', (byte)'A'), 'N', "testing non-standard base");
Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'X', (byte)'X'), 'N', "testing non-standard base");
Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'A', (byte)'T'), 'W', "testing A/T=W");
Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'T', (byte)'A'), 'W', "testing T/A=W");
Assert.assertEquals(BaseUtils.basesToIUPAC((byte) 'G', (byte) 'T'), 'K', "testing G/T=K");
Assert.assertEquals(BaseUtils.basesToIUPAC((byte) 'T', (byte) 'G'), 'K', "testing T/G=K");
}
@Test
public void testTransitionTransversion() {
logger.warn("Executing testTransitionTransversion");