Added new functionality to the FastaAlternateReferenceMaker to have it output IUPAC codes for het sites.
Enable it with the new --useIUPAC argument. Added both unit and integration tests for the new functionality - and fixed up the exising tests once I was in there.
This commit is contained in:
parent
4e74e77e74
commit
ffaf92f871
|
|
@ -47,38 +47,63 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.fasta;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class FastaAlternateReferenceIntegrationTest extends WalkerTest {
|
||||
|
||||
@Test
|
||||
public void testIntervals() {
|
||||
public void testReferenceOnly() {
|
||||
|
||||
String md5_1 = "328d2d52cedfdc52da7d1abff487633d";
|
||||
|
||||
WalkerTestSpec spec1a = new WalkerTestSpec(
|
||||
"-T FastaAlternateReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500 -L 1:10,100,000-10,101,000 -L 1:10,900,000-10,900,001 -o %s",
|
||||
1,
|
||||
Arrays.asList(md5_1));
|
||||
executeTest("testFastaReference", spec1a);
|
||||
|
||||
WalkerTestSpec spec1b = new WalkerTestSpec(
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T FastaReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500 -L 1:10,100,000-10,101,000 -L 1:10,900,000-10,900,001 -o %s",
|
||||
1,
|
||||
Arrays.asList(md5_1));
|
||||
executeTest("testFastaReference", spec1b);
|
||||
Arrays.asList("328d2d52cedfdc52da7d1abff487633d"));
|
||||
executeTest("test FastaReference", spec);
|
||||
}
|
||||
|
||||
WalkerTestSpec spec2 = new WalkerTestSpec(
|
||||
@Test
|
||||
public void testIndelsAndSnpMask() {
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 --snpmask:vcf " + b36dbSNP129 + " -L 1:10,075,000-10,075,380 -L 1:10,093,447-10,093,847 -L 1:10,271,252-10,271,452 -o %s",
|
||||
1,
|
||||
Arrays.asList("ef481be9962e21d09847b8a1d4a4ff65"));
|
||||
executeTest("testFastaAlternateReferenceIndels", spec2);
|
||||
executeTest("test indels", spec);
|
||||
}
|
||||
|
||||
WalkerTestSpec spec3 = new WalkerTestSpec(
|
||||
@Test
|
||||
public void testSnps() {
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + GATKDataLocation + "dbsnp_129_b36.vcf -L 1:10,023,400-10,023,500 -L 1:10,029,200-10,029,500 -o %s",
|
||||
1,
|
||||
Arrays.asList("8b6cd2e20c381f9819aab2d270f5e641"));
|
||||
executeTest("testFastaAlternateReferenceSnps", spec3);
|
||||
executeTest("test SNPs", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBadIupacInput() {
|
||||
|
||||
// cannot use 'expectedExceptions = UserException.BadInput.class' because it technically gets thrown as a RuntimeException by the engine
|
||||
try {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T FastaAlternateReferenceMaker -R " + b36KGReference + " --useIUPAC -V " + GATKDataLocation + "dbsnp_129_b36.vcf -L 1:10,023,400-10,023,500 -L 1:10,029,200-10,029,500 -o %s",
|
||||
1,
|
||||
Arrays.asList("FAILFAILFAILFAILFAILFAILFAILFAIL"));
|
||||
executeTest("test bad input", spec);
|
||||
} catch (Exception e) {} // do nothing
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIupac() {
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T FastaAlternateReferenceMaker -R " + b37KGReference + " --useIUPAC -V " + privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf -L 20:61050-66380 -o %s",
|
||||
1,
|
||||
Arrays.asList("5feb2a576ff2ed1745a007eaa36448b3"));
|
||||
executeTest("test iupac", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,21 +25,29 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.fasta;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.help.HelpConstants;
|
||||
import org.broadinstitute.variant.variantcontext.Genotype;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -88,53 +96,94 @@ import java.util.List;
|
|||
public class FastaAlternateReferenceMaker extends FastaReferenceMaker {
|
||||
|
||||
/**
|
||||
* Variants from these input files are used by this tool to construct an alternate reference.
|
||||
* Variants from this input file are used by this tool to construct an alternate reference.
|
||||
*/
|
||||
@Input(fullName = "variant", shortName = "V", doc="variants to model", required=false)
|
||||
public List<RodBinding<VariantContext>> variants = Collections.emptyList();
|
||||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
|
||||
/**
|
||||
* Snps from this file are used as a mask when constructing the alternate reference.
|
||||
* Snps from this file are used as a mask (inserting N's in the sequence) when constructing the alternate reference
|
||||
* (regardless of whether they overlap a variant site).
|
||||
*/
|
||||
@Input(fullName="snpmask", shortName = "snpmask", doc="SNP mask VCF file", required=false)
|
||||
public RodBinding<VariantContext> snpmask;
|
||||
protected RodBinding<VariantContext> snpmask;
|
||||
|
||||
/**
|
||||
* This option works only for VCFs with genotypes for exactly one sample; anything else will generate an error.
|
||||
* Non-diploid (or non-called) genotypes are ignored.
|
||||
*/
|
||||
@Argument(fullName="useIUPAC", shortName="useIUPAC", doc = "If specified, heterozygous SNP sites will be output using IUPAC codes", required=false)
|
||||
protected boolean useIUPACcodes = false;
|
||||
private String iupacSample = null;
|
||||
|
||||
private int deletionBasesRemaining = 0;
|
||||
|
||||
public Pair<GenomeLoc, String> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
@Override
|
||||
public void initialize() {
|
||||
super.initialize();
|
||||
if ( useIUPACcodes ) {
|
||||
final List<String> rodName = Arrays.asList(variantCollection.variants.getName());
|
||||
final Set<String> samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName);
|
||||
if ( samples.size() != 1 )
|
||||
throw new UserException.BadInput("the --useIUPAC option works only on VCF files with genotypes for exactly one sample, but the input file has " + samples.size() + " samples");
|
||||
iupacSample = samples.iterator().next();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Pair<GenomeLoc, String> map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
|
||||
|
||||
if (deletionBasesRemaining > 0) {
|
||||
deletionBasesRemaining--;
|
||||
return new Pair<GenomeLoc, String>(context.getLocation(), "");
|
||||
return new Pair<>(context.getLocation(), "");
|
||||
}
|
||||
|
||||
String refBase = String.valueOf((char)ref.getBase());
|
||||
final String refBase = String.valueOf((char)ref.getBase());
|
||||
|
||||
// Check to see if we have a called snp
|
||||
for ( VariantContext vc : tracker.getValues(variants, ref.getLocus()) ) {
|
||||
for ( final VariantContext vc : tracker.getValues(variantCollection.variants, ref.getLocus()) ) {
|
||||
if ( vc.isFiltered() )
|
||||
continue;
|
||||
|
||||
if ( vc.isSimpleDeletion()) {
|
||||
deletionBasesRemaining = vc.getReference().length() - 1;
|
||||
// delete the next n bases, not this one
|
||||
return new Pair<GenomeLoc, String>(context.getLocation(), refBase);
|
||||
return new Pair<>(context.getLocation(), refBase);
|
||||
} else if ( vc.isSimpleInsertion()) {
|
||||
return new Pair<GenomeLoc, String>(context.getLocation(), vc.getAlternateAllele(0).toString());
|
||||
return new Pair<>(context.getLocation(), vc.getAlternateAllele(0).toString());
|
||||
} else if (vc.isSNP()) {
|
||||
return new Pair<GenomeLoc, String>(context.getLocation(), vc.getAlternateAllele(0).toString());
|
||||
final String base = useIUPACcodes ? getIUPACbase(vc.getGenotype(iupacSample), refBase) : vc.getAlternateAllele(0).toString();
|
||||
return new Pair<>(context.getLocation(), base);
|
||||
}
|
||||
}
|
||||
|
||||
// if we don't have a called site, and we have a mask at this site, mask it
|
||||
for ( VariantContext vc : tracker.getValues(snpmask) ) {
|
||||
for ( final VariantContext vc : tracker.getValues(snpmask) ) {
|
||||
if ( vc.isSNP()) {
|
||||
return new Pair<GenomeLoc, String>(context.getLocation(), "N");
|
||||
return new Pair<>(context.getLocation(), "N");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// if we got here then we're just ref
|
||||
return new Pair<GenomeLoc, String>(context.getLocation(), refBase);
|
||||
return new Pair<>(context.getLocation(), refBase);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the IUPAC encoding for the given genotype or the reference base if not possible
|
||||
*
|
||||
* @param genotype the genotype to encode
|
||||
* @param ref the reference base
|
||||
* @return non-null, non-empty String
|
||||
*/
|
||||
private String getIUPACbase(final Genotype genotype, final String ref) {
|
||||
if ( genotype == null )
|
||||
throw new IllegalStateException("The genotype is null for sample " + iupacSample);
|
||||
|
||||
if ( !genotype.isHet() )
|
||||
return genotype.isHom() ? genotype.getAllele(0).getBaseString() : ref;
|
||||
|
||||
final byte allele1 = genotype.getAllele(0).getBases()[0];
|
||||
final byte allele2 = genotype.getAllele(1).getBases()[0];
|
||||
return new String(new byte[] {BaseUtils.basesToIUPAC(allele1, allele2)});
|
||||
}
|
||||
}
|
||||
|
|
@ -284,6 +284,36 @@ public class BaseUtils {
|
|||
return bases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a pair of bases to their IUPAC ambiguity code
|
||||
*
|
||||
* @param base1 1st base
|
||||
* @param base2 2nd base
|
||||
* @return byte
|
||||
*/
|
||||
static public byte basesToIUPAC(final byte base1, final byte base2) {
|
||||
// ensure that the bases come in order
|
||||
if ( base2 < base1 )
|
||||
return basesToIUPAC(base2, base1);
|
||||
|
||||
// ensure that the bases are regular ones
|
||||
if ( !isRegularBase(base1) || !isRegularBase(base2) )
|
||||
return Base.N.base;
|
||||
|
||||
// IUPAC codes are not needed if the bases are identical
|
||||
if ( basesAreEqual(base1, base2) )
|
||||
return base1;
|
||||
|
||||
if ( base1 == Base.A.base )
|
||||
return (byte)(base2 == Base.C.base ? 'M' : (base2 == Base.G.base ? 'R' : 'W'));
|
||||
|
||||
if ( base1 == Base.C.base )
|
||||
return (byte)(base2 == Base.G.base ? 'S' : 'Y');
|
||||
|
||||
// the only possibility left is G/T
|
||||
return 'K';
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a simple base to a base index
|
||||
*
|
||||
|
|
|
|||
|
|
@ -64,6 +64,24 @@ public class BaseUtilsUnitTest extends BaseTest {
|
|||
Assert.assertEquals(b1[i], b2[i]);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testConvertBasesToIUPAC() {
|
||||
|
||||
for ( final BaseUtils.Base b : BaseUtils.Base.values() ) {
|
||||
if ( BaseUtils.isRegularBase(b.base) )
|
||||
Assert.assertEquals(BaseUtils.basesToIUPAC(b.base, b.base), b.base, "testing same base");
|
||||
}
|
||||
|
||||
Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'A', (byte)'X'), 'N', "testing non-standard base");
|
||||
Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'X', (byte)'A'), 'N', "testing non-standard base");
|
||||
Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'X', (byte)'X'), 'N', "testing non-standard base");
|
||||
|
||||
Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'A', (byte)'T'), 'W', "testing A/T=W");
|
||||
Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'T', (byte)'A'), 'W', "testing T/A=W");
|
||||
Assert.assertEquals(BaseUtils.basesToIUPAC((byte) 'G', (byte) 'T'), 'K', "testing G/T=K");
|
||||
Assert.assertEquals(BaseUtils.basesToIUPAC((byte) 'T', (byte) 'G'), 'K', "testing T/G=K");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTransitionTransversion() {
|
||||
logger.warn("Executing testTransitionTransversion");
|
||||
|
|
|
|||
Loading…
Reference in New Issue