Added new functionality to the FastaAlternateReferenceMaker to have it output IUPAC codes for het sites.
Enable it with the new --useIUPAC argument. Added both unit and integration tests for the new functionality, and fixed up the existing tests while I was in there.
This commit is contained in:
parent
4e74e77e74
commit
ffaf92f871
|
|
@ -47,38 +47,63 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.fasta;
|
package org.broadinstitute.sting.gatk.walkers.fasta;
|
||||||
|
|
||||||
import org.broadinstitute.sting.WalkerTest;
|
import org.broadinstitute.sting.WalkerTest;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.testng.annotations.Test;
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
||||||
public class FastaAlternateReferenceIntegrationTest extends WalkerTest {
|
public class FastaAlternateReferenceIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testIntervals() {
|
public void testReferenceOnly() {
|
||||||
|
|
||||||
String md5_1 = "328d2d52cedfdc52da7d1abff487633d";
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
|
||||||
WalkerTestSpec spec1a = new WalkerTestSpec(
|
|
||||||
"-T FastaAlternateReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500 -L 1:10,100,000-10,101,000 -L 1:10,900,000-10,900,001 -o %s",
|
|
||||||
1,
|
|
||||||
Arrays.asList(md5_1));
|
|
||||||
executeTest("testFastaReference", spec1a);
|
|
||||||
|
|
||||||
WalkerTestSpec spec1b = new WalkerTestSpec(
|
|
||||||
"-T FastaReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500 -L 1:10,100,000-10,101,000 -L 1:10,900,000-10,900,001 -o %s",
|
"-T FastaReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500 -L 1:10,100,000-10,101,000 -L 1:10,900,000-10,900,001 -o %s",
|
||||||
1,
|
1,
|
||||||
Arrays.asList(md5_1));
|
Arrays.asList("328d2d52cedfdc52da7d1abff487633d"));
|
||||||
executeTest("testFastaReference", spec1b);
|
executeTest("test FastaReference", spec);
|
||||||
|
}
|
||||||
|
|
||||||
WalkerTestSpec spec2 = new WalkerTestSpec(
|
@Test
|
||||||
|
public void testIndelsAndSnpMask() {
|
||||||
|
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
"-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 --snpmask:vcf " + b36dbSNP129 + " -L 1:10,075,000-10,075,380 -L 1:10,093,447-10,093,847 -L 1:10,271,252-10,271,452 -o %s",
|
"-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 --snpmask:vcf " + b36dbSNP129 + " -L 1:10,075,000-10,075,380 -L 1:10,093,447-10,093,847 -L 1:10,271,252-10,271,452 -o %s",
|
||||||
1,
|
1,
|
||||||
Arrays.asList("ef481be9962e21d09847b8a1d4a4ff65"));
|
Arrays.asList("ef481be9962e21d09847b8a1d4a4ff65"));
|
||||||
executeTest("testFastaAlternateReferenceIndels", spec2);
|
executeTest("test indels", spec);
|
||||||
|
}
|
||||||
|
|
||||||
WalkerTestSpec spec3 = new WalkerTestSpec(
|
@Test
|
||||||
|
public void testSnps() {
|
||||||
|
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
"-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + GATKDataLocation + "dbsnp_129_b36.vcf -L 1:10,023,400-10,023,500 -L 1:10,029,200-10,029,500 -o %s",
|
"-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + GATKDataLocation + "dbsnp_129_b36.vcf -L 1:10,023,400-10,023,500 -L 1:10,029,200-10,029,500 -o %s",
|
||||||
1,
|
1,
|
||||||
Arrays.asList("8b6cd2e20c381f9819aab2d270f5e641"));
|
Arrays.asList("8b6cd2e20c381f9819aab2d270f5e641"));
|
||||||
executeTest("testFastaAlternateReferenceSnps", spec3);
|
executeTest("test SNPs", spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testBadIupacInput() {
|
||||||
|
|
||||||
|
// cannot use 'expectedExceptions = UserException.BadInput.class' because it technically gets thrown as a RuntimeException by the engine
|
||||||
|
try {
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
"-T FastaAlternateReferenceMaker -R " + b36KGReference + " --useIUPAC -V " + GATKDataLocation + "dbsnp_129_b36.vcf -L 1:10,023,400-10,023,500 -L 1:10,029,200-10,029,500 -o %s",
|
||||||
|
1,
|
||||||
|
Arrays.asList("FAILFAILFAILFAILFAILFAILFAILFAIL"));
|
||||||
|
executeTest("test bad input", spec);
|
||||||
|
} catch (Exception e) {} // do nothing
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testIupac() {
|
||||||
|
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
"-T FastaAlternateReferenceMaker -R " + b37KGReference + " --useIUPAC -V " + privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf -L 20:61050-66380 -o %s",
|
||||||
|
1,
|
||||||
|
Arrays.asList("5feb2a576ff2ed1745a007eaa36448b3"));
|
||||||
|
executeTest("test iupac", spec);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -25,21 +25,29 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.walkers.fasta;
|
package org.broadinstitute.sting.gatk.walkers.fasta;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||||
import org.broadinstitute.sting.commandline.Input;
|
import org.broadinstitute.sting.commandline.Input;
|
||||||
import org.broadinstitute.sting.commandline.RodBinding;
|
import org.broadinstitute.sting.commandline.RodBinding;
|
||||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||||
|
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.SampleUtils;
|
||||||
import org.broadinstitute.sting.utils.collections.Pair;
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||||
import org.broadinstitute.sting.utils.help.HelpConstants;
|
import org.broadinstitute.sting.utils.help.HelpConstants;
|
||||||
|
import org.broadinstitute.variant.variantcontext.Genotype;
|
||||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||||
|
|
||||||
import java.util.Collections;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -88,53 +96,94 @@ import java.util.List;
|
||||||
public class FastaAlternateReferenceMaker extends FastaReferenceMaker {
|
public class FastaAlternateReferenceMaker extends FastaReferenceMaker {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Variants from these input files are used by this tool to construct an alternate reference.
|
* Variants from this input file are used by this tool to construct an alternate reference.
|
||||||
*/
|
*/
|
||||||
@Input(fullName = "variant", shortName = "V", doc="variants to model", required=false)
|
@ArgumentCollection
|
||||||
public List<RodBinding<VariantContext>> variants = Collections.emptyList();
|
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Snps from this file are used as a mask when constructing the alternate reference.
|
* Snps from this file are used as a mask (inserting N's in the sequence) when constructing the alternate reference
|
||||||
|
* (regardless of whether they overlap a variant site).
|
||||||
*/
|
*/
|
||||||
@Input(fullName="snpmask", shortName = "snpmask", doc="SNP mask VCF file", required=false)
|
@Input(fullName="snpmask", shortName = "snpmask", doc="SNP mask VCF file", required=false)
|
||||||
public RodBinding<VariantContext> snpmask;
|
protected RodBinding<VariantContext> snpmask;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This option works only for VCFs with genotypes for exactly one sample; anything else will generate an error.
|
||||||
|
* Non-diploid (or non-called) genotypes are ignored.
|
||||||
|
*/
|
||||||
|
@Argument(fullName="useIUPAC", shortName="useIUPAC", doc = "If specified, heterozygous SNP sites will be output using IUPAC codes", required=false)
|
||||||
|
protected boolean useIUPACcodes = false;
|
||||||
|
private String iupacSample = null;
|
||||||
|
|
||||||
private int deletionBasesRemaining = 0;
|
private int deletionBasesRemaining = 0;
|
||||||
|
|
||||||
public Pair<GenomeLoc, String> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
@Override
|
||||||
|
public void initialize() {
|
||||||
|
super.initialize();
|
||||||
|
if ( useIUPACcodes ) {
|
||||||
|
final List<String> rodName = Arrays.asList(variantCollection.variants.getName());
|
||||||
|
final Set<String> samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName);
|
||||||
|
if ( samples.size() != 1 )
|
||||||
|
throw new UserException.BadInput("the --useIUPAC option works only on VCF files with genotypes for exactly one sample, but the input file has " + samples.size() + " samples");
|
||||||
|
iupacSample = samples.iterator().next();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Pair<GenomeLoc, String> map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
|
||||||
|
|
||||||
if (deletionBasesRemaining > 0) {
|
if (deletionBasesRemaining > 0) {
|
||||||
deletionBasesRemaining--;
|
deletionBasesRemaining--;
|
||||||
return new Pair<GenomeLoc, String>(context.getLocation(), "");
|
return new Pair<>(context.getLocation(), "");
|
||||||
}
|
}
|
||||||
|
|
||||||
String refBase = String.valueOf((char)ref.getBase());
|
final String refBase = String.valueOf((char)ref.getBase());
|
||||||
|
|
||||||
// Check to see if we have a called snp
|
// Check to see if we have a called snp
|
||||||
for ( VariantContext vc : tracker.getValues(variants, ref.getLocus()) ) {
|
for ( final VariantContext vc : tracker.getValues(variantCollection.variants, ref.getLocus()) ) {
|
||||||
if ( vc.isFiltered() )
|
if ( vc.isFiltered() )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if ( vc.isSimpleDeletion()) {
|
if ( vc.isSimpleDeletion()) {
|
||||||
deletionBasesRemaining = vc.getReference().length() - 1;
|
deletionBasesRemaining = vc.getReference().length() - 1;
|
||||||
// delete the next n bases, not this one
|
// delete the next n bases, not this one
|
||||||
return new Pair<GenomeLoc, String>(context.getLocation(), refBase);
|
return new Pair<>(context.getLocation(), refBase);
|
||||||
} else if ( vc.isSimpleInsertion()) {
|
} else if ( vc.isSimpleInsertion()) {
|
||||||
return new Pair<GenomeLoc, String>(context.getLocation(), vc.getAlternateAllele(0).toString());
|
return new Pair<>(context.getLocation(), vc.getAlternateAllele(0).toString());
|
||||||
} else if (vc.isSNP()) {
|
} else if (vc.isSNP()) {
|
||||||
return new Pair<GenomeLoc, String>(context.getLocation(), vc.getAlternateAllele(0).toString());
|
final String base = useIUPACcodes ? getIUPACbase(vc.getGenotype(iupacSample), refBase) : vc.getAlternateAllele(0).toString();
|
||||||
|
return new Pair<>(context.getLocation(), base);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// if we don't have a called site, and we have a mask at this site, mask it
|
// if we don't have a called site, and we have a mask at this site, mask it
|
||||||
for ( VariantContext vc : tracker.getValues(snpmask) ) {
|
for ( final VariantContext vc : tracker.getValues(snpmask) ) {
|
||||||
if ( vc.isSNP()) {
|
if ( vc.isSNP()) {
|
||||||
return new Pair<GenomeLoc, String>(context.getLocation(), "N");
|
return new Pair<>(context.getLocation(), "N");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// if we got here then we're just ref
|
// if we got here then we're just ref
|
||||||
return new Pair<GenomeLoc, String>(context.getLocation(), refBase);
|
return new Pair<>(context.getLocation(), refBase);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the IUPAC encoding for the given genotype or the reference base if not possible
|
||||||
|
*
|
||||||
|
* @param genotype the genotype to encode
|
||||||
|
* @param ref the reference base
|
||||||
|
* @return non-null, non-empty String
|
||||||
|
*/
|
||||||
|
private String getIUPACbase(final Genotype genotype, final String ref) {
|
||||||
|
if ( genotype == null )
|
||||||
|
throw new IllegalStateException("The genotype is null for sample " + iupacSample);
|
||||||
|
|
||||||
|
if ( !genotype.isHet() )
|
||||||
|
return genotype.isHom() ? genotype.getAllele(0).getBaseString() : ref;
|
||||||
|
|
||||||
|
final byte allele1 = genotype.getAllele(0).getBases()[0];
|
||||||
|
final byte allele2 = genotype.getAllele(1).getBases()[0];
|
||||||
|
return new String(new byte[] {BaseUtils.basesToIUPAC(allele1, allele2)});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -284,6 +284,36 @@ public class BaseUtils {
|
||||||
return bases;
|
return bases;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts a pair of bases to their IUPAC ambiguity code
|
||||||
|
*
|
||||||
|
* @param base1 1st base
|
||||||
|
* @param base2 2nd base
|
||||||
|
* @return byte
|
||||||
|
*/
|
||||||
|
static public byte basesToIUPAC(final byte base1, final byte base2) {
|
||||||
|
// ensure that the bases come in order
|
||||||
|
if ( base2 < base1 )
|
||||||
|
return basesToIUPAC(base2, base1);
|
||||||
|
|
||||||
|
// ensure that the bases are regular ones
|
||||||
|
if ( !isRegularBase(base1) || !isRegularBase(base2) )
|
||||||
|
return Base.N.base;
|
||||||
|
|
||||||
|
// IUPAC codes are not needed if the bases are identical
|
||||||
|
if ( basesAreEqual(base1, base2) )
|
||||||
|
return base1;
|
||||||
|
|
||||||
|
if ( base1 == Base.A.base )
|
||||||
|
return (byte)(base2 == Base.C.base ? 'M' : (base2 == Base.G.base ? 'R' : 'W'));
|
||||||
|
|
||||||
|
if ( base1 == Base.C.base )
|
||||||
|
return (byte)(base2 == Base.G.base ? 'S' : 'Y');
|
||||||
|
|
||||||
|
// the only possibility left is G/T
|
||||||
|
return 'K';
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts a simple base to a base index
|
* Converts a simple base to a base index
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -64,6 +64,24 @@ public class BaseUtilsUnitTest extends BaseTest {
|
||||||
Assert.assertEquals(b1[i], b2[i]);
|
Assert.assertEquals(b1[i], b2[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testConvertBasesToIUPAC() {
|
||||||
|
|
||||||
|
for ( final BaseUtils.Base b : BaseUtils.Base.values() ) {
|
||||||
|
if ( BaseUtils.isRegularBase(b.base) )
|
||||||
|
Assert.assertEquals(BaseUtils.basesToIUPAC(b.base, b.base), b.base, "testing same base");
|
||||||
|
}
|
||||||
|
|
||||||
|
Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'A', (byte)'X'), 'N', "testing non-standard base");
|
||||||
|
Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'X', (byte)'A'), 'N', "testing non-standard base");
|
||||||
|
Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'X', (byte)'X'), 'N', "testing non-standard base");
|
||||||
|
|
||||||
|
Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'A', (byte)'T'), 'W', "testing A/T=W");
|
||||||
|
Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'T', (byte)'A'), 'W', "testing T/A=W");
|
||||||
|
Assert.assertEquals(BaseUtils.basesToIUPAC((byte) 'G', (byte) 'T'), 'K', "testing G/T=K");
|
||||||
|
Assert.assertEquals(BaseUtils.basesToIUPAC((byte) 'T', (byte) 'G'), 'K', "testing T/G=K");
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testTransitionTransversion() {
|
public void testTransitionTransversion() {
|
||||||
logger.warn("Executing testTransitionTransversion");
|
logger.warn("Executing testTransitionTransversion");
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue