diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java deleted file mode 100755 index 58d678956..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.vcftools; - -import org.broad.tribble.vcf.*; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.utils.genotype.vcf.*; -import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; - -import java.io.File; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.Set; - -/** - * Extracts subsets of a VCF file like one or more samples, all or only variant loci, all or filtered loci. - */ -public class VCFSubsetWalker extends RodWalker, VCFWriter> { - @Argument(fullName="sample", shortName="SN", doc="Sample to include (or nothing to specify all samples)", required=false) - private HashSet SAMPLES; - - @Argument(fullName="vcfsubset", shortName="O", doc="File to write VCF subset to", required=false) - private File VPATH = null; - - @Argument(fullName="includeNonVariants", shortName="INV", doc="Include non-variant loci", required=false) - private boolean INCLUDE_NON_VARIANTS = false; - - @Argument(fullName="includeFiltered", shortName="IF", doc="Include filtered loci", required=false) - private boolean INCLUDE_FILTERED = false; - - private VCFHeader vheader = null; - private VCFWriter vwriter = null; - - public void initializeWriter() { - Set metaData = new HashSet(); - metaData.add(new VCFHeaderLine("source", "VariantsToVCF")); - metaData.add(new VCFHeaderLine("reference", this.getToolkit().getArguments().referenceFile.getAbsolutePath())); - - Set additionalColumns = new HashSet(); - additionalColumns.add("FORMAT"); - additionalColumns.addAll(SAMPLES); - - vheader = new VCFHeader(metaData, additionalColumns); - if (VPATH != null) { - vwriter = new VCFWriter(VPATH); - vwriter.writeHeader(vheader); - } - } - - public ArrayList map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - ArrayList records = new ArrayList(); - - if (tracker != null) { - for (GATKFeature feature : tracker.getAllRods()) { - Object rod = feature.getUnderlyingObject(); - if (rod instanceof VCFRecord) { - VCFRecord vcfrod = (VCFRecord) rod; - - if (SAMPLES == null) { - SAMPLES = new HashSet(); - SAMPLES.addAll(vcfrod.getHeader().getGenotypeSamples()); - } - - if (VPATH != null && vwriter == null) { - initializeWriter(); - } - - //out.println(record.toStringEncoding(vcfrod.getHeader())); - - records.add(vcfrod); - } - } - } - - return records; - } - - public VCFWriter reduceInit() { - return vwriter; - } - - private VCFRecord subsetRecord(VCFRecord record) { - ArrayList genotypeRecords = new ArrayList(); - HashSet genotypeEncodingSet = new HashSet(); - - for ( VCFGenotypeRecord gr : record.getVCFGenotypeRecords() ) { - if (SAMPLES.contains(gr.getSampleName())) { - genotypeRecords.add(gr); - - for (VCFGenotypeEncoding allele : gr.getAlleles()) { - if (!allele.getBases().equalsIgnoreCase(record.getReference())) { - genotypeEncodingSet.add(allele); - } - } - } - } - - ArrayList genotypeEncodings = new ArrayList(); - for (VCFGenotypeEncoding allele : genotypeEncodingSet) { - if (!allele.getBases().equalsIgnoreCase(".")) { - genotypeEncodings.add(allele); - } - } - - VCFRecord subset = new VCFRecord(record.getReference(), - record.getChr(), - record.getStart(), - record.getID(), - genotypeEncodings, - record.getQual(), - record.getFilterString(), - record.getInfoValues(), - record.getGenotypeFormatString(), - genotypeRecords); - - return subset; - } - - public VCFWriter reduce(ArrayList records, VCFWriter writer) { - for (VCFRecord record : records) { - VCFRecord subset = subsetRecord(record); - - boolean isVariant = false; - - if (subset.getVCFGenotypeRecords().size() > 0) { - for ( VCFGenotypeEncoding ge : subset.getVCFGenotypeRecords().get(0).getAlleles() ) { - if (!record.getReference().equals(ge.getBases())) { - isVariant = true; - } - } - } - - if ((isVariant || INCLUDE_NON_VARIANTS) && (!subset.isFiltered() || INCLUDE_FILTERED)) { - if (vwriter != null) { - vwriter.addRecord(subset); - } else { - out.println(subset.toStringEncoding(vheader)); - } - } - } - - return writer; - } - - public void onTraversalDone(VCFWriter writer) { - if (vwriter != null) { - vwriter.close(); - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSelectWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VariantSelect.java similarity index 82% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSelectWalker.java rename to java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VariantSelect.java index 0215fac66..6f6ddab98 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSelectWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VariantSelect.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The Broad Institute + * Copyright (c) 2010. * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -28,25 +28,30 @@ package org.broadinstitute.sting.playground.gatk.walkers.vcftools; import org.apache.commons.jexl2.Expression; import org.apache.commons.jexl2.JexlContext; import org.apache.commons.jexl2.MapContext; -import org.broad.tribble.vcf.*; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.walkers.RMD; import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.utils.genotype.vcf.*; +import org.broad.tribble.vcf.VCFHeaderLine; +import org.broad.tribble.vcf.VCFFilterHeaderLine; +import org.broad.tribble.vcf.VCFHeader; import java.util.*; /** * Selects variant calls for output from a user-supplied VCF file using a number of user-selectable, parameterizable criteria. [TODO -- update to new walker style] */ -@Requires(value={},referenceMetaData=@RMD(name="variant",type= VCFRecord.class)) -public class VCFSelectWalker extends RodWalker { +@Requires(value={},referenceMetaData=@RMD(name="variant", type= ReferenceOrderedDatum.class)) +public class VariantSelect extends RodWalker { @Argument(fullName="match", shortName="match", doc="Expression used with INFO fields to select VCF records for inclusion in the output VCF(see wiki docs for more info)", required=false) protected String[] MATCH_STRINGS = new String[]{null}; @@ -66,21 +71,6 @@ public class VCFSelectWalker extends RodWalker { private List matchExpressions = new ArrayList(); - private void initializeVcfWriter(VCFRecord record) { - // setup the header fields - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("source", "VariantSelect")); - hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - - for ( MatchExp exp : matchExpressions ) { - hInfo.add(new VCFFilterHeaderLine(exp.name, exp.expStr)); - } - - writer = new VCFWriter(out); - writer.writeHeader(new VCFHeader(hInfo, record.getHeader().getGenotypeSamples())); - } - public void initialize() { for ( int i = 0; i < MATCH_STRINGS.length; i++ ) { if ( MATCH_STRINGS[i] != null ) { @@ -92,6 +82,22 @@ public class VCFSelectWalker extends RodWalker { } } } + + // setup the header fields + Set hInfo = new HashSet(); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + hInfo.add(new VCFHeaderLine("source", "VariantSelect")); + hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); + + for ( MatchExp exp : matchExpressions ) { + hInfo.add(new VCFFilterHeaderLine(exp.name, exp.expStr)); + } + + writer = new VCFWriter(out, true); + Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList("variant")); + + final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); + writer.writeHeader(vcfHeader); } public Integer reduceInit() { return 0; } @@ -108,15 +114,15 @@ public class VCFSelectWalker extends RodWalker { if ( tracker == null ) return 0; - VCFRecord variant = tracker.lookup("variant",VCFRecord.class); + VariantContext vc = tracker.getVariantContext(ref, "variant", null, context.getLocation(), false); // ignore places where we don't have a variant - if ( variant == null ) + if ( vc == null ) return 0; boolean someoneMatched = false; for ( MatchExp exp : matchExpressions ) { - Map infoMap = new HashMap(variant.getInfoValues()); - infoMap.put("QUAL", String.valueOf(variant.getQual())); + Map infoMap = new HashMap(vc.getAttributes()); + infoMap.put("QUAL", String.valueOf(vc.getPhredScaledQual())); JexlContext jContext = new MapContext(infoMap); @@ -133,29 +139,16 @@ public class VCFSelectWalker extends RodWalker { } if ( someoneMatched ) - writeVCF(variant); + writer.add(vc, new byte[]{ref.getBase()}); return 1; } - private void writeVCF(VCFRecord variant) { - if ( writer == null ) - initializeVcfWriter(variant); - - writer.addRecord(variant); - } - public Integer reduce(Integer value, Integer sum) { return sum + value; } - /** - * Tell the user the number of loci processed and close out the new variants file. - * - * @param result the number of loci seen. - */ public void onTraversalDone(Integer result) { - if ( writer != null ) - writer.close(); + writer.close(); } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VariantSubset.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VariantSubset.java new file mode 100755 index 000000000..e1f1f9d80 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VariantSubset.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.vcftools; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broad.tribble.vcf.VCFHeader; +import org.broad.tribble.vcf.VCFHeaderLine; + +import java.util.*; + +/** + * Extracts subsets of a VCF file like one or more samples, all or only variant loci, all or filtered loci. + */ +public class VariantSubset extends RodWalker { + @Argument(fullName="sample", shortName="SN", doc="Sample to include (or nothing to specify all samples)", required=false) + private ArrayList SAMPLES = null; + + @Argument(fullName="includeNonVariants", shortName="INV", doc="Include non-variant loci", required=false) + private boolean INCLUDE_NON_VARIANTS = false; + + @Argument(fullName="includeFiltered", shortName="IF", doc="Include filtered loci", required=false) + private boolean INCLUDE_FILTERED = false; + + private VCFWriter writer; + + public void initialize() { + Set metaData = new HashSet(); + metaData.add(new VCFHeaderLine("source", "VariantsToVCF")); + metaData.add(new VCFHeaderLine("reference", this.getToolkit().getArguments().referenceFile.getAbsolutePath())); + + writer = new VCFWriter(out, true); + Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList("variant")); + + final VCFHeader vcfHeader = new VCFHeader(metaData, samples); + writer.writeHeader(vcfHeader); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + Collection VCs = tracker.getAllVariantContexts(ref, null, context.getLocation(), true, false); + for (VariantContext vc : VCs) { + VariantContext subset = subsetRecord(vc); + + if ( (vc.isPolymorphic() || INCLUDE_NON_VARIANTS) && + (!subset.isFiltered() || INCLUDE_FILTERED) ) + writer.add(subset, new byte[]{ref.getBase()}); + } + + return 1; + } + + public Integer reduceInit() { + return 0; + } + + private VariantContext subsetRecord(VariantContext vc) { + if ( SAMPLES == null || SAMPLES.isEmpty() ) + return vc; + + ArrayList genotypes = new ArrayList(); + for ( Map.Entry genotypePair : vc.getGenotypes().entrySet() ) { + if ( SAMPLES.contains(genotypePair.getKey()) ) + genotypes.add(genotypePair.getValue()); + } + + return vc.subContextFromGenotypes(genotypes); + } + + public Integer reduce(Integer sum, Integer value) { + return 1; + } + + public void onTraversalDone(Integer sum) { + writer.close(); + } +} diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSelectIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSelectIntegrationTest.java deleted file mode 100755 index c26b61e15..000000000 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSelectIntegrationTest.java +++ /dev/null @@ -1,46 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.vcftools; - -import org.broadinstitute.sting.WalkerTest; -import org.junit.Test; - -import java.util.Arrays; - -public class VCFSelectIntegrationTest extends WalkerTest { - - public static String baseTestString() { - return "-T VCFSelect -o %s -R " + oneKGLocation + "reference/human_b36_both.fasta"; - } - - - @Test - public void testVCFSelect1() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'AF == 0.50' -L 1:10001290-10048590 ", 1, - Arrays.asList("b49ba344471444077bc6fe3c17e7bc3f")); - executeTest("testVCFSelect1", spec); - } - - @Test - public void testVCFSelect2() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6' -L 1:10001290-10048590 ", 1, - Arrays.asList("517b4ae7058c3125ad6846c33a1a2e57")); - executeTest("testVCFSelect2", spec); - } - - @Test - public void testVCFSelectOr() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6' -match 'AF == 0.50' -L 1:10001290-10048590 ", 1, - Arrays.asList("d77d8f938a61abd60fc813ff1a06bb0c")); - executeTest("testVCFSelectOr", spec); - } - - @Test - public void testVCFSelectAnd() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6 && AF == 0.50' -L 1:10001290-10048590 ", 1, - Arrays.asList("ef05fc766482ffade95f1bbdb777770d")); - executeTest("testVCFSelectAnd", spec); - } -} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VariantSelectIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VariantSelectIntegrationTest.java new file mode 100755 index 000000000..b6757da0b --- /dev/null +++ b/java/test/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VariantSelectIntegrationTest.java @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.vcftools; + +import org.broadinstitute.sting.WalkerTest; +import org.junit.Test; + +import java.util.Arrays; + +public class VariantSelectIntegrationTest extends WalkerTest { + + public static String baseTestString() { + return "-T VariantSelect -o %s -R " + oneKGLocation + "reference/human_b36_both.fasta"; + } + + + @Test + public void testVCFSelect1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'AF == 0.50' -L 1:10001290-10048590 ", 1, + Arrays.asList("8b358e0cfa35de022a37360a6f28a839")); + executeTest("testVCFSelect1", spec); + } + + @Test + public void testVCFSelect2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6' -L 1:10001290-10048590 ", 1, + Arrays.asList("8e991b9d6d610c8f89c8557756fc8e34")); + executeTest("testVCFSelect2", spec); + } + + @Test + public void testVCFSelectOr() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6' -match 'AF == 0.50' -L 1:10001290-10048590 ", 1, + Arrays.asList("7bd064c8d8bf5389fcd0b78a7c32b599")); + executeTest("testVCFSelectOr", spec); + } + + @Test + public void testVCFSelectAnd() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6 && AF == 0.50' -L 1:10001290-10048590 ", 1, + Arrays.asList("5af565836fa926feaa130715b93188bc")); + executeTest("testVCFSelectAnd", spec); + } +} \ No newline at end of file