Moving over to 4.0 and away from VCFRecord

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3778 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-07-13 14:07:10 +00:00
parent d896d03554
commit 7e7da75d27
5 changed files with 208 additions and 262 deletions

View File

@ -1,178 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.vcftools;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
/**
 * Extracts subsets of a VCF file like one or more samples, all or only variant loci, all or filtered loci.
 *
 * <p>Records are gathered per locus in {@code map} and subset, filtered and emitted in {@code reduce}.
 * Output goes to the file given by {@code --vcfsubset} when one is supplied; otherwise each retained
 * record is printed to {@code out}.
 */
public class VCFSubsetWalker extends RodWalker<ArrayList<VCFRecord>, VCFWriter> {
    @Argument(fullName="sample", shortName="SN", doc="Sample to include (or nothing to specify all samples)", required=false)
    private HashSet<String> SAMPLES;

    @Argument(fullName="vcfsubset", shortName="O", doc="File to write VCF subset to", required=false)
    private File VPATH = null;

    @Argument(fullName="includeNonVariants", shortName="INV", doc="Include non-variant loci", required=false)
    private boolean INCLUDE_NON_VARIANTS = false;

    @Argument(fullName="includeFiltered", shortName="IF", doc="Include filtered loci", required=false)
    private boolean INCLUDE_FILTERED = false;

    /** Header describing the subset output; built lazily once the sample set is known. */
    private VCFHeader vheader = null;

    /** File writer; stays null when no output file was requested. */
    private VCFWriter vwriter = null;

    /**
     * Builds the output header from the current {@code SAMPLES} set and, when an output file was
     * requested, opens the file writer and writes the header to it.
     *
     * <p>{@code SAMPLES} must be non-null when this is called.
     */
    public void initializeWriter() {
        Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
        // NOTE(review): the "source" value names a different tool; looks like a copy/paste leftover -- confirm before changing.
        metaData.add(new VCFHeaderLine("source", "VariantsToVCF"));
        metaData.add(new VCFHeaderLine("reference", this.getToolkit().getArguments().referenceFile.getAbsolutePath()));

        Set<String> additionalColumns = new HashSet<String>();
        additionalColumns.add("FORMAT");
        additionalColumns.addAll(SAMPLES);

        vheader = new VCFHeader(metaData, additionalColumns);

        if (VPATH != null) {
            vwriter = new VCFWriter(VPATH);
            vwriter.writeHeader(vheader);
        }
    }

    /**
     * Collects every VCF record found in the tracker at this locus.
     *
     * <p>On the first record seen, the sample set defaults to all samples of that record's header
     * (when none were given on the command line) and the output header is initialized.
     *
     * @param tracker the reference meta-data at this locus; may be null
     * @param ref     the reference context (unused)
     * @param context the alignment context (unused)
     * @return the VCF records present at this locus, possibly empty
     */
    public ArrayList<VCFRecord> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        ArrayList<VCFRecord> records = new ArrayList<VCFRecord>();

        if (tracker != null) {
            for (GATKFeature feature : tracker.getAllRods()) {
                Object rod = feature.getUnderlyingObject();

                if (rod instanceof VCFRecord) {
                    VCFRecord vcfrod = (VCFRecord) rod;

                    if (SAMPLES == null) {
                        SAMPLES = new HashSet<String>();
                        SAMPLES.addAll(vcfrod.getHeader().getGenotypeSamples());
                    }

                    // Build the header even when no output file was requested: reduce() hands it to
                    // toStringEncoding() on the stdout path, where it used to still be null.
                    if (vheader == null) {
                        initializeWriter();
                    }

                    records.add(vcfrod);
                }
            }
        }

        return records;
    }

    /**
     * @return the current file writer, which is null until the first VCF record has been mapped
     */
    public VCFWriter reduceInit() {
        return vwriter;
    }

    /**
     * Creates a copy of the record restricted to the genotypes of the selected samples.
     *
     * <p>The alternate-allele list of the copy contains the alleles carried by the retained genotypes
     * that differ (case-insensitively) from the reference and are not ".".
     *
     * @param record the full record
     * @return a new record holding only the selected samples' genotypes
     */
    private VCFRecord subsetRecord(VCFRecord record) {
        ArrayList<VCFGenotypeRecord> genotypeRecords = new ArrayList<VCFGenotypeRecord>();
        HashSet<VCFGenotypeEncoding> genotypeEncodingSet = new HashSet<VCFGenotypeEncoding>();

        for ( VCFGenotypeRecord gr : record.getVCFGenotypeRecords() ) {
            if (SAMPLES.contains(gr.getSampleName())) {
                genotypeRecords.add(gr);

                for (VCFGenotypeEncoding allele : gr.getAlleles()) {
                    if (!allele.getBases().equalsIgnoreCase(record.getReference())) {
                        genotypeEncodingSet.add(allele);
                    }
                }
            }
        }

        ArrayList<VCFGenotypeEncoding> genotypeEncodings = new ArrayList<VCFGenotypeEncoding>();
        for (VCFGenotypeEncoding allele : genotypeEncodingSet) {
            if (!allele.getBases().equalsIgnoreCase(".")) {
                genotypeEncodings.add(allele);
            }
        }

        VCFRecord subset = new VCFRecord(record.getReference(),
                                         record.getChr(),
                                         record.getStart(),
                                         record.getID(),
                                         genotypeEncodings,
                                         record.getQual(),
                                         record.getFilterString(),
                                         record.getInfoValues(),
                                         record.getGenotypeFormatString(),
                                         genotypeRecords);

        return subset;
    }

    /**
     * Subsets each record and emits it if it passes the variant / filter criteria.
     *
     * @param records the records collected by {@code map} for one locus
     * @param writer  the reduce accumulator; returned unchanged (output uses the {@code vwriter} field)
     * @return {@code writer}
     */
    public VCFWriter reduce(ArrayList<VCFRecord> records, VCFWriter writer) {
        for (VCFRecord record : records) {
            VCFRecord subset = subsetRecord(record);

            boolean isVariant = false;
            // NOTE(review): only the first retained genotype decides whether the locus counts as
            // variant; with several selected samples this may drop loci -- confirm intended behavior.
            if (subset.getVCFGenotypeRecords().size() > 0) {
                for ( VCFGenotypeEncoding ge : subset.getVCFGenotypeRecords().get(0).getAlleles() ) {
                    if (!record.getReference().equals(ge.getBases())) {
                        isVariant = true;
                    }
                }
            }

            if ((isVariant || INCLUDE_NON_VARIANTS) && (!subset.isFiltered() || INCLUDE_FILTERED)) {
                if (vwriter != null) {
                    vwriter.addRecord(subset);
                } else {
                    out.println(subset.toStringEncoding(vheader));
                }
            }
        }

        return writer;
    }

    /**
     * Closes the output file, if one was opened.
     *
     * @param writer the final reduce value (unused)
     */
    public void onTraversalDone(VCFWriter writer) {
        if (vwriter != null) {
            vwriter.close();
        }
    }
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The Broad Institute
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@ -28,25 +28,30 @@ package org.broadinstitute.sting.playground.gatk.walkers.vcftools;
import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlContext;
import org.apache.commons.jexl2.MapContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import org.broad.tribble.vcf.VCFHeaderLine;
import org.broad.tribble.vcf.VCFFilterHeaderLine;
import org.broad.tribble.vcf.VCFHeader;
import java.util.*;
/**
* Selects variant calls for output from a user-supplied VCF file using a number of user-selectable, parameterizable criteria. [TODO -- update to new walker style]
*/
@Requires(value={},referenceMetaData=@RMD(name="variant",type= VCFRecord.class))
public class VCFSelectWalker extends RodWalker<Integer, Integer> {
@Requires(value={},referenceMetaData=@RMD(name="variant", type= ReferenceOrderedDatum.class))
public class VariantSelect extends RodWalker<Integer, Integer> {
@Argument(fullName="match", shortName="match", doc="Expression used with INFO fields to select VCF records for inclusion in the output VCF(see wiki docs for more info)", required=false)
protected String[] MATCH_STRINGS = new String[]{null};
@ -66,21 +71,6 @@ public class VCFSelectWalker extends RodWalker<Integer, Integer> {
private List<MatchExp> matchExpressions = new ArrayList<MatchExp>();
private void initializeVcfWriter(VCFRecord record) {
// setup the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("source", "VariantSelect"));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
for ( MatchExp exp : matchExpressions ) {
hInfo.add(new VCFFilterHeaderLine(exp.name, exp.expStr));
}
writer = new VCFWriter(out);
writer.writeHeader(new VCFHeader(hInfo, record.getHeader().getGenotypeSamples()));
}
public void initialize() {
for ( int i = 0; i < MATCH_STRINGS.length; i++ ) {
if ( MATCH_STRINGS[i] != null ) {
@ -92,6 +82,22 @@ public class VCFSelectWalker extends RodWalker<Integer, Integer> {
}
}
}
// setup the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("source", "VariantSelect"));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
for ( MatchExp exp : matchExpressions ) {
hInfo.add(new VCFFilterHeaderLine(exp.name, exp.expStr));
}
writer = new VCFWriter(out, true);
Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList("variant"));
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
writer.writeHeader(vcfHeader);
}
public Integer reduceInit() { return 0; }
@ -108,15 +114,15 @@ public class VCFSelectWalker extends RodWalker<Integer, Integer> {
if ( tracker == null )
return 0;
VCFRecord variant = tracker.lookup("variant",VCFRecord.class);
VariantContext vc = tracker.getVariantContext(ref, "variant", null, context.getLocation(), false);
// ignore places where we don't have a variant
if ( variant == null )
if ( vc == null )
return 0;
boolean someoneMatched = false;
for ( MatchExp exp : matchExpressions ) {
Map<String, Object> infoMap = new HashMap<String, Object>(variant.getInfoValues());
infoMap.put("QUAL", String.valueOf(variant.getQual()));
Map<String, Object> infoMap = new HashMap<String, Object>(vc.getAttributes());
infoMap.put("QUAL", String.valueOf(vc.getPhredScaledQual()));
JexlContext jContext = new MapContext(infoMap);
@ -133,29 +139,16 @@ public class VCFSelectWalker extends RodWalker<Integer, Integer> {
}
if ( someoneMatched )
writeVCF(variant);
writer.add(vc, new byte[]{ref.getBase()});
return 1;
}
private void writeVCF(VCFRecord variant) {
if ( writer == null )
initializeVcfWriter(variant);
writer.addRecord(variant);
}
public Integer reduce(Integer value, Integer sum) {
return sum + value;
}
/**
* Tell the user the number of loci processed and close out the new variants file.
*
* @param result the number of loci seen.
*/
public void onTraversalDone(Integer result) {
if ( writer != null )
writer.close();
writer.close();
}
}

View File

@ -0,0 +1,106 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.vcftools;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
import java.util.*;
/**
 * Extracts subsets of a VCF file like one or more samples, all or only variant loci, all or filtered loci.
 *
 * <p>The output header is written once in {@code initialize}; each locus is subset and written
 * directly from {@code map}.
 */
public class VariantSubset extends RodWalker<Integer, Integer> {
    @Argument(fullName="sample", shortName="SN", doc="Sample to include (or nothing to specify all samples)", required=false)
    private ArrayList<String> SAMPLES = null;

    @Argument(fullName="includeNonVariants", shortName="INV", doc="Include non-variant loci", required=false)
    private boolean INCLUDE_NON_VARIANTS = false;

    @Argument(fullName="includeFiltered", shortName="IF", doc="Include filtered loci", required=false)
    private boolean INCLUDE_FILTERED = false;

    private VCFWriter writer;

    /**
     * Opens the VCF writer on {@code out} and writes the header.
     *
     * <p>The header's sample columns come from the "variant" track and, when samples were requested
     * on the command line, are restricted to those samples so that the header matches the genotypes
     * actually written by {@code map}.
     */
    public void initialize() {
        Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
        // NOTE(review): the "source" value names a different tool; looks like a copy/paste leftover -- confirm before changing.
        metaData.add(new VCFHeaderLine("source", "VariantsToVCF"));
        metaData.add(new VCFHeaderLine("reference", this.getToolkit().getArguments().referenceFile.getAbsolutePath()));

        writer = new VCFWriter(out, true);

        // Copy before filtering: the returned set may be shared or unmodifiable, and the copy keeps its iteration order.
        Set<String> samples = new LinkedHashSet<String>(SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList("variant")));
        if ( hasSampleFilter() )
            samples.retainAll(SAMPLES);

        final VCFHeader vcfHeader = new VCFHeader(metaData, samples);
        writer.writeHeader(vcfHeader);
    }

    /**
     * Subsets every variant context at this locus and writes those that pass the variant / filter criteria.
     *
     * @param tracker the reference meta-data at this locus; may be null
     * @param ref     the reference context, used for the reference base passed to the writer
     * @param context the alignment context, used for the current location
     * @return 0 when there is no tracker at this locus, 1 otherwise
     */
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        // Same guard as the sibling VariantSelect walker: nothing to do without meta-data.
        if ( tracker == null )
            return 0;

        Collection<VariantContext> VCs = tracker.getAllVariantContexts(ref, null, context.getLocation(), true, false);
        for (VariantContext vc : VCs) {
            VariantContext subset = subsetRecord(vc);
            // NOTE(review): polymorphism is tested on the full context while the filter status is
            // tested on the subset -- confirm whether subset.isPolymorphic() was intended.
            if ( (vc.isPolymorphic() || INCLUDE_NON_VARIANTS) &&
                 (!subset.isFiltered() || INCLUDE_FILTERED) )
                writer.add(subset, new byte[]{ref.getBase()});
        }

        return 1;
    }

    /**
     * @return the initial reduce value, 0
     */
    public Integer reduceInit() {
        return 0;
    }

    /**
     * @return true when at least one sample was requested on the command line
     */
    private boolean hasSampleFilter() {
        return SAMPLES != null && !SAMPLES.isEmpty();
    }

    /**
     * Restricts a variant context to the genotypes of the requested samples.
     *
     * @param vc the full variant context
     * @return {@code vc} itself when no samples were requested, otherwise a sub-context built from
     *         the genotypes whose sample name is in {@code SAMPLES}
     */
    private VariantContext subsetRecord(VariantContext vc) {
        if ( !hasSampleFilter() )
            return vc;

        ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
        for ( Map.Entry<String, Genotype> genotypePair : vc.getGenotypes().entrySet() ) {
            if ( SAMPLES.contains(genotypePair.getKey()) )
                genotypes.add(genotypePair.getValue());
        }

        return vc.subContextFromGenotypes(genotypes);
    }

    /**
     * Accumulates the per-locus values returned by {@code map}.
     *
     * @return the sum of the two arguments
     */
    public Integer reduce(Integer sum, Integer value) {
        // Previously returned a constant 1 and discarded the running total.
        return sum + value;
    }

    /**
     * Closes the output writer.
     *
     * @param sum the final reduce value (unused)
     */
    public void onTraversalDone(Integer sum) {
        writer.close();
    }
}

View File

@ -1,46 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers.vcftools;
import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;
import java.util.Arrays;
/**
 * Integration tests for the VCFSelect walker: each case runs the walker over vcfexample3.vcf with
 * one or more -match expressions and compares the output against an expected MD5.
 */
public class VCFSelectIntegrationTest extends WalkerTest {

    /** Common command-line prefix: walker name, output placeholder and reference. */
    public static String baseTestString() {
        return "-T VCFSelect -o %s -R " + oneKGLocation + "reference/human_b36_both.fasta";
    }

    /**
     * Appends the given arguments to the base command line and executes a single-output test.
     *
     * @param testName    name reported by the test harness
     * @param walkerArgs  arguments appended verbatim after {@link #baseTestString()}
     * @param expectedMd5 expected MD5 of the single output file
     */
    private void runSelectTest(String testName, String walkerArgs, String expectedMd5) {
        final String commandLine = baseTestString() + walkerArgs;
        executeTest(testName, new WalkerTestSpec(commandLine, 1, Arrays.asList(expectedMd5)));
    }

    @Test
    public void testVCFSelect1() {
        runSelectTest("testVCFSelect1",
                " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'AF == 0.50' -L 1:10001290-10048590 ",
                "b49ba344471444077bc6fe3c17e7bc3f");
    }

    @Test
    public void testVCFSelect2() {
        runSelectTest("testVCFSelect2",
                " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6' -L 1:10001290-10048590 ",
                "517b4ae7058c3125ad6846c33a1a2e57");
    }

    @Test
    public void testVCFSelectOr() {
        runSelectTest("testVCFSelectOr",
                " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6' -match 'AF == 0.50' -L 1:10001290-10048590 ",
                "d77d8f938a61abd60fc813ff1a06bb0c");
    }

    @Test
    public void testVCFSelectAnd() {
        runSelectTest("testVCFSelectAnd",
                " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6 && AF == 0.50' -L 1:10001290-10048590 ",
                "ef05fc766482ffade95f1bbdb777770d");
    }
}

View File

@ -0,0 +1,71 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.vcftools;
import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;
import java.util.Arrays;
/**
 * Integration tests for the VariantSelect walker: each case runs the walker over vcfexample3.vcf
 * with one or more -match expressions and compares the output against an expected MD5.
 */
public class VariantSelectIntegrationTest extends WalkerTest {

    /** Common command-line prefix: walker name, output placeholder and reference. */
    public static String baseTestString() {
        return "-T VariantSelect -o %s -R " + oneKGLocation + "reference/human_b36_both.fasta";
    }

    /**
     * Appends the given arguments to the base command line and executes a single-output test.
     *
     * @param testName    name reported by the test harness
     * @param walkerArgs  arguments appended verbatim after {@link #baseTestString()}
     * @param expectedMd5 expected MD5 of the single output file
     */
    private void runSelectTest(String testName, String walkerArgs, String expectedMd5) {
        final String commandLine = baseTestString() + walkerArgs;
        executeTest(testName, new WalkerTestSpec(commandLine, 1, Arrays.asList(expectedMd5)));
    }

    @Test
    public void testVCFSelect1() {
        runSelectTest("testVCFSelect1",
                " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'AF == 0.50' -L 1:10001290-10048590 ",
                "8b358e0cfa35de022a37360a6f28a839");
    }

    @Test
    public void testVCFSelect2() {
        runSelectTest("testVCFSelect2",
                " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6' -L 1:10001290-10048590 ",
                "8e991b9d6d610c8f89c8557756fc8e34");
    }

    @Test
    public void testVCFSelectOr() {
        runSelectTest("testVCFSelectOr",
                " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6' -match 'AF == 0.50' -L 1:10001290-10048590 ",
                "7bd064c8d8bf5389fcd0b78a7c32b599");
    }

    @Test
    public void testVCFSelectAnd() {
        runSelectTest("testVCFSelectAnd",
                " -B variant,VCF," + validationDataLocation + "vcfexample3.vcf -match 'HomopolymerRun == 6 && AF == 0.50' -L 1:10001290-10048590 ",
                "5af565836fa926feaa130715b93188bc");
    }
}