From 971834ca90ab8e65bba1f51309df1db2ae8f2ad3 Mon Sep 17 00:00:00 2001 From: ebanks Date: Sun, 10 Jan 2010 06:45:11 +0000 Subject: [PATCH] Added a walker to the vcf tools compilation: one that combines vcf records. Both merges and unions are supported (see documentation... when it gets written this week). Also, moved some code that pulls samples out of rods from VCFUtils into SampleUtils. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2552 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/annotator/VariantAnnotator.java | 2 +- .../concordance/CallsetConcordanceWalker.java | 2 +- .../gatk/walkers/vcftools/VCFCombine.java | 139 ++++++++++++++++++ .../walkers/vcftools/VCFSubsetWalker.java | 4 +- .../sting/utils/SampleUtils.java | 105 +++++++++++++ .../sting/utils/genotype/vcf/VCFUtils.java | 75 ---------- 6 files changed, 248 insertions(+), 79 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFCombine.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 23a482f6e..f6eac214e 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -83,7 +83,7 @@ public class VariantAnnotator extends RodWalker { // get the list of all sample names from the various VCF input rods TreeSet samples = new TreeSet(); - VCFUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap, String>()); + SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap, String>()); // add the non-VCF sample from the command-line, if applicable if ( sampleName != null ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceWalker.java index a53c9184a..8222c4823 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceWalker.java @@ -56,7 +56,7 @@ public class CallsetConcordanceWalker extends RodWalker { // get the list of all sample names from the various input rods (they need to be uniquified in case there's overlap) HashSet samples = new HashSet(); - VCFUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames); + SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames); for ( java.util.Map.Entry, String> entry : rodNamesToSampleNames.entrySet() ) { logger.debug("Uniquified sample mapping: " + entry.getKey().first + "/" + entry.getKey().second + " -> " + entry.getValue()); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFCombine.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFCombine.java new file mode 100755 index 000000000..762506e4f --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFCombine.java @@ -0,0 +1,139 @@ +package org.broadinstitute.sting.playground.gatk.walkers.vcftools; + +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.contexts.*; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.vcf.*; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import java.util.*; +import java.io.File; + +/** + * Combines VCF records from different sources; supports both full merges and set unions. + * Merge: combines multiple records into a single one; if sample names overlap then they are uniquified. + * Union: assumes each rod represents the same set of samples (although this is not enforced); using the + * priority list (if provided), emits a single record instance at every position represented in the rods. + */ +@Requires(value={}) +public class VCFCombine extends RodWalker { + // the types of combinations we currently allow + public enum COMBINATION_TYPE { + UNION, MERGE + } + + + @Argument(fullName="vcf_output_file", shortName="O", doc="VCF file to write results", required=false) + protected File OUTPUT_FILE = null; + + @Argument(fullName="combination_type", shortName="type", doc="combination type; currently UNION and MERGE are supported", required=true) + protected COMBINATION_TYPE COMBO_TYPE; + + @Argument(fullName="rod_priority_list", shortName="priority", doc="For the UNION combination type: a comma-separated string describing the priority ordering for the rods as far as which record gets emitted; if not specified, then the first rod seen will be used", required=false) + protected String PRIORITY_STRING = null; + + + private VCFWriter vcfWriter = null; + + private String[] priority = null; + + // a map of rod name to uniquified sample name + private HashMap, String> rodNamesToSampleNames; + + + public void initialize() { + + Set metaData = new HashSet(); + metaData.add(new VCFHeaderLine("source", "VCF")); + + Set hInfo = new HashSet(); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + + if ( OUTPUT_FILE != null ) + vcfWriter = new VCFWriter(OUTPUT_FILE); + else + vcfWriter = new VCFWriter(out); + + Set samples; + switch (COMBO_TYPE ) { + case MERGE: + samples = new TreeSet(); + rodNamesToSampleNames = new HashMap, String>(); + // get the list of all sample names from the various input rods (they need to be uniquified in case there's overlap) + SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames); + break; + case UNION: + samples = SampleUtils.getUniqueSamplesFromRods(getToolkit()); + break; + default: + throw new IllegalArgumentException("Unsupported combination type: " + COMBO_TYPE); + } + + vcfWriter.writeHeader(new VCFHeader(hInfo, samples)); + + if ( PRIORITY_STRING != null ) + priority = PRIORITY_STRING.split(","); + } + + public VCFRecord map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) // RodWalkers can make funky map calls + return null; + + // get all of the vcf rods at this locus + ArrayList vcfRods = new ArrayList(); + Iterator rods = tracker.getAllRods().iterator(); + while (rods.hasNext()) { + ReferenceOrderedDatum rod = rods.next(); + if ( rod instanceof RodVCF ) + vcfRods.add((RodVCF)rod); + } + + if ( vcfRods.size() == 0 ) + return null; + + VCFRecord record = null; + switch (COMBO_TYPE ) { + case MERGE: + record = VCFUtils.mergeRecords(vcfRods, rodNamesToSampleNames); + break; + case UNION: + record = vcfUnion(vcfRods); + break; + default: + break; + } + + return record; + } + + public VCFWriter reduceInit() { + return vcfWriter; + } + + private VCFRecord vcfUnion(ArrayList rods) { + if ( priority == null ) + return rods.get(0).mCurrentRecord; + + for ( String rodname : priority ) { + for ( RodVCF rod : rods ) { + if ( rod.getName().equals(rodname) ) + return rod.mCurrentRecord; + } + } + + return null; + } + + public VCFWriter reduce(VCFRecord record, VCFWriter writer) { + if ( record != null ) + writer.addRecord(record); + return writer; + } + + public void onTraversalDone(VCFWriter writer) { + if ( writer != null ) + writer.close(); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java index ac5091198..4ee3584b7 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java @@ -1,6 +1,6 @@ package org.broadinstitute.sting.playground.gatk.walkers.vcftools; -import org.broadinstitute.sting.gatk.walkers.RefWalker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.RodVCF; @@ -15,7 +15,7 @@ import java.io.File; /** * Extracts subsets of a VCF file like one or more samples, all or only variant loci, all or filtered loci. */ -public class VCFSubsetWalker extends RefWalker, VCFWriter> { +public class VCFSubsetWalker extends RodWalker, VCFWriter> { @Argument(fullName="sample", shortName="SN", doc="Sample to include (or nothing to specify all samples)", required=false) private HashSet SAMPLES; diff --git a/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/java/src/org/broadinstitute/sting/utils/SampleUtils.java index 8f68f209b..2b5ef2974 100755 --- a/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -7,6 +7,11 @@ import java.util.*; import org.broadinstitute.sting.utils.genotype.Genotype; import org.broadinstitute.sting.utils.genotype.SampleBacked; +import org.broadinstitute.sting.utils.genotype.vcf.VCFReader; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; +import org.broadinstitute.sting.gatk.refdata.RodVCF; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; /** @@ -51,4 +56,104 @@ public class SampleUtils { } return samples; } + + /** + * Gets all of the unique sample names from all VCF rods input by the user + * + * @param toolkit GATK engine + * + * @return the set of unique samples + */ + public static Set getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit) { + Set samples = new TreeSet(); + + // iterate to get all of the sample names + List dataSources = toolkit.getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + ReferenceOrderedData rod = source.getReferenceOrderedData(); + if ( rod.getType().equals(RodVCF.class) ) { + VCFReader reader = new VCFReader(rod.getFile()); + samples.addAll(reader.getHeader().getGenotypeSamples()); + reader.close(); + } + } + + return samples; + } + + /** + * Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap + * (e.g. sampleX.1, sampleX.2, ...) + * When finished, samples contains the uniquified sample names and rodNamesToSampleNames contains a mapping + * from rod/sample pairs to the new uniquified names + * + * @param toolkit GATK engine + * @param samples set to store the sample names + * @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names + */ + public static void getUniquifiedSamplesFromRods(GenomeAnalysisEngine toolkit, Set samples, Map, String> rodNamesToSampleNames) { + + // keep a map of sample name to occurrences encountered + HashMap sampleOverlapMap = new HashMap(); + + // iterate to get all of the sample names + List dataSources = toolkit.getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + ReferenceOrderedData rod = source.getReferenceOrderedData(); + if ( rod.getType().equals(RodVCF.class) ) { + VCFReader reader = new VCFReader(rod.getFile()); + Set vcfSamples = reader.getHeader().getGenotypeSamples(); + for ( String sample : vcfSamples ) + addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, rod.getName()); + reader.close(); + } + } + } + + private static void addUniqueSample(Set samples, Map sampleOverlapMap, Map, String> rodNamesToSampleNames, String newSample, String rodName) { + + // how many occurrences have we seen so far? + Integer occurrences = sampleOverlapMap.get(newSample); + + // if this is the first one, just add it to the list of samples + if ( occurrences == null ) { + samples.add(newSample); + rodNamesToSampleNames.put(new Pair(rodName, newSample), newSample); + sampleOverlapMap.put(newSample, 1); + } + + // if it's already been seen multiple times, give it a unique suffix and increment the value + else if ( occurrences >= 2 ) { + String uniqueName = newSample + "." + rodName; + samples.add(uniqueName); + rodNamesToSampleNames.put(new Pair(rodName, newSample), uniqueName); + sampleOverlapMap.put(newSample, occurrences + 1); + } + + // if this is the second occurrence of the sample name, uniquify both of them + else { // occurrences == 2 + + // remove the 1st occurrence, uniquify it, and add it back + samples.remove(newSample); + String uniqueName1 = null; + for ( Map.Entry, String> entry : rodNamesToSampleNames.entrySet() ) { + if ( entry.getValue().equals(newSample) ) { + uniqueName1 = newSample + "." + entry.getKey().first; + entry.setValue(uniqueName1); + break; + } + } + samples.add(uniqueName1); + + // add the second one + String uniqueName2 = newSample + "." + rodName; + samples.add(uniqueName2); + rodNamesToSampleNames.put(new Pair(rodName, newSample), uniqueName2); + + sampleOverlapMap.put(newSample, 2); + } + + } + + } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java index 67a721b26..46d7de6ac 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java @@ -9,7 +9,6 @@ import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.genotype.*; import java.util.*; -import java.util.Map.Entry; /** * A set of static utility methods for common operations on VCF files/records. @@ -47,80 +46,6 @@ public class VCFUtils { return fields; } - /** - * Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap - * (e.g. sampleX.1, sampleX.2, ...) - * When finished, samples contains the uniquified sample names and rodNamesToSampleNames contains a mapping - * from rod/sample pairs to the new uniquified names - * - * @param toolkit GATK engine - * @param samples set to store the sample names - * @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names - */ - public static void getUniquifiedSamplesFromRods(GenomeAnalysisEngine toolkit, Set samples, Map, String> rodNamesToSampleNames) { - - // keep a map of sample name to occurrences encountered - HashMap sampleOverlapMap = new HashMap(); - - // iterate to get all of the sample names - List dataSources = toolkit.getRodDataSources(); - for ( ReferenceOrderedDataSource source : dataSources ) { - ReferenceOrderedData rod = source.getReferenceOrderedData(); - if ( rod.getType().equals(RodVCF.class) ) { - VCFReader reader = new VCFReader(rod.getFile()); - Set vcfSamples = reader.getHeader().getGenotypeSamples(); - for ( String sample : vcfSamples ) - addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, rod.getName()); - reader.close(); - } - } - } - - private static void addUniqueSample(Set samples, Map sampleOverlapMap, Map, String> rodNamesToSampleNames, String newSample, String rodName) { - - // how many occurrences have we seen so far? - Integer occurrences = sampleOverlapMap.get(newSample); - - // if this is the first one, just add it to the list of samples - if ( occurrences == null ) { - samples.add(newSample); - rodNamesToSampleNames.put(new Pair(rodName, newSample), newSample); - sampleOverlapMap.put(newSample, 1); - } - - // if it's already been seen multiple times, give it a unique suffix and increment the value - else if ( occurrences >= 2 ) { - String uniqueName = newSample + "." + rodName; - samples.add(uniqueName); - rodNamesToSampleNames.put(new Pair(rodName, newSample), uniqueName); - sampleOverlapMap.put(newSample, occurrences + 1); - } - - // if this is the second occurrence of the sample name, uniquify both of them - else { // occurrences == 2 - - // remove the 1st occurrence, uniquify it, and add it back - samples.remove(newSample); - String uniqueName1 = null; - for ( Entry, String> entry : rodNamesToSampleNames.entrySet() ) { - if ( entry.getValue().equals(newSample) ) { - uniqueName1 = newSample + "." + entry.getKey().first; - entry.setValue(uniqueName1); - break; - } - } - samples.add(uniqueName1); - - // add the second one - String uniqueName2 = newSample + "." + rodName; - samples.add(uniqueName2); - rodNamesToSampleNames.put(new Pair(rodName, newSample), uniqueName2); - - sampleOverlapMap.put(newSample, 2); - } - - } - /** * Merges various vcf records into a single one using the mapping from rodNamesToSampleNames to get unique sample names *