diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 23a482f6e..f6eac214e 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -83,7 +83,7 @@ public class VariantAnnotator extends RodWalker { // get the list of all sample names from the various VCF input rods TreeSet samples = new TreeSet(); - VCFUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap, String>()); + SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap, String>()); // add the non-VCF sample from the command-line, if applicable if ( sampleName != null ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceWalker.java index a53c9184a..8222c4823 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/concordance/CallsetConcordanceWalker.java @@ -56,7 +56,7 @@ public class CallsetConcordanceWalker extends RodWalker { // get the list of all sample names from the various input rods (they need to be uniquified in case there's overlap) HashSet samples = new HashSet(); - VCFUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames); + SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames); for ( java.util.Map.Entry, String> entry : rodNamesToSampleNames.entrySet() ) { logger.debug("Uniquified sample mapping: " + entry.getKey().first + "/" + entry.getKey().second + " -> " + entry.getValue()); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFCombine.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFCombine.java new file mode 100755 index 000000000..762506e4f --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFCombine.java @@ -0,0 +1,139 @@ +package org.broadinstitute.sting.playground.gatk.walkers.vcftools; + +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.contexts.*; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.vcf.*; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import java.util.*; +import java.io.File; + +/** + * Combines VCF records from different sources; supports both full merges and set unions. + * Merge: combines multiple records into a single one; if sample names overlap then they are uniquified. + * Union: assumes each rod represents the same set of samples (although this is not enforced); using the + * priority list (if provided), emits a single record instance at every position represented in the rods. + */ +@Requires(value={}) +public class VCFCombine extends RodWalker { + // the types of combinations we currently allow + public enum COMBINATION_TYPE { + UNION, MERGE + } + + + @Argument(fullName="vcf_output_file", shortName="O", doc="VCF file to write results", required=false) + protected File OUTPUT_FILE = null; + + @Argument(fullName="combination_type", shortName="type", doc="combination type; currently UNION and MERGE are supported", required=true) + protected COMBINATION_TYPE COMBO_TYPE; + + @Argument(fullName="rod_priority_list", shortName="priority", doc="For the UNION combination type: a comma-separated string describing the priority ordering for the rods as far as which record gets emitted; if not specified, then the first rod seen will be used", required=false) + protected String PRIORITY_STRING = null; + + + private VCFWriter vcfWriter = null; + + private String[] priority = null; + + // a map of rod name to uniquified sample name + private HashMap, String> rodNamesToSampleNames; + + + public void initialize() { + + Set metaData = new HashSet(); + metaData.add(new VCFHeaderLine("source", "VCF")); + + Set hInfo = new HashSet(); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + + if ( OUTPUT_FILE != null ) + vcfWriter = new VCFWriter(OUTPUT_FILE); + else + vcfWriter = new VCFWriter(out); + + Set samples; + switch (COMBO_TYPE ) { + case MERGE: + samples = new TreeSet(); + rodNamesToSampleNames = new HashMap, String>(); + // get the list of all sample names from the various input rods (they need to be uniquified in case there's overlap) + SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames); + break; + case UNION: + samples = SampleUtils.getUniqueSamplesFromRods(getToolkit()); + break; + default: + throw new IllegalArgumentException("Unsupported combination type: " + COMBO_TYPE); + } + + vcfWriter.writeHeader(new VCFHeader(hInfo, samples)); + + if ( PRIORITY_STRING != null ) + priority = PRIORITY_STRING.split(","); + } + + public VCFRecord map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) // RodWalkers can make funky map calls + return null; + + // get all of the vcf rods at this locus + ArrayList vcfRods = new ArrayList(); + Iterator rods = tracker.getAllRods().iterator(); + while (rods.hasNext()) { + ReferenceOrderedDatum rod = rods.next(); + if ( rod instanceof RodVCF ) + vcfRods.add((RodVCF)rod); + } + + if ( vcfRods.size() == 0 ) + return null; + + VCFRecord record = null; + switch (COMBO_TYPE ) { + case MERGE: + record = VCFUtils.mergeRecords(vcfRods, rodNamesToSampleNames); + break; + case UNION: + record = vcfUnion(vcfRods); + break; + default: + break; + } + + return record; + } + + public VCFWriter reduceInit() { + return vcfWriter; + } + + private VCFRecord vcfUnion(ArrayList rods) { + if ( priority == null ) + return rods.get(0).mCurrentRecord; + + for ( String rodname : priority ) { + for ( RodVCF rod : rods ) { + if ( rod.getName().equals(rodname) ) + return rod.mCurrentRecord; + } + } + + return null; + } + + public VCFWriter reduce(VCFRecord record, VCFWriter writer) { + if ( record != null ) + writer.addRecord(record); + return writer; + } + + public void onTraversalDone(VCFWriter writer) { + if ( writer != null ) + writer.close(); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java index ac5091198..4ee3584b7 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java @@ -1,6 +1,6 @@ package org.broadinstitute.sting.playground.gatk.walkers.vcftools; -import org.broadinstitute.sting.gatk.walkers.RefWalker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.RodVCF; @@ -15,7 +15,7 @@ import java.io.File; /** * Extracts subsets of a VCF file like one or more samples, all or only variant loci, all or filtered loci. */ -public class VCFSubsetWalker extends RefWalker, VCFWriter> { +public class VCFSubsetWalker extends RodWalker, VCFWriter> { @Argument(fullName="sample", shortName="SN", doc="Sample to include (or nothing to specify all samples)", required=false) private HashSet SAMPLES; diff --git a/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/java/src/org/broadinstitute/sting/utils/SampleUtils.java index 8f68f209b..2b5ef2974 100755 --- a/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -7,6 +7,11 @@ import java.util.*; import org.broadinstitute.sting.utils.genotype.Genotype; import org.broadinstitute.sting.utils.genotype.SampleBacked; +import org.broadinstitute.sting.utils.genotype.vcf.VCFReader; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; +import org.broadinstitute.sting.gatk.refdata.RodVCF; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; /** @@ -51,4 +56,104 @@ public class SampleUtils { } return samples; } + + /** + * Gets all of the unique sample names from all VCF rods input by the user + * + * @param toolkit GATK engine + * + * @return the set of unique samples + */ + public static Set getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit) { + Set samples = new TreeSet(); + + // iterate to get all of the sample names + List dataSources = toolkit.getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + ReferenceOrderedData rod = source.getReferenceOrderedData(); + if ( rod.getType().equals(RodVCF.class) ) { + VCFReader reader = new VCFReader(rod.getFile()); + samples.addAll(reader.getHeader().getGenotypeSamples()); + reader.close(); + } + } + + return samples; + } + + /** + * Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap + * (e.g. sampleX.1, sampleX.2, ...) + * When finished, samples contains the uniquified sample names and rodNamesToSampleNames contains a mapping + * from rod/sample pairs to the new uniquified names + * + * @param toolkit GATK engine + * @param samples set to store the sample names + * @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names + */ + public static void getUniquifiedSamplesFromRods(GenomeAnalysisEngine toolkit, Set samples, Map, String> rodNamesToSampleNames) { + + // keep a map of sample name to occurrences encountered + HashMap sampleOverlapMap = new HashMap(); + + // iterate to get all of the sample names + List dataSources = toolkit.getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + ReferenceOrderedData rod = source.getReferenceOrderedData(); + if ( rod.getType().equals(RodVCF.class) ) { + VCFReader reader = new VCFReader(rod.getFile()); + Set vcfSamples = reader.getHeader().getGenotypeSamples(); + for ( String sample : vcfSamples ) + addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, rod.getName()); + reader.close(); + } + } + } + + private static void addUniqueSample(Set samples, Map sampleOverlapMap, Map, String> rodNamesToSampleNames, String newSample, String rodName) { + + // how many occurrences have we seen so far? + Integer occurrences = sampleOverlapMap.get(newSample); + + // if this is the first one, just add it to the list of samples + if ( occurrences == null ) { + samples.add(newSample); + rodNamesToSampleNames.put(new Pair(rodName, newSample), newSample); + sampleOverlapMap.put(newSample, 1); + } + + // if it's already been seen multiple times, give it a unique suffix and increment the value + else if ( occurrences >= 2 ) { + String uniqueName = newSample + "." + rodName; + samples.add(uniqueName); + rodNamesToSampleNames.put(new Pair(rodName, newSample), uniqueName); + sampleOverlapMap.put(newSample, occurrences + 1); + } + + // if this is the second occurrence of the sample name, uniquify both of them + else { // occurrences == 2 + + // remove the 1st occurrence, uniquify it, and add it back + samples.remove(newSample); + String uniqueName1 = null; + for ( Map.Entry, String> entry : rodNamesToSampleNames.entrySet() ) { + if ( entry.getValue().equals(newSample) ) { + uniqueName1 = newSample + "." + entry.getKey().first; + entry.setValue(uniqueName1); + break; + } + } + samples.add(uniqueName1); + + // add the second one + String uniqueName2 = newSample + "." + rodName; + samples.add(uniqueName2); + rodNamesToSampleNames.put(new Pair(rodName, newSample), uniqueName2); + + sampleOverlapMap.put(newSample, 2); + } + + } + + } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java index 67a721b26..46d7de6ac 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java @@ -9,7 +9,6 @@ import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.genotype.*; import java.util.*; -import java.util.Map.Entry; /** * A set of static utility methods for common operations on VCF files/records. @@ -47,80 +46,6 @@ public class VCFUtils { return fields; } - /** - * Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap - * (e.g. sampleX.1, sampleX.2, ...) - * When finished, samples contains the uniquified sample names and rodNamesToSampleNames contains a mapping - * from rod/sample pairs to the new uniquified names - * - * @param toolkit GATK engine - * @param samples set to store the sample names - * @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names - */ - public static void getUniquifiedSamplesFromRods(GenomeAnalysisEngine toolkit, Set samples, Map, String> rodNamesToSampleNames) { - - // keep a map of sample name to occurrences encountered - HashMap sampleOverlapMap = new HashMap(); - - // iterate to get all of the sample names - List dataSources = toolkit.getRodDataSources(); - for ( ReferenceOrderedDataSource source : dataSources ) { - ReferenceOrderedData rod = source.getReferenceOrderedData(); - if ( rod.getType().equals(RodVCF.class) ) { - VCFReader reader = new VCFReader(rod.getFile()); - Set vcfSamples = reader.getHeader().getGenotypeSamples(); - for ( String sample : vcfSamples ) - addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, rod.getName()); - reader.close(); - } - } - } - - private static void addUniqueSample(Set samples, Map sampleOverlapMap, Map, String> rodNamesToSampleNames, String newSample, String rodName) { - - // how many occurrences have we seen so far? - Integer occurrences = sampleOverlapMap.get(newSample); - - // if this is the first one, just add it to the list of samples - if ( occurrences == null ) { - samples.add(newSample); - rodNamesToSampleNames.put(new Pair(rodName, newSample), newSample); - sampleOverlapMap.put(newSample, 1); - } - - // if it's already been seen multiple times, give it a unique suffix and increment the value - else if ( occurrences >= 2 ) { - String uniqueName = newSample + "." + rodName; - samples.add(uniqueName); - rodNamesToSampleNames.put(new Pair(rodName, newSample), uniqueName); - sampleOverlapMap.put(newSample, occurrences + 1); - } - - // if this is the second occurrence of the sample name, uniquify both of them - else { // occurrences == 2 - - // remove the 1st occurrence, uniquify it, and add it back - samples.remove(newSample); - String uniqueName1 = null; - for ( Entry, String> entry : rodNamesToSampleNames.entrySet() ) { - if ( entry.getValue().equals(newSample) ) { - uniqueName1 = newSample + "." + entry.getKey().first; - entry.setValue(uniqueName1); - break; - } - } - samples.add(uniqueName1); - - // add the second one - String uniqueName2 = newSample + "." + rodName; - samples.add(uniqueName2); - rodNamesToSampleNames.put(new Pair(rodName, newSample), uniqueName2); - - sampleOverlapMap.put(newSample, 2); - } - - } - /** * Merges various vcf records into a single one using the mapping from rodNamesToSampleNames to get unique sample names *