Added a walker to the vcf tools compilation: one that combines vcf records. Both merges and unions are supported (see documentation... when it gets written this week).
Also, moved some code that pulls samples out of rods from VCFUtils into SampleUtils. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2552 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
80af0f2f54
commit
971834ca90
|
|
@ -83,7 +83,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> {
|
|||
|
||||
// get the list of all sample names from the various VCF input rods
|
||||
TreeSet<String> samples = new TreeSet<String>();
|
||||
VCFUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap<Pair<String, String>, String>());
|
||||
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap<Pair<String, String>, String>());
|
||||
|
||||
// add the non-VCF sample from the command-line, if applicable
|
||||
if ( sampleName != null ) {
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ public class CallsetConcordanceWalker extends RodWalker<Integer, Integer> {
|
|||
|
||||
// get the list of all sample names from the various input rods (they need to be uniquified in case there's overlap)
|
||||
HashSet<String> samples = new HashSet<String>();
|
||||
VCFUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames);
|
||||
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames);
|
||||
|
||||
for ( java.util.Map.Entry<Pair<String, String>, String> entry : rodNamesToSampleNames.entrySet() ) {
|
||||
logger.debug("Uniquified sample mapping: " + entry.getKey().first + "/" + entry.getKey().second + " -> " + entry.getValue());
|
||||
|
|
|
|||
|
|
@ -0,0 +1,139 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.vcftools;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.*;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Combines VCF records from different sources; supports both full merges and set unions.
|
||||
* Merge: combines multiple records into a single one; if sample names overlap then they are uniquified.
|
||||
* Union: assumes each rod represents the same set of samples (although this is not enforced); using the
|
||||
* priority list (if provided), emits a single record instance at every position represented in the rods.
|
||||
*/
|
||||
@Requires(value={})
|
||||
public class VCFCombine extends RodWalker<VCFRecord, VCFWriter> {
|
||||
// the types of combinations we currently allow
|
||||
public enum COMBINATION_TYPE {
|
||||
UNION, MERGE
|
||||
}
|
||||
|
||||
|
||||
@Argument(fullName="vcf_output_file", shortName="O", doc="VCF file to write results", required=false)
|
||||
protected File OUTPUT_FILE = null;
|
||||
|
||||
@Argument(fullName="combination_type", shortName="type", doc="combination type; currently UNION and MERGE are supported", required=true)
|
||||
protected COMBINATION_TYPE COMBO_TYPE;
|
||||
|
||||
@Argument(fullName="rod_priority_list", shortName="priority", doc="For the UNION combination type: a comma-separated string describing the priority ordering for the rods as far as which record gets emitted; if not specified, then the first rod seen will be used", required=false)
|
||||
protected String PRIORITY_STRING = null;
|
||||
|
||||
|
||||
private VCFWriter vcfWriter = null;
|
||||
|
||||
private String[] priority = null;
|
||||
|
||||
// a map of rod name to uniquified sample name
|
||||
private HashMap<Pair<String, String>, String> rodNamesToSampleNames;
|
||||
|
||||
|
||||
public void initialize() {
|
||||
|
||||
Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
|
||||
metaData.add(new VCFHeaderLine("source", "VCF"));
|
||||
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
|
||||
if ( OUTPUT_FILE != null )
|
||||
vcfWriter = new VCFWriter(OUTPUT_FILE);
|
||||
else
|
||||
vcfWriter = new VCFWriter(out);
|
||||
|
||||
Set<String> samples;
|
||||
switch (COMBO_TYPE ) {
|
||||
case MERGE:
|
||||
samples = new TreeSet<String>();
|
||||
rodNamesToSampleNames = new HashMap<Pair<String, String>, String>();
|
||||
// get the list of all sample names from the various input rods (they need to be uniquified in case there's overlap)
|
||||
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, rodNamesToSampleNames);
|
||||
break;
|
||||
case UNION:
|
||||
samples = SampleUtils.getUniqueSamplesFromRods(getToolkit());
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("Unsupported combination type: " + COMBO_TYPE);
|
||||
}
|
||||
|
||||
vcfWriter.writeHeader(new VCFHeader(hInfo, samples));
|
||||
|
||||
if ( PRIORITY_STRING != null )
|
||||
priority = PRIORITY_STRING.split(",");
|
||||
}
|
||||
|
||||
public VCFRecord map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null ) // RodWalkers can make funky map calls
|
||||
return null;
|
||||
|
||||
// get all of the vcf rods at this locus
|
||||
ArrayList<RodVCF> vcfRods = new ArrayList<RodVCF>();
|
||||
Iterator<ReferenceOrderedDatum> rods = tracker.getAllRods().iterator();
|
||||
while (rods.hasNext()) {
|
||||
ReferenceOrderedDatum rod = rods.next();
|
||||
if ( rod instanceof RodVCF )
|
||||
vcfRods.add((RodVCF)rod);
|
||||
}
|
||||
|
||||
if ( vcfRods.size() == 0 )
|
||||
return null;
|
||||
|
||||
VCFRecord record = null;
|
||||
switch (COMBO_TYPE ) {
|
||||
case MERGE:
|
||||
record = VCFUtils.mergeRecords(vcfRods, rodNamesToSampleNames);
|
||||
break;
|
||||
case UNION:
|
||||
record = vcfUnion(vcfRods);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return record;
|
||||
}
|
||||
|
||||
public VCFWriter reduceInit() {
|
||||
return vcfWriter;
|
||||
}
|
||||
|
||||
private VCFRecord vcfUnion(ArrayList<RodVCF> rods) {
|
||||
if ( priority == null )
|
||||
return rods.get(0).mCurrentRecord;
|
||||
|
||||
for ( String rodname : priority ) {
|
||||
for ( RodVCF rod : rods ) {
|
||||
if ( rod.getName().equals(rodname) )
|
||||
return rod.mCurrentRecord;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public VCFWriter reduce(VCFRecord record, VCFWriter writer) {
|
||||
if ( record != null )
|
||||
writer.addRecord(record);
|
||||
return writer;
|
||||
}
|
||||
|
||||
public void onTraversalDone(VCFWriter writer) {
|
||||
if ( writer != null )
|
||||
writer.close();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.vcftools;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.RefWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RodVCF;
|
||||
|
|
@ -15,7 +15,7 @@ import java.io.File;
|
|||
/**
|
||||
* Extracts subsets of a VCF file like one or more samples, all or only variant loci, all or filtered loci.
|
||||
*/
|
||||
public class VCFSubsetWalker extends RefWalker<ArrayList<VCFRecord>, VCFWriter> {
|
||||
public class VCFSubsetWalker extends RodWalker<ArrayList<VCFRecord>, VCFWriter> {
|
||||
@Argument(fullName="sample", shortName="SN", doc="Sample to include (or nothing to specify all samples)", required=false)
|
||||
private HashSet<String> SAMPLES;
|
||||
|
||||
|
|
|
|||
|
|
@ -7,6 +7,11 @@ import java.util.*;
|
|||
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.utils.genotype.SampleBacked;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFReader;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.RodVCF;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -51,4 +56,104 @@ public class SampleUtils {
|
|||
}
|
||||
return samples;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets all of the unique sample names from all VCF rods input by the user
|
||||
*
|
||||
* @param toolkit GATK engine
|
||||
*
|
||||
* @return the set of unique samples
|
||||
*/
|
||||
public static Set<String> getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit) {
|
||||
Set<String> samples = new TreeSet<String>();
|
||||
|
||||
// iterate to get all of the sample names
|
||||
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
ReferenceOrderedData rod = source.getReferenceOrderedData();
|
||||
if ( rod.getType().equals(RodVCF.class) ) {
|
||||
VCFReader reader = new VCFReader(rod.getFile());
|
||||
samples.addAll(reader.getHeader().getGenotypeSamples());
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
|
||||
return samples;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap
|
||||
* (e.g. sampleX.1, sampleX.2, ...)
|
||||
* When finished, samples contains the uniquified sample names and rodNamesToSampleNames contains a mapping
|
||||
* from rod/sample pairs to the new uniquified names
|
||||
*
|
||||
* @param toolkit GATK engine
|
||||
* @param samples set to store the sample names
|
||||
* @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names
|
||||
*/
|
||||
public static void getUniquifiedSamplesFromRods(GenomeAnalysisEngine toolkit, Set<String> samples, Map<Pair<String, String>, String> rodNamesToSampleNames) {
|
||||
|
||||
// keep a map of sample name to occurrences encountered
|
||||
HashMap<String, Integer> sampleOverlapMap = new HashMap<String, Integer>();
|
||||
|
||||
// iterate to get all of the sample names
|
||||
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
ReferenceOrderedData rod = source.getReferenceOrderedData();
|
||||
if ( rod.getType().equals(RodVCF.class) ) {
|
||||
VCFReader reader = new VCFReader(rod.getFile());
|
||||
Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
|
||||
for ( String sample : vcfSamples )
|
||||
addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, rod.getName());
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void addUniqueSample(Set<String> samples, Map<String, Integer> sampleOverlapMap, Map<Pair<String, String>, String> rodNamesToSampleNames, String newSample, String rodName) {
|
||||
|
||||
// how many occurrences have we seen so far?
|
||||
Integer occurrences = sampleOverlapMap.get(newSample);
|
||||
|
||||
// if this is the first one, just add it to the list of samples
|
||||
if ( occurrences == null ) {
|
||||
samples.add(newSample);
|
||||
rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), newSample);
|
||||
sampleOverlapMap.put(newSample, 1);
|
||||
}
|
||||
|
||||
// if it's already been seen multiple times, give it a unique suffix and increment the value
|
||||
else if ( occurrences >= 2 ) {
|
||||
String uniqueName = newSample + "." + rodName;
|
||||
samples.add(uniqueName);
|
||||
rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), uniqueName);
|
||||
sampleOverlapMap.put(newSample, occurrences + 1);
|
||||
}
|
||||
|
||||
// if this is the second occurrence of the sample name, uniquify both of them
|
||||
else { // occurrences == 2
|
||||
|
||||
// remove the 1st occurrence, uniquify it, and add it back
|
||||
samples.remove(newSample);
|
||||
String uniqueName1 = null;
|
||||
for ( Map.Entry<Pair<String, String>, String> entry : rodNamesToSampleNames.entrySet() ) {
|
||||
if ( entry.getValue().equals(newSample) ) {
|
||||
uniqueName1 = newSample + "." + entry.getKey().first;
|
||||
entry.setValue(uniqueName1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
samples.add(uniqueName1);
|
||||
|
||||
// add the second one
|
||||
String uniqueName2 = newSample + "." + rodName;
|
||||
samples.add(uniqueName2);
|
||||
rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), uniqueName2);
|
||||
|
||||
sampleOverlapMap.put(newSample, 2);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -9,7 +9,6 @@ import org.broadinstitute.sting.utils.Utils;
|
|||
import org.broadinstitute.sting.utils.genotype.*;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
/**
|
||||
* A set of static utility methods for common operations on VCF files/records.
|
||||
|
|
@ -47,80 +46,6 @@ public class VCFUtils {
|
|||
return fields;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap
|
||||
* (e.g. sampleX.1, sampleX.2, ...)
|
||||
* When finished, samples contains the uniquified sample names and rodNamesToSampleNames contains a mapping
|
||||
* from rod/sample pairs to the new uniquified names
|
||||
*
|
||||
* @param toolkit GATK engine
|
||||
* @param samples set to store the sample names
|
||||
* @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names
|
||||
*/
|
||||
public static void getUniquifiedSamplesFromRods(GenomeAnalysisEngine toolkit, Set<String> samples, Map<Pair<String, String>, String> rodNamesToSampleNames) {
|
||||
|
||||
// keep a map of sample name to occurrences encountered
|
||||
HashMap<String, Integer> sampleOverlapMap = new HashMap<String, Integer>();
|
||||
|
||||
// iterate to get all of the sample names
|
||||
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
ReferenceOrderedData rod = source.getReferenceOrderedData();
|
||||
if ( rod.getType().equals(RodVCF.class) ) {
|
||||
VCFReader reader = new VCFReader(rod.getFile());
|
||||
Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
|
||||
for ( String sample : vcfSamples )
|
||||
addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, rod.getName());
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void addUniqueSample(Set<String> samples, Map<String, Integer> sampleOverlapMap, Map<Pair<String, String>, String> rodNamesToSampleNames, String newSample, String rodName) {
|
||||
|
||||
// how many occurrences have we seen so far?
|
||||
Integer occurrences = sampleOverlapMap.get(newSample);
|
||||
|
||||
// if this is the first one, just add it to the list of samples
|
||||
if ( occurrences == null ) {
|
||||
samples.add(newSample);
|
||||
rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), newSample);
|
||||
sampleOverlapMap.put(newSample, 1);
|
||||
}
|
||||
|
||||
// if it's already been seen multiple times, give it a unique suffix and increment the value
|
||||
else if ( occurrences >= 2 ) {
|
||||
String uniqueName = newSample + "." + rodName;
|
||||
samples.add(uniqueName);
|
||||
rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), uniqueName);
|
||||
sampleOverlapMap.put(newSample, occurrences + 1);
|
||||
}
|
||||
|
||||
// if this is the second occurrence of the sample name, uniquify both of them
|
||||
else { // occurrences == 2
|
||||
|
||||
// remove the 1st occurrence, uniquify it, and add it back
|
||||
samples.remove(newSample);
|
||||
String uniqueName1 = null;
|
||||
for ( Entry<Pair<String, String>, String> entry : rodNamesToSampleNames.entrySet() ) {
|
||||
if ( entry.getValue().equals(newSample) ) {
|
||||
uniqueName1 = newSample + "." + entry.getKey().first;
|
||||
entry.setValue(uniqueName1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
samples.add(uniqueName1);
|
||||
|
||||
// add the second one
|
||||
String uniqueName2 = newSample + "." + rodName;
|
||||
samples.add(uniqueName2);
|
||||
rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), uniqueName2);
|
||||
|
||||
sampleOverlapMap.put(newSample, 2);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges various vcf records into a single one using the mapping from rodNamesToSampleNames to get unique sample names
|
||||
*
|
||||
|
|
|
|||
Loading…
Reference in New Issue