Pulled sample/header merging routines out of CombineVariants and into util classes. Added more generalized methods for retrieving samples. Updated the Beagle walkers to use these methods.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3764 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
0c4a32843c
commit
8086ab1f75
|
|
@ -68,81 +68,17 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
vcfWriter = new VCFWriter(out, true);
|
||||
validateAnnotateUnionArguments();
|
||||
|
||||
// todo -- need to merge headers in an intelligent way
|
||||
Map<String, VCFHeader> vcfRods = SampleUtils.getVCFHeadersFromRods(getToolkit(), null);
|
||||
Set<String> samples = SampleUtils.getSampleList(vcfRods, genotypeMergeOption);
|
||||
|
||||
Map<String, VCFHeader> vcfRods = SampleUtils.getRodsWithVCFHeader(getToolkit(), null);
|
||||
Set<String> samples = getSampleList(vcfRods, genotypeMergeOption);
|
||||
|
||||
Set<VCFHeaderLine> headerLines = smartMergeHeaders(vcfRods.values());
|
||||
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
|
||||
headerLines.add(new VCFHeaderLine("source", "CombineVariants"));
|
||||
headerLines.add(new VCFInfoHeaderLine("set", 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants", VCFHeaderVersion.VCF4_0));
|
||||
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
|
||||
}
|
||||
|
||||
// todo -- Eric, where's a better place to put this?
|
||||
public static Set<String> getSampleList(Map<String, VCFHeader> headers, VariantContextUtils.GenotypeMergeType mergeOption ) {
|
||||
Set<String> samples = new TreeSet<String>();
|
||||
for ( Map.Entry<String, VCFHeader> val : headers.entrySet() ) {
|
||||
VCFHeader header = val.getValue();
|
||||
for ( String sample : header.getGenotypeSamples() ) {
|
||||
samples.add(VariantContextUtils.mergedSampleName(val.getKey(), sample, mergeOption == VariantContextUtils.GenotypeMergeType.UNIQUIFY));
|
||||
}
|
||||
}
|
||||
|
||||
return samples;
|
||||
}
|
||||
|
||||
// todo -- Eric, where's a better place to put this?
|
||||
public static Set<VCFHeaderLine> smartMergeHeaders(Collection<VCFHeader> headers) throws IllegalStateException {
|
||||
HashMap<String, VCFHeaderLine> map = new HashMap<String, VCFHeaderLine>(); // from KEY.NAME -> line
|
||||
HashSet<VCFHeaderLine> lines = new HashSet<VCFHeaderLine>();
|
||||
|
||||
// todo -- needs to remove all version headers from sources and add its own VCF version line
|
||||
for ( VCFHeader source : headers ) {
|
||||
//System.out.printf("Merging in header %s%n", source);
|
||||
for ( VCFHeaderLine line : source.getMetaData()) {
|
||||
String key = line.getKey();
|
||||
if ( line instanceof VCFNamedHeaderLine ) key = key + "." + ((VCFNamedHeaderLine) line).getName();
|
||||
|
||||
if ( map.containsKey(key) ) {
|
||||
VCFHeaderLine other = map.get(key);
|
||||
if ( line.equals(other) )
|
||||
continue;
|
||||
// System.out.printf("equals duplicate key %s%n", line);
|
||||
else if ( ! line.getClass().equals(other.getClass()) )
|
||||
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||
else if ( line instanceof VCFFilterHeaderLine ) {
|
||||
String lineName = ((VCFFilterHeaderLine) line).getName();
|
||||
String otherName = ((VCFFilterHeaderLine) other).getName();
|
||||
if ( ! lineName.equals(otherName) )
|
||||
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||
} else if ( line instanceof VCFCompoundHeaderLine ) {
|
||||
VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line;
|
||||
VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other;
|
||||
|
||||
// if the names are the same, but the values are different, we need to quit
|
||||
if (! (compLine).equalsExcludingDescription(compOther) )
|
||||
throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other );
|
||||
if ( ! compLine.getDescription().equals(compOther) )
|
||||
logger.warn(String.format("Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine));
|
||||
} else {
|
||||
// we are not equal, but we're not anything special either
|
||||
logger.warn(String.format("Ignoring header line already in map: this header line = " + line + " already present header = " + other));
|
||||
}
|
||||
} else {
|
||||
line.setVersion(VCFHeaderVersion.VCF4_0);
|
||||
map.put(key, line);
|
||||
//System.out.printf("Adding header line %s%n", line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new HashSet<VCFHeaderLine>(map.values());
|
||||
}
|
||||
|
||||
|
||||
private void validateAnnotateUnionArguments() {
|
||||
Set<String> rodNames = SampleUtils.getRodsNamesWithVCFHeader(getToolkit(), null);
|
||||
Set<String> rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null);
|
||||
|
||||
if ( genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE && PRIORITY_STRING == null )
|
||||
throw new StingException("Priority string must be provided if you want to prioritize genotypes");
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
|||
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
|
||||
import org.broad.tribble.vcf.*;
|
||||
|
|
@ -65,13 +66,10 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
|
||||
// protected HashMap<String,BeagleSampleRecord> beagleSampleRecords;
|
||||
|
||||
TreeSet<String> samples = null;
|
||||
|
||||
|
||||
private final double MIN_PROB_ERROR = 0.000001;
|
||||
private final double MAX_GENOTYPE_QUALITY = 6.0;
|
||||
|
||||
private void initialize(Set<String> sampleNames) {
|
||||
public void initialize() {
|
||||
|
||||
// setup the header fields
|
||||
|
||||
|
|
@ -86,13 +84,11 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
// Open output file specified by output VCF ROD
|
||||
vcfWriter = new VCFWriter(new File(OUTPUT_FILE));
|
||||
|
||||
samples = new TreeSet<String>(sampleNames);
|
||||
Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(INPUT_ROD_NAME));
|
||||
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
||||
vcfWriter.writeHeader(vcfHeader);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
if ( tracker == null )
|
||||
|
|
@ -103,10 +99,6 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
if ( vc_input == null )
|
||||
return 0;
|
||||
|
||||
if ( samples == null ) {
|
||||
initialize(vc_input.getSampleNames());
|
||||
}
|
||||
|
||||
List<Object> r2rods = tracker.getReferenceMetaData("beagleR2");
|
||||
|
||||
// ignore places where we don't have a variant
|
||||
|
|
@ -267,7 +259,6 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
|
|||
vcfWriter.add(VariantContextUtils.modifyAttributes(filteredVC, attributes), new byte[]{ref.getBase()});
|
||||
|
||||
return 1;
|
||||
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
|
|
|
|||
|
|
@ -39,6 +39,7 @@ import org.broadinstitute.sting.gatk.walkers.RMD;
|
|||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
|
@ -58,13 +59,15 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
@Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
|
||||
public double insertedNoCallRate = 0;
|
||||
|
||||
private TreeSet<String> samples = null;
|
||||
private Set<String> samples = null;
|
||||
private Random generator;
|
||||
|
||||
private void initialize(Set<String> sampleNames) {
|
||||
public void initialize() {
|
||||
generator = new Random();
|
||||
|
||||
samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(ROD_NAME));
|
||||
|
||||
beagleWriter.print("marker alleleA alleleB");
|
||||
samples = new TreeSet<String>(sampleNames);
|
||||
for ( String sample : samples )
|
||||
beagleWriter.print(String.format(" %s %s %s", sample, sample, sample));
|
||||
|
||||
|
|
@ -83,10 +86,6 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
if ( vc_eval == null || vc_eval.isFiltered() )
|
||||
return 0;
|
||||
|
||||
if ( samples == null ) {
|
||||
initialize(vc_eval.getSampleNames());
|
||||
}
|
||||
|
||||
// output marker ID to Beagle input file
|
||||
beagleWriter.print(String.format("%s ", vc_eval.getLocation().toString()));
|
||||
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ import net.sf.samtools.SAMFileHeader;
|
|||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broad.tribble.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
|
@ -83,20 +84,20 @@ public class SampleUtils {
|
|||
*
|
||||
* @return the set of unique samples
|
||||
*/
|
||||
public static Set<String> getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit, Set<String> rodNames) {
|
||||
public static Set<String> getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
|
||||
Set<String> samples = new TreeSet<String>();
|
||||
|
||||
for ( VCFHeader header : getRodsWithVCFHeader(toolkit, rodNames).values() )
|
||||
for ( VCFHeader header : getVCFHeadersFromRods(toolkit, rodNames).values() )
|
||||
samples.addAll(header.getGenotypeSamples());
|
||||
|
||||
return samples;
|
||||
}
|
||||
|
||||
public static Set<String> getRodsNamesWithVCFHeader(GenomeAnalysisEngine toolkit, Set<String> rodNames) {
|
||||
return getRodsWithVCFHeader(toolkit, rodNames).keySet();
|
||||
public static Set<String> getRodNamesWithVCFHeader(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
|
||||
return getVCFHeadersFromRods(toolkit, rodNames).keySet();
|
||||
}
|
||||
|
||||
public static Map<String, VCFHeader> getRodsWithVCFHeader(GenomeAnalysisEngine toolkit, Set<String> rodNames) {
|
||||
public static Map<String, VCFHeader> getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
|
||||
Map<String, VCFHeader> data = new HashMap<String, VCFHeader>();
|
||||
|
||||
// iterate to get all of the sample names
|
||||
|
|
@ -123,6 +124,25 @@ public class SampleUtils {
|
|||
}
|
||||
}
|
||||
|
||||
public static Set<String> getSampleListWithVCFHeader(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
|
||||
return getSampleList(SampleUtils.getVCFHeadersFromRods(toolkit, rodNames));
|
||||
}
|
||||
|
||||
public static Set<String> getSampleList(Map<String, VCFHeader> headers) {
|
||||
return getSampleList(headers, VariantContextUtils.GenotypeMergeType.PRIORITIZE);
|
||||
}
|
||||
|
||||
public static Set<String> getSampleList(Map<String, VCFHeader> headers, VariantContextUtils.GenotypeMergeType mergeOption) {
|
||||
Set<String> samples = new TreeSet<String>();
|
||||
for ( Map.Entry<String, VCFHeader> val : headers.entrySet() ) {
|
||||
VCFHeader header = val.getValue();
|
||||
for ( String sample : header.getGenotypeSamples() ) {
|
||||
samples.add(VariantContextUtils.mergedSampleName(val.getKey(), sample, mergeOption == VariantContextUtils.GenotypeMergeType.UNIQUIFY));
|
||||
}
|
||||
}
|
||||
|
||||
return samples;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap
|
||||
|
|
@ -141,7 +161,7 @@ public class SampleUtils {
|
|||
|
||||
// iterate to get all of the sample names
|
||||
|
||||
for ( Map.Entry<String, VCFHeader> pair : getRodsWithVCFHeader(toolkit, null).entrySet() ) {
|
||||
for ( Map.Entry<String, VCFHeader> pair : getVCFHeadersFromRods(toolkit, null).entrySet() ) {
|
||||
Set<String> vcfSamples = pair.getValue().getGenotypeSamples();
|
||||
for ( String sample : vcfSamples )
|
||||
addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, pair.getKey());
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
|||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -136,6 +137,53 @@ public class VCFUtils {
|
|||
params.getGenotypeRecords());
|
||||
}
|
||||
|
||||
public static Set<VCFHeaderLine> smartMergeHeaders(Collection<VCFHeader> headers, Logger logger) throws IllegalStateException {
|
||||
HashMap<String, VCFHeaderLine> map = new HashMap<String, VCFHeaderLine>(); // from KEY.NAME -> line
|
||||
HashSet<VCFHeaderLine> lines = new HashSet<VCFHeaderLine>();
|
||||
|
||||
// todo -- needs to remove all version headers from sources and add its own VCF version line
|
||||
for ( VCFHeader source : headers ) {
|
||||
//System.out.printf("Merging in header %s%n", source);
|
||||
for ( VCFHeaderLine line : source.getMetaData()) {
|
||||
String key = line.getKey();
|
||||
if ( line instanceof VCFNamedHeaderLine )
|
||||
key = key + "." + ((VCFNamedHeaderLine) line).getName();
|
||||
|
||||
if ( map.containsKey(key) ) {
|
||||
VCFHeaderLine other = map.get(key);
|
||||
if ( line.equals(other) )
|
||||
continue;
|
||||
else if ( ! line.getClass().equals(other.getClass()) )
|
||||
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||
else if ( line instanceof VCFFilterHeaderLine ) {
|
||||
String lineName = ((VCFFilterHeaderLine) line).getName();
|
||||
String otherName = ((VCFFilterHeaderLine) other).getName();
|
||||
if ( ! lineName.equals(otherName) )
|
||||
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||
} else if ( line instanceof VCFCompoundHeaderLine ) {
|
||||
VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line;
|
||||
VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other;
|
||||
|
||||
// if the names are the same, but the values are different, we need to quit
|
||||
if (! (compLine).equalsExcludingDescription(compOther) )
|
||||
throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other );
|
||||
if ( ! compLine.getDescription().equals(compOther) )
|
||||
logger.warn(String.format("Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine));
|
||||
} else {
|
||||
// we are not equal, but we're not anything special either
|
||||
logger.warn(String.format("Ignoring header line already in map: this header line = " + line + " already present header = " + other));
|
||||
}
|
||||
} else {
|
||||
line.setVersion(VCFHeaderVersion.VCF4_0);
|
||||
map.put(key, line);
|
||||
//System.out.printf("Adding header line %s%n", line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new HashSet<VCFHeaderLine>(map.values());
|
||||
}
|
||||
|
||||
/**
|
||||
* create the VCF genotype record
|
||||
*
|
||||
|
|
|
|||
Loading…
Reference in New Issue