Pulled sample/header merging routines out of CombineVariants and into util classes. Added more generalized methods for retrieving samples. Updated the Beagle walkers to use these methods.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3764 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-07-12 16:51:54 +00:00
parent 0c4a32843c
commit 8086ab1f75
5 changed files with 223 additions and 229 deletions

View File

@ -68,81 +68,17 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
vcfWriter = new VCFWriter(out, true);
validateAnnotateUnionArguments();
// todo -- need to merge headers in an intelligent way
Map<String, VCFHeader> vcfRods = SampleUtils.getVCFHeadersFromRods(getToolkit(), null);
Set<String> samples = SampleUtils.getSampleList(vcfRods, genotypeMergeOption);
Map<String, VCFHeader> vcfRods = SampleUtils.getRodsWithVCFHeader(getToolkit(), null);
Set<String> samples = getSampleList(vcfRods, genotypeMergeOption);
Set<VCFHeaderLine> headerLines = smartMergeHeaders(vcfRods.values());
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
headerLines.add(new VCFHeaderLine("source", "CombineVariants"));
headerLines.add(new VCFInfoHeaderLine("set", 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants", VCFHeaderVersion.VCF4_0));
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
}
// todo -- Eric, where's a better place to put this?
public static Set<String> getSampleList(Map<String, VCFHeader> headers, VariantContextUtils.GenotypeMergeType mergeOption ) {
Set<String> samples = new TreeSet<String>();
for ( Map.Entry<String, VCFHeader> val : headers.entrySet() ) {
VCFHeader header = val.getValue();
for ( String sample : header.getGenotypeSamples() ) {
samples.add(VariantContextUtils.mergedSampleName(val.getKey(), sample, mergeOption == VariantContextUtils.GenotypeMergeType.UNIQUIFY));
}
}
return samples;
}
// todo -- Eric, where's a better place to put this?
public static Set<VCFHeaderLine> smartMergeHeaders(Collection<VCFHeader> headers) throws IllegalStateException {
HashMap<String, VCFHeaderLine> map = new HashMap<String, VCFHeaderLine>(); // from KEY.NAME -> line
HashSet<VCFHeaderLine> lines = new HashSet<VCFHeaderLine>();
// todo -- needs to remove all version headers from sources and add its own VCF version line
for ( VCFHeader source : headers ) {
//System.out.printf("Merging in header %s%n", source);
for ( VCFHeaderLine line : source.getMetaData()) {
String key = line.getKey();
if ( line instanceof VCFNamedHeaderLine ) key = key + "." + ((VCFNamedHeaderLine) line).getName();
if ( map.containsKey(key) ) {
VCFHeaderLine other = map.get(key);
if ( line.equals(other) )
continue;
// System.out.printf("equals duplicate key %s%n", line);
else if ( ! line.getClass().equals(other.getClass()) )
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
else if ( line instanceof VCFFilterHeaderLine ) {
String lineName = ((VCFFilterHeaderLine) line).getName();
String otherName = ((VCFFilterHeaderLine) other).getName();
if ( ! lineName.equals(otherName) )
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
} else if ( line instanceof VCFCompoundHeaderLine ) {
VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line;
VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other;
// if the names are the same, but the values are different, we need to quit
if (! (compLine).equalsExcludingDescription(compOther) )
throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other );
if ( ! compLine.getDescription().equals(compOther) )
logger.warn(String.format("Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine));
} else {
// we are not equal, but we're not anything special either
logger.warn(String.format("Ignoring header line already in map: this header line = " + line + " already present header = " + other));
}
} else {
line.setVersion(VCFHeaderVersion.VCF4_0);
map.put(key, line);
//System.out.printf("Adding header line %s%n", line);
}
}
}
return new HashSet<VCFHeaderLine>(map.values());
}
private void validateAnnotateUnionArguments() {
Set<String> rodNames = SampleUtils.getRodsNamesWithVCFHeader(getToolkit(), null);
Set<String> rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null);
if ( genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE && PRIORITY_STRING == null )
throw new StingException("Priority string must be provided if you want to prioritize genotypes");

View File

@ -36,6 +36,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
import org.broad.tribble.vcf.*;
@ -65,13 +66,10 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
// protected HashMap<String,BeagleSampleRecord> beagleSampleRecords;
TreeSet<String> samples = null;
private final double MIN_PROB_ERROR = 0.000001;
private final double MAX_GENOTYPE_QUALITY = 6.0;
private void initialize(Set<String> sampleNames) {
public void initialize() {
// setup the header fields
@ -86,188 +84,181 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
// Open output file specified by output VCF ROD
vcfWriter = new VCFWriter(new File(OUTPUT_FILE));
samples = new TreeSet<String>(sampleNames);
Set<String> samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(INPUT_ROD_NAME));
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);
}
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
if ( tracker == null )
return 0;
if ( tracker == null )
return 0;
GenomeLoc loc = context.getLocation();
VariantContext vc_input = tracker.getVariantContext(ref,"inputvcf", null, loc, false);
if ( vc_input == null )
return 0;
GenomeLoc loc = context.getLocation();
VariantContext vc_input = tracker.getVariantContext(ref,"inputvcf", null, loc, false);
if ( vc_input == null )
return 0;
List<Object> r2rods = tracker.getReferenceMetaData("beagleR2");
if ( samples == null ) {
initialize(vc_input.getSampleNames());
}
// ignore places where we don't have a variant
if ( r2rods.size() == 0 )
return 0;
List<Object> r2rods = tracker.getReferenceMetaData("beagleR2");
BeagleFeature beagleR2Feature = (BeagleFeature)r2rods.get(0);
// ignore places where we don't have a variant
if ( r2rods.size() == 0 )
return 0;
List<Object> gProbsrods = tracker.getReferenceMetaData("beagleProbs");
BeagleFeature beagleR2Feature = (BeagleFeature)r2rods.get(0);
// ignore places where we don't have a variant
if ( gProbsrods.size() == 0 )
return 0;
List<Object> gProbsrods = tracker.getReferenceMetaData("beagleProbs");
BeagleFeature beagleProbsFeature = (BeagleFeature)gProbsrods.get(0);
// ignore places where we don't have a variant
if ( gProbsrods.size() == 0 )
return 0;
List<Object> gPhasedrods = tracker.getReferenceMetaData("beaglePhased");
BeagleFeature beagleProbsFeature = (BeagleFeature)gProbsrods.get(0);
// ignore places where we don't have a variant
if ( gPhasedrods.size() == 0 )
return 0;
List<Object> gPhasedrods = tracker.getReferenceMetaData("beaglePhased");
BeagleFeature beaglePhasedFeature = (BeagleFeature)gPhasedrods.get(0);
// ignore places where we don't have a variant
if ( gPhasedrods.size() == 0 )
return 0;
// get reference base for current position
byte refByte = ref.getBase();
BeagleFeature beaglePhasedFeature = (BeagleFeature)gPhasedrods.get(0);
// get reference base for current position
byte refByte = ref.getBase();
// make new Genotypes based on Beagle results
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(vc_input.getGenotypes().size());
// make new Genotypes based on Beagle results
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(vc_input.getGenotypes().size());
// for each genotype, create a new object with Beagle information on it
boolean genotypesChangedByBeagle = false;
for ( Map.Entry<String, Genotype> originalGenotypes : vc_input.getGenotypes().entrySet() ) {
// for each genotype, create a new object with Beagle information on it
boolean genotypesChangedByBeagle = false;
for ( Map.Entry<String, Genotype> originalGenotypes : vc_input.getGenotypes().entrySet() ) {
Genotype g = originalGenotypes.getValue();
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
Genotype g = originalGenotypes.getValue();
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
boolean genotypeIsPhased = true;
String sample = g.getSampleName();
boolean genotypeIsPhased = true;
String sample = g.getSampleName();
ArrayList<String> beagleProbabilities = beagleProbsFeature.getProbLikelihoods().get(sample);
ArrayList<String> beagleGenotypePairs = beaglePhasedFeature.getGenotypes().get(sample);
ArrayList<String> beagleProbabilities = beagleProbsFeature.getProbLikelihoods().get(sample);
ArrayList<String> beagleGenotypePairs = beaglePhasedFeature.getGenotypes().get(sample);
Allele originalAlleleA = g.getAllele(0);
Allele originalAlleleB = g.getAllele(1);
Allele originalAlleleA = g.getAllele(0);
Allele originalAlleleB = g.getAllele(1);
// We have phased genotype in hp. Need to set the isRef field in the allele.
List<Allele> alleles = new ArrayList<Allele>();
// We have phased genotype in hp. Need to set the isRef field in the allele.
List<Allele> alleles = new ArrayList<Allele>();
String alleleA = beagleGenotypePairs.get(0);
String alleleB = beagleGenotypePairs.get(1);
String alleleA = beagleGenotypePairs.get(0);
String alleleB = beagleGenotypePairs.get(1);
byte[] r = alleleA.getBytes();
byte rA = r[0];
byte[] r = alleleA.getBytes();
byte rA = r[0];
Boolean isRefA = (refByte == rA);
Boolean isRefA = (refByte == rA);
Allele refAllele = Allele.create(r, isRefA );
alleles.add(refAllele);
Allele refAllele = Allele.create(r, isRefA );
alleles.add(refAllele);
r = alleleB.getBytes();
byte rB = r[0];
r = alleleB.getBytes();
byte rB = r[0];
Boolean isRefB = (refByte == rB);
Allele altAllele = Allele.create(r,isRefB);
alleles.add(altAllele);
Boolean isRefB = (refByte == rB);
Allele altAllele = Allele.create(r,isRefB);
alleles.add(altAllele);
// Compute new GQ field = -10*log10Pr(Genotype call is wrong)
// Beagle gives probability that genotype is AA, AB and BB.
// Which, by definition, are prob of hom ref, het and hom var.
Double probWrongGenotype, genotypeQuality;
Double homRefProbability = Double.valueOf(beagleProbabilities.get(0));
Double hetProbability = Double.valueOf(beagleProbabilities.get(1));
Double homVarProbability = Double.valueOf(beagleProbabilities.get(2));
// Compute new GQ field = -10*log10Pr(Genotype call is wrong)
// Beagle gives probability that genotype is AA, AB and BB.
// Which, by definition, are prob of hom ref, het and hom var.
Double probWrongGenotype, genotypeQuality;
Double homRefProbability = Double.valueOf(beagleProbabilities.get(0));
Double hetProbability = Double.valueOf(beagleProbabilities.get(1));
Double homVarProbability = Double.valueOf(beagleProbabilities.get(2));
if (isRefA && isRefB) // HomRef call
probWrongGenotype = hetProbability + homVarProbability;
else if ((isRefB && !isRefA) || (isRefA && !isRefB))
probWrongGenotype = homRefProbability + homVarProbability;
else // HomVar call
probWrongGenotype = hetProbability + homRefProbability;
if (isRefA && isRefB) // HomRef call
probWrongGenotype = hetProbability + homVarProbability;
else if ((isRefB && !isRefA) || (isRefA && !isRefB))
probWrongGenotype = homRefProbability + homVarProbability;
else // HomVar call
probWrongGenotype = hetProbability + homRefProbability;
if (1-probWrongGenotype < noCallThreshold) {
// quality is bad: don't call genotype
alleles.clear();
refAllele = Allele.NO_CALL;
altAllele = Allele.NO_CALL;
alleles.add(refAllele);
alleles.add(altAllele);
genotypeIsPhased = false;
}
if (probWrongGenotype < MIN_PROB_ERROR)
genotypeQuality = MAX_GENOTYPE_QUALITY;
else
genotypeQuality = -log10(probWrongGenotype);
HashMap<String,Object> originalAttributes = new HashMap<String,Object>(g.getAttributes());
// get original encoding and add to keynotype attributes
String a1, a2, og;
if (originalAlleleA.isNoCall())
a1 = ".";
else if (originalAlleleA.isReference())
a1 = "0";
else
a1 = "1";
if (originalAlleleB.isNoCall())
a2 = ".";
else if (originalAlleleB.isReference())
a2 = "0";
else
a2 = "1";
og = a1+"/"+a2;
// See if Beagle switched genotypes
if (!((refAllele.equals(originalAlleleA) && altAllele.equals(originalAlleleB) ||
(refAllele.equals(originalAlleleB) && altAllele.equals(originalAlleleA))))){
genotypesChangedByBeagle = true;
originalAttributes.put("OG",og);
}
else {
originalAttributes.put("OG",".");
}
Genotype imputedGenotype = new Genotype(originalGenotypes.getKey(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased);
genotypes.put(originalGenotypes.getKey(), imputedGenotype);
}
VariantContext filteredVC = new VariantContext("outputvcf", vc_input.getLocation(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.getFilters(), vc_input.getAttributes());
Set<Allele> altAlleles = filteredVC.getAlternateAlleles();
StringBuffer altAlleleCountString = new StringBuffer();
for ( Allele allele : altAlleles ) {
if ( altAlleleCountString.length() > 0 )
altAlleleCountString.append(",");
altAlleleCountString.append(filteredVC.getChromosomeCount(allele));
}
HashMap<String, Object> attributes = new HashMap<String, Object>(filteredVC.getAttributes());
if ( filteredVC.getChromosomeCount() > 0 ) {
attributes.put(VCFConstants.ALLELE_NUMBER_KEY, String.format("%d", filteredVC.getChromosomeCount()));
if ( altAlleleCountString.length() > 0 ) {
attributes.put(VCFConstants.ALLELE_COUNT_KEY, altAlleleCountString.toString());
attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, String.format("%4.2f",
Double.valueOf(altAlleleCountString.toString())/(filteredVC.getChromosomeCount())));
if (1-probWrongGenotype < noCallThreshold) {
// quality is bad: don't call genotype
alleles.clear();
refAllele = Allele.NO_CALL;
altAllele = Allele.NO_CALL;
alleles.add(refAllele);
alleles.add(altAllele);
genotypeIsPhased = false;
}
}
attributes.put("GenotypesChanged", (genotypesChangedByBeagle)? "1":"0" );
attributes.put("R2", beagleR2Feature.getR2value().toString() );
if (probWrongGenotype < MIN_PROB_ERROR)
genotypeQuality = MAX_GENOTYPE_QUALITY;
else
genotypeQuality = -log10(probWrongGenotype);
vcfWriter.add(VariantContextUtils.modifyAttributes(filteredVC, attributes), new byte[]{ref.getBase()});
HashMap<String,Object> originalAttributes = new HashMap<String,Object>(g.getAttributes());
return 1;
// get original encoding and add to keynotype attributes
String a1, a2, og;
if (originalAlleleA.isNoCall())
a1 = ".";
else if (originalAlleleA.isReference())
a1 = "0";
else
a1 = "1";
if (originalAlleleB.isNoCall())
a2 = ".";
else if (originalAlleleB.isReference())
a2 = "0";
else
a2 = "1";
og = a1+"/"+a2;
// See if Beagle switched genotypes
if (!((refAllele.equals(originalAlleleA) && altAllele.equals(originalAlleleB) ||
(refAllele.equals(originalAlleleB) && altAllele.equals(originalAlleleA))))){
genotypesChangedByBeagle = true;
originalAttributes.put("OG",og);
}
else {
originalAttributes.put("OG",".");
}
Genotype imputedGenotype = new Genotype(originalGenotypes.getKey(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased);
genotypes.put(originalGenotypes.getKey(), imputedGenotype);
}
VariantContext filteredVC = new VariantContext("outputvcf", vc_input.getLocation(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.getFilters(), vc_input.getAttributes());
Set<Allele> altAlleles = filteredVC.getAlternateAlleles();
StringBuffer altAlleleCountString = new StringBuffer();
for ( Allele allele : altAlleles ) {
if ( altAlleleCountString.length() > 0 )
altAlleleCountString.append(",");
altAlleleCountString.append(filteredVC.getChromosomeCount(allele));
}
HashMap<String, Object> attributes = new HashMap<String, Object>(filteredVC.getAttributes());
if ( filteredVC.getChromosomeCount() > 0 ) {
attributes.put(VCFConstants.ALLELE_NUMBER_KEY, String.format("%d", filteredVC.getChromosomeCount()));
if ( altAlleleCountString.length() > 0 ) {
attributes.put(VCFConstants.ALLELE_COUNT_KEY, altAlleleCountString.toString());
attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, String.format("%4.2f",
Double.valueOf(altAlleleCountString.toString())/(filteredVC.getChromosomeCount())));
}
}
attributes.put("GenotypesChanged", (genotypesChangedByBeagle)? "1":"0" );
attributes.put("R2", beagleR2Feature.getR2value().toString() );
vcfWriter.add(VariantContextUtils.modifyAttributes(filteredVC, attributes), new byte[]{ref.getBase()});
return 1;
}
public Integer reduceInit() {

View File

@ -39,6 +39,7 @@ import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import java.io.PrintStream;
import java.util.*;
@ -58,13 +59,15 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
@Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
public double insertedNoCallRate = 0;
private TreeSet<String> samples = null;
private Set<String> samples = null;
private Random generator;
private void initialize(Set<String> sampleNames) {
public void initialize() {
generator = new Random();
samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(ROD_NAME));
beagleWriter.print("marker alleleA alleleB");
samples = new TreeSet<String>(sampleNames);
for ( String sample : samples )
beagleWriter.print(String.format(" %s %s %s", sample, sample, sample));
@ -83,10 +86,6 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
if ( vc_eval == null || vc_eval.isFiltered() )
return 0;
if ( samples == null ) {
initialize(vc_eval.getSampleNames());
}
// output marker ID to Beagle input file
beagleWriter.print(String.format("%s ", vc_eval.getLocation().toString()));

View File

@ -29,6 +29,7 @@ import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import org.broad.tribble.vcf.VCFHeader;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.utils.collections.Pair;
@ -83,20 +84,20 @@ public class SampleUtils {
*
* @return the set of unique samples
*/
public static Set<String> getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit, Set<String> rodNames) {
public static Set<String> getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
Set<String> samples = new TreeSet<String>();
for ( VCFHeader header : getRodsWithVCFHeader(toolkit, rodNames).values() )
for ( VCFHeader header : getVCFHeadersFromRods(toolkit, rodNames).values() )
samples.addAll(header.getGenotypeSamples());
return samples;
}
public static Set<String> getRodsNamesWithVCFHeader(GenomeAnalysisEngine toolkit, Set<String> rodNames) {
return getRodsWithVCFHeader(toolkit, rodNames).keySet();
public static Set<String> getRodNamesWithVCFHeader(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
return getVCFHeadersFromRods(toolkit, rodNames).keySet();
}
public static Map<String, VCFHeader> getRodsWithVCFHeader(GenomeAnalysisEngine toolkit, Set<String> rodNames) {
public static Map<String, VCFHeader> getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
Map<String, VCFHeader> data = new HashMap<String, VCFHeader>();
// iterate to get all of the sample names
@ -123,6 +124,25 @@ public class SampleUtils {
}
}
public static Set<String> getSampleListWithVCFHeader(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
return getSampleList(SampleUtils.getVCFHeadersFromRods(toolkit, rodNames));
}
public static Set<String> getSampleList(Map<String, VCFHeader> headers) {
return getSampleList(headers, VariantContextUtils.GenotypeMergeType.PRIORITIZE);
}
public static Set<String> getSampleList(Map<String, VCFHeader> headers, VariantContextUtils.GenotypeMergeType mergeOption) {
Set<String> samples = new TreeSet<String>();
for ( Map.Entry<String, VCFHeader> val : headers.entrySet() ) {
VCFHeader header = val.getValue();
for ( String sample : header.getGenotypeSamples() ) {
samples.add(VariantContextUtils.mergedSampleName(val.getKey(), sample, mergeOption == VariantContextUtils.GenotypeMergeType.UNIQUIFY));
}
}
return samples;
}
/**
* Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap
@ -141,7 +161,7 @@ public class SampleUtils {
// iterate to get all of the sample names
for ( Map.Entry<String, VCFHeader> pair : getRodsWithVCFHeader(toolkit, null).entrySet() ) {
for ( Map.Entry<String, VCFHeader> pair : getVCFHeadersFromRods(toolkit, null).entrySet() ) {
Set<String> vcfSamples = pair.getValue().getGenotypeSamples();
for ( String sample : vcfSamples )
addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, pair.getKey());

View File

@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.Utils;
import org.apache.log4j.Logger;
import java.util.*;
@ -136,6 +137,53 @@ public class VCFUtils {
params.getGenotypeRecords());
}
public static Set<VCFHeaderLine> smartMergeHeaders(Collection<VCFHeader> headers, Logger logger) throws IllegalStateException {
HashMap<String, VCFHeaderLine> map = new HashMap<String, VCFHeaderLine>(); // from KEY.NAME -> line
HashSet<VCFHeaderLine> lines = new HashSet<VCFHeaderLine>();
// todo -- needs to remove all version headers from sources and add its own VCF version line
for ( VCFHeader source : headers ) {
//System.out.printf("Merging in header %s%n", source);
for ( VCFHeaderLine line : source.getMetaData()) {
String key = line.getKey();
if ( line instanceof VCFNamedHeaderLine )
key = key + "." + ((VCFNamedHeaderLine) line).getName();
if ( map.containsKey(key) ) {
VCFHeaderLine other = map.get(key);
if ( line.equals(other) )
continue;
else if ( ! line.getClass().equals(other.getClass()) )
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
else if ( line instanceof VCFFilterHeaderLine ) {
String lineName = ((VCFFilterHeaderLine) line).getName();
String otherName = ((VCFFilterHeaderLine) other).getName();
if ( ! lineName.equals(otherName) )
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
} else if ( line instanceof VCFCompoundHeaderLine ) {
VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line;
VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other;
// if the names are the same, but the values are different, we need to quit
if (! (compLine).equalsExcludingDescription(compOther) )
throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other );
if ( ! compLine.getDescription().equals(compOther) )
logger.warn(String.format("Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine));
} else {
// we are not equal, but we're not anything special either
logger.warn(String.format("Ignoring header line already in map: this header line = " + line + " already present header = " + other));
}
} else {
line.setVersion(VCFHeaderVersion.VCF4_0);
map.put(key, line);
//System.out.printf("Adding header line %s%n", line);
}
}
}
return new HashSet<VCFHeaderLine>(map.values());
}
/**
* create the VCF genotype record
*