diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 897c0f2fb..7e40741d2 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -68,81 +68,17 @@ public class CombineVariants extends RodWalker { vcfWriter = new VCFWriter(out, true); validateAnnotateUnionArguments(); - // todo -- need to merge headers in an intelligent way + Map vcfRods = SampleUtils.getVCFHeadersFromRods(getToolkit(), null); + Set samples = SampleUtils.getSampleList(vcfRods, genotypeMergeOption); - Map vcfRods = SampleUtils.getRodsWithVCFHeader(getToolkit(), null); - Set samples = getSampleList(vcfRods, genotypeMergeOption); - - Set headerLines = smartMergeHeaders(vcfRods.values()); + Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); headerLines.add(new VCFHeaderLine("source", "CombineVariants")); headerLines.add(new VCFInfoHeaderLine("set", 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants", VCFHeaderVersion.VCF4_0)); vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); } - // todo -- Eric, where's a better place to put this? - public static Set getSampleList(Map headers, VariantContextUtils.GenotypeMergeType mergeOption ) { - Set samples = new TreeSet(); - for ( Map.Entry val : headers.entrySet() ) { - VCFHeader header = val.getValue(); - for ( String sample : header.getGenotypeSamples() ) { - samples.add(VariantContextUtils.mergedSampleName(val.getKey(), sample, mergeOption == VariantContextUtils.GenotypeMergeType.UNIQUIFY)); - } - } - - return samples; - } - - // todo -- Eric, where's a better place to put this? - public static Set smartMergeHeaders(Collection headers) throws IllegalStateException { - HashMap map = new HashMap(); // from KEY.NAME -> line - HashSet lines = new HashSet(); - - // todo -- needs to remove all version headers from sources and add its own VCF version line - for ( VCFHeader source : headers ) { - //System.out.printf("Merging in header %s%n", source); - for ( VCFHeaderLine line : source.getMetaData()) { - String key = line.getKey(); - if ( line instanceof VCFNamedHeaderLine ) key = key + "." + ((VCFNamedHeaderLine) line).getName(); - - if ( map.containsKey(key) ) { - VCFHeaderLine other = map.get(key); - if ( line.equals(other) ) - continue; -// System.out.printf("equals duplicate key %s%n", line); - else if ( ! line.getClass().equals(other.getClass()) ) - throw new IllegalStateException("Incompatible header types: " + line + " " + other ); - else if ( line instanceof VCFFilterHeaderLine ) { - String lineName = ((VCFFilterHeaderLine) line).getName(); - String otherName = ((VCFFilterHeaderLine) other).getName(); - if ( ! lineName.equals(otherName) ) - throw new IllegalStateException("Incompatible header types: " + line + " " + other ); - } else if ( line instanceof VCFCompoundHeaderLine ) { - VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line; - VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other; - - // if the names are the same, but the values are different, we need to quit - if (! (compLine).equalsExcludingDescription(compOther) ) - throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other ); - if ( ! compLine.getDescription().equals(compOther) ) - logger.warn(String.format("Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine)); - } else { - // we are not equal, but we're not anything special either - logger.warn(String.format("Ignoring header line already in map: this header line = " + line + " already present header = " + other)); - } - } else { - line.setVersion(VCFHeaderVersion.VCF4_0); - map.put(key, line); - //System.out.printf("Adding header line %s%n", line); - } - } - } - - return new HashSet(map.values()); - } - - private void validateAnnotateUnionArguments() { - Set rodNames = SampleUtils.getRodsNamesWithVCFHeader(getToolkit(), null); + Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); if ( genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE && PRIORITY_STRING == null ) throw new StingException("Priority string must be provided if you want to prioritize genotypes"); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java index decf5682d..6d37b48d5 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java @@ -36,6 +36,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.RMD; import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils; import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; import org.broad.tribble.vcf.*; @@ -65,13 +66,10 @@ public class BeagleOutputToVCFWalker extends RodWalker { // protected HashMap beagleSampleRecords; - TreeSet samples = null; - - private final double MIN_PROB_ERROR = 0.000001; private final double MAX_GENOTYPE_QUALITY = 6.0; - private void initialize(Set sampleNames) { + public void initialize() { // setup the header fields @@ -86,188 +84,181 @@ public class BeagleOutputToVCFWalker extends RodWalker { // Open output file specified by output VCF ROD vcfWriter = new VCFWriter(new File(OUTPUT_FILE)); - samples = new TreeSet(sampleNames); + Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(INPUT_ROD_NAME)); final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); vcfWriter.writeHeader(vcfHeader); - } + public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + if ( tracker == null ) + return 0; - if ( tracker == null ) - return 0; + GenomeLoc loc = context.getLocation(); + VariantContext vc_input = tracker.getVariantContext(ref,"inputvcf", null, loc, false); + if ( vc_input == null ) + return 0; - GenomeLoc loc = context.getLocation(); - VariantContext vc_input = tracker.getVariantContext(ref,"inputvcf", null, loc, false); - if ( vc_input == null ) - return 0; + List r2rods = tracker.getReferenceMetaData("beagleR2"); - if ( samples == null ) { - initialize(vc_input.getSampleNames()); - } + // ignore places where we don't have a variant + if ( r2rods.size() == 0 ) + return 0; - List r2rods = tracker.getReferenceMetaData("beagleR2"); + BeagleFeature beagleR2Feature = (BeagleFeature)r2rods.get(0); - // ignore places where we don't have a variant - if ( r2rods.size() == 0 ) - return 0; + List gProbsrods = tracker.getReferenceMetaData("beagleProbs"); - BeagleFeature beagleR2Feature = (BeagleFeature)r2rods.get(0); + // ignore places where we don't have a variant + if ( gProbsrods.size() == 0 ) + return 0; - List gProbsrods = tracker.getReferenceMetaData("beagleProbs"); + BeagleFeature beagleProbsFeature = (BeagleFeature)gProbsrods.get(0); - // ignore places where we don't have a variant - if ( gProbsrods.size() == 0 ) - return 0; + List gPhasedrods = tracker.getReferenceMetaData("beaglePhased"); - BeagleFeature beagleProbsFeature = (BeagleFeature)gProbsrods.get(0); + // ignore places where we don't have a variant + if ( gPhasedrods.size() == 0 ) + return 0; - List gPhasedrods = tracker.getReferenceMetaData("beaglePhased"); + BeagleFeature beaglePhasedFeature = (BeagleFeature)gPhasedrods.get(0); - // ignore places where we don't have a variant - if ( gPhasedrods.size() == 0 ) - return 0; + // get reference base for current position + byte refByte = ref.getBase(); - BeagleFeature beaglePhasedFeature = (BeagleFeature)gPhasedrods.get(0); - - // get reference base for current position - byte refByte = ref.getBase(); - - // make new Genotypes based on Beagle results - Map genotypes = new HashMap(vc_input.getGenotypes().size()); + // make new Genotypes based on Beagle results + Map genotypes = new HashMap(vc_input.getGenotypes().size()); - // for each genotype, create a new object with Beagle information on it - boolean genotypesChangedByBeagle = false; - for ( Map.Entry originalGenotypes : vc_input.getGenotypes().entrySet() ) { + // for each genotype, create a new object with Beagle information on it + boolean genotypesChangedByBeagle = false; + for ( Map.Entry originalGenotypes : vc_input.getGenotypes().entrySet() ) { - Genotype g = originalGenotypes.getValue(); - Set filters = new LinkedHashSet(g.getFilters()); + Genotype g = originalGenotypes.getValue(); + Set filters = new LinkedHashSet(g.getFilters()); - boolean genotypeIsPhased = true; - String sample = g.getSampleName(); + boolean genotypeIsPhased = true; + String sample = g.getSampleName(); - ArrayList beagleProbabilities = beagleProbsFeature.getProbLikelihoods().get(sample); - ArrayList beagleGenotypePairs = beaglePhasedFeature.getGenotypes().get(sample); + ArrayList beagleProbabilities = beagleProbsFeature.getProbLikelihoods().get(sample); + ArrayList beagleGenotypePairs = beaglePhasedFeature.getGenotypes().get(sample); - Allele originalAlleleA = g.getAllele(0); - Allele originalAlleleB = g.getAllele(1); + Allele originalAlleleA = g.getAllele(0); + Allele originalAlleleB = g.getAllele(1); - // We have phased genotype in hp. Need to set the isRef field in the allele. - List alleles = new ArrayList(); + // We have phased genotype in hp. Need to set the isRef field in the allele. + List alleles = new ArrayList(); - String alleleA = beagleGenotypePairs.get(0); - String alleleB = beagleGenotypePairs.get(1); + String alleleA = beagleGenotypePairs.get(0); + String alleleB = beagleGenotypePairs.get(1); - byte[] r = alleleA.getBytes(); - byte rA = r[0]; + byte[] r = alleleA.getBytes(); + byte rA = r[0]; - Boolean isRefA = (refByte == rA); + Boolean isRefA = (refByte == rA); - Allele refAllele = Allele.create(r, isRefA ); - alleles.add(refAllele); + Allele refAllele = Allele.create(r, isRefA ); + alleles.add(refAllele); - r = alleleB.getBytes(); - byte rB = r[0]; + r = alleleB.getBytes(); + byte rB = r[0]; - Boolean isRefB = (refByte == rB); - Allele altAllele = Allele.create(r,isRefB); - alleles.add(altAllele); + Boolean isRefB = (refByte == rB); + Allele altAllele = Allele.create(r,isRefB); + alleles.add(altAllele); - // Compute new GQ field = -10*log10Pr(Genotype call is wrong) - // Beagle gives probability that genotype is AA, AB and BB. - // Which, by definition, are prob of hom ref, het and hom var. - Double probWrongGenotype, genotypeQuality; - Double homRefProbability = Double.valueOf(beagleProbabilities.get(0)); - Double hetProbability = Double.valueOf(beagleProbabilities.get(1)); - Double homVarProbability = Double.valueOf(beagleProbabilities.get(2)); + // Compute new GQ field = -10*log10Pr(Genotype call is wrong) + // Beagle gives probability that genotype is AA, AB and BB. + // Which, by definition, are prob of hom ref, het and hom var. + Double probWrongGenotype, genotypeQuality; + Double homRefProbability = Double.valueOf(beagleProbabilities.get(0)); + Double hetProbability = Double.valueOf(beagleProbabilities.get(1)); + Double homVarProbability = Double.valueOf(beagleProbabilities.get(2)); - if (isRefA && isRefB) // HomRef call - probWrongGenotype = hetProbability + homVarProbability; - else if ((isRefB && !isRefA) || (isRefA && !isRefB)) - probWrongGenotype = homRefProbability + homVarProbability; - else // HomVar call - probWrongGenotype = hetProbability + homRefProbability; + if (isRefA && isRefB) // HomRef call + probWrongGenotype = hetProbability + homVarProbability; + else if ((isRefB && !isRefA) || (isRefA && !isRefB)) + probWrongGenotype = homRefProbability + homVarProbability; + else // HomVar call + probWrongGenotype = hetProbability + homRefProbability; - if (1-probWrongGenotype < noCallThreshold) { - // quality is bad: don't call genotype - alleles.clear(); - refAllele = Allele.NO_CALL; - altAllele = Allele.NO_CALL; - alleles.add(refAllele); - alleles.add(altAllele); - genotypeIsPhased = false; - } - - if (probWrongGenotype < MIN_PROB_ERROR) - genotypeQuality = MAX_GENOTYPE_QUALITY; - else - genotypeQuality = -log10(probWrongGenotype); - - HashMap originalAttributes = new HashMap(g.getAttributes()); - - // get original encoding and add to keynotype attributes - String a1, a2, og; - if (originalAlleleA.isNoCall()) - a1 = "."; - else if (originalAlleleA.isReference()) - a1 = "0"; - else - a1 = "1"; - - if (originalAlleleB.isNoCall()) - a2 = "."; - else if (originalAlleleB.isReference()) - a2 = "0"; - else - a2 = "1"; - - og = a1+"/"+a2; - - // See if Beagle switched genotypes - if (!((refAllele.equals(originalAlleleA) && altAllele.equals(originalAlleleB) || - (refAllele.equals(originalAlleleB) && altAllele.equals(originalAlleleA))))){ - genotypesChangedByBeagle = true; - originalAttributes.put("OG",og); - } - else { - originalAttributes.put("OG","."); - } - Genotype imputedGenotype = new Genotype(originalGenotypes.getKey(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased); - - - genotypes.put(originalGenotypes.getKey(), imputedGenotype); - - } - - VariantContext filteredVC = new VariantContext("outputvcf", vc_input.getLocation(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.getFilters(), vc_input.getAttributes()); - - Set altAlleles = filteredVC.getAlternateAlleles(); - StringBuffer altAlleleCountString = new StringBuffer(); - for ( Allele allele : altAlleles ) { - if ( altAlleleCountString.length() > 0 ) - altAlleleCountString.append(","); - altAlleleCountString.append(filteredVC.getChromosomeCount(allele)); - } - - HashMap attributes = new HashMap(filteredVC.getAttributes()); - if ( filteredVC.getChromosomeCount() > 0 ) { - attributes.put(VCFConstants.ALLELE_NUMBER_KEY, String.format("%d", filteredVC.getChromosomeCount())); - if ( altAlleleCountString.length() > 0 ) { - attributes.put(VCFConstants.ALLELE_COUNT_KEY, altAlleleCountString.toString()); - attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, String.format("%4.2f", - Double.valueOf(altAlleleCountString.toString())/(filteredVC.getChromosomeCount()))); + if (1-probWrongGenotype < noCallThreshold) { + // quality is bad: don't call genotype + alleles.clear(); + refAllele = Allele.NO_CALL; + altAllele = Allele.NO_CALL; + alleles.add(refAllele); + alleles.add(altAllele); + genotypeIsPhased = false; } - } - attributes.put("GenotypesChanged", (genotypesChangedByBeagle)? "1":"0" ); - attributes.put("R2", beagleR2Feature.getR2value().toString() ); + if (probWrongGenotype < MIN_PROB_ERROR) + genotypeQuality = MAX_GENOTYPE_QUALITY; + else + genotypeQuality = -log10(probWrongGenotype); - vcfWriter.add(VariantContextUtils.modifyAttributes(filteredVC, attributes), new byte[]{ref.getBase()}); + HashMap originalAttributes = new HashMap(g.getAttributes()); - return 1; + // get original encoding and add to keynotype attributes + String a1, a2, og; + if (originalAlleleA.isNoCall()) + a1 = "."; + else if (originalAlleleA.isReference()) + a1 = "0"; + else + a1 = "1"; + if (originalAlleleB.isNoCall()) + a2 = "."; + else if (originalAlleleB.isReference()) + a2 = "0"; + else + a2 = "1"; + + og = a1+"/"+a2; + + // See if Beagle switched genotypes + if (!((refAllele.equals(originalAlleleA) && altAllele.equals(originalAlleleB) || + (refAllele.equals(originalAlleleB) && altAllele.equals(originalAlleleA))))){ + genotypesChangedByBeagle = true; + originalAttributes.put("OG",og); + } + else { + originalAttributes.put("OG","."); + } + Genotype imputedGenotype = new Genotype(originalGenotypes.getKey(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased); + + + genotypes.put(originalGenotypes.getKey(), imputedGenotype); + + } + + VariantContext filteredVC = new VariantContext("outputvcf", vc_input.getLocation(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.getFilters(), vc_input.getAttributes()); + + Set altAlleles = filteredVC.getAlternateAlleles(); + StringBuffer altAlleleCountString = new StringBuffer(); + for ( Allele allele : altAlleles ) { + if ( altAlleleCountString.length() > 0 ) + altAlleleCountString.append(","); + altAlleleCountString.append(filteredVC.getChromosomeCount(allele)); + } + + HashMap attributes = new HashMap(filteredVC.getAttributes()); + if ( filteredVC.getChromosomeCount() > 0 ) { + attributes.put(VCFConstants.ALLELE_NUMBER_KEY, String.format("%d", filteredVC.getChromosomeCount())); + if ( altAlleleCountString.length() > 0 ) { + attributes.put(VCFConstants.ALLELE_COUNT_KEY, altAlleleCountString.toString()); + attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, String.format("%4.2f", + Double.valueOf(altAlleleCountString.toString())/(filteredVC.getChromosomeCount()))); + } + } + + attributes.put("GenotypesChanged", (genotypesChangedByBeagle)? "1":"0" ); + attributes.put("R2", beagleR2Feature.getR2value().toString() ); + + vcfWriter.add(VariantContextUtils.modifyAttributes(filteredVC, attributes), new byte[]{ref.getBase()}); + + return 1; } public Integer reduceInit() { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java index 93392f99d..0694e827f 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ProduceBeagleInputWalker.java @@ -39,6 +39,7 @@ import org.broadinstitute.sting.gatk.walkers.RMD; import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.SampleUtils; import java.io.PrintStream; import java.util.*; @@ -58,13 +59,15 @@ public class ProduceBeagleInputWalker extends RodWalker { @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false) public double insertedNoCallRate = 0; - private TreeSet samples = null; + private Set samples = null; private Random generator; - private void initialize(Set sampleNames) { + public void initialize() { generator = new Random(); + + samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(ROD_NAME)); + beagleWriter.print("marker alleleA alleleB"); - samples = new TreeSet(sampleNames); for ( String sample : samples ) beagleWriter.print(String.format(" %s %s %s", sample, sample, sample)); @@ -83,10 +86,6 @@ public class ProduceBeagleInputWalker extends RodWalker { if ( vc_eval == null || vc_eval.isFiltered() ) return 0; - if ( samples == null ) { - initialize(vc_eval.getSampleNames()); - } - // output marker ID to Beagle input file beagleWriter.print(String.format("%s ", vc_eval.getLocation().toString())); diff --git a/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/java/src/org/broadinstitute/sting/utils/SampleUtils.java index 7cf41cac8..9fda7fbbf 100755 --- a/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -29,6 +29,7 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMReadGroupRecord; import org.broad.tribble.vcf.VCFHeader; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.utils.collections.Pair; @@ -83,20 +84,20 @@ public class SampleUtils { * * @return the set of unique samples */ - public static Set getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit, Set rodNames) { + public static Set getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit, Collection rodNames) { Set samples = new TreeSet(); - for ( VCFHeader header : getRodsWithVCFHeader(toolkit, rodNames).values() ) + for ( VCFHeader header : getVCFHeadersFromRods(toolkit, rodNames).values() ) samples.addAll(header.getGenotypeSamples()); return samples; } - public static Set getRodsNamesWithVCFHeader(GenomeAnalysisEngine toolkit, Set rodNames) { - return getRodsWithVCFHeader(toolkit, rodNames).keySet(); + public static Set getRodNamesWithVCFHeader(GenomeAnalysisEngine toolkit, Collection rodNames) { + return getVCFHeadersFromRods(toolkit, rodNames).keySet(); } - public static Map getRodsWithVCFHeader(GenomeAnalysisEngine toolkit, Set rodNames) { + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, Collection rodNames) { Map data = new HashMap(); // iterate to get all of the sample names @@ -123,6 +124,25 @@ public class SampleUtils { } } + public static Set getSampleListWithVCFHeader(GenomeAnalysisEngine toolkit, Collection rodNames) { + return getSampleList(SampleUtils.getVCFHeadersFromRods(toolkit, rodNames)); + } + + public static Set getSampleList(Map headers) { + return getSampleList(headers, VariantContextUtils.GenotypeMergeType.PRIORITIZE); + } + + public static Set getSampleList(Map headers, VariantContextUtils.GenotypeMergeType mergeOption) { + Set samples = new TreeSet(); + for ( Map.Entry val : headers.entrySet() ) { + VCFHeader header = val.getValue(); + for ( String sample : header.getGenotypeSamples() ) { + samples.add(VariantContextUtils.mergedSampleName(val.getKey(), sample, mergeOption == VariantContextUtils.GenotypeMergeType.UNIQUIFY)); + } + } + + return samples; + } /** * Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap @@ -141,7 +161,7 @@ public class SampleUtils { // iterate to get all of the sample names - for ( Map.Entry pair : getRodsWithVCFHeader(toolkit, null).entrySet() ) { + for ( Map.Entry pair : getVCFHeadersFromRods(toolkit, null).entrySet() ) { Set vcfSamples = pair.getValue().getGenotypeSamples(); for ( String sample : vcfSamples ) addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, pair.getKey()); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java index 02095a474..470880c27 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.Utils; +import org.apache.log4j.Logger; import java.util.*; @@ -136,6 +137,53 @@ public class VCFUtils { params.getGenotypeRecords()); } + public static Set smartMergeHeaders(Collection headers, Logger logger) throws IllegalStateException { + HashMap map = new HashMap(); // from KEY.NAME -> line + HashSet lines = new HashSet(); + + // todo -- needs to remove all version headers from sources and add its own VCF version line + for ( VCFHeader source : headers ) { + //System.out.printf("Merging in header %s%n", source); + for ( VCFHeaderLine line : source.getMetaData()) { + String key = line.getKey(); + if ( line instanceof VCFNamedHeaderLine ) + key = key + "." + ((VCFNamedHeaderLine) line).getName(); + + if ( map.containsKey(key) ) { + VCFHeaderLine other = map.get(key); + if ( line.equals(other) ) + continue; + else if ( ! line.getClass().equals(other.getClass()) ) + throw new IllegalStateException("Incompatible header types: " + line + " " + other ); + else if ( line instanceof VCFFilterHeaderLine ) { + String lineName = ((VCFFilterHeaderLine) line).getName(); + String otherName = ((VCFFilterHeaderLine) other).getName(); + if ( ! lineName.equals(otherName) ) + throw new IllegalStateException("Incompatible header types: " + line + " " + other ); + } else if ( line instanceof VCFCompoundHeaderLine ) { + VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line; + VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other; + + // if the names are the same, but the values are different, we need to quit + if (! (compLine).equalsExcludingDescription(compOther) ) + throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other ); + if ( ! compLine.getDescription().equals(compOther) ) + logger.warn(String.format("Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine)); + } else { + // we are not equal, but we're not anything special either + logger.warn(String.format("Ignoring header line already in map: this header line = " + line + " already present header = " + other)); + } + } else { + line.setVersion(VCFHeaderVersion.VCF4_0); + map.put(key, line); + //System.out.printf("Adding header line %s%n", line); + } + } + } + + return new HashSet(map.values()); + } + /** * create the VCF genotype record *