2010-06-29 07:54:38 +08:00
|
|
|
package org.broadinstitute.sting.oneoffprojects.walkers;
|
|
|
|
|
|
2010-07-19 12:49:27 +08:00
|
|
|
import org.broad.tribble.readers.AsciiLineReader;
|
2010-08-06 02:47:53 +08:00
|
|
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
2010-06-29 07:54:38 +08:00
|
|
|
import org.broad.tribble.vcf.*;
|
|
|
|
|
import org.broadinstitute.sting.commandline.Argument;
|
|
|
|
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|
|
|
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|
|
|
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
|
|
|
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|
|
|
|
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
|
|
|
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
2010-07-14 12:56:58 +08:00
|
|
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
|
|
|
|
import org.broadinstitute.sting.utils.StingException;
|
|
|
|
|
import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils;
|
|
|
|
|
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
|
2010-06-29 07:54:38 +08:00
|
|
|
|
|
|
|
|
import java.io.File;
|
|
|
|
|
import java.io.FileInputStream;
|
|
|
|
|
import java.io.FileNotFoundException;
|
|
|
|
|
import java.util.*;
|
|
|
|
|
/*
|
|
|
|
|
* Copyright (c) 2009 The Broad Institute
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person
|
|
|
|
|
* obtaining a copy of this software and associated documentation
|
|
|
|
|
* files (the "Software"), to deal in the Software without
|
|
|
|
|
* restriction, including without limitation the rights to use,
|
|
|
|
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
|
* copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following
|
|
|
|
|
* conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice shall be
|
|
|
|
|
* included in all copies or substantial portions of the Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
|
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
|
|
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
|
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
|
|
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
|
|
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
|
|
|
* OTHER DEALINGS IN THE SOFTWARE.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Prints out all of the RODs in the input data set. Data is rendered using the toString() method
|
|
|
|
|
* of the given ROD.
|
|
|
|
|
*/
|
|
|
|
|
public class VCF4WriterTestWalker extends RodWalker<Integer, Integer> {
|
|
|
|
|
private VCFWriter vcfWriter;
|
|
|
|
|
|
|
|
|
|
@Argument(fullName="output_file", shortName="output", doc="VCF file to which output should be written", required=true)
|
|
|
|
|
private String OUTPUT_FILE = null;
|
|
|
|
|
|
|
|
|
|
|
2010-07-22 10:36:45 +08:00
|
|
|
public static final String INPUT_ROD_NAME = "variant";
|
2010-06-29 07:54:38 +08:00
|
|
|
|
|
|
|
|
protected static String line = null;
|
|
|
|
|
final TreeSet<String> samples = new TreeSet<String>();
|
First step in major cleanup/redo of VCF functionality. Specifically, now:
a) VCF track name can work again with 3.3 or 4.0 VCF's when specifying -B name,VCF,file. Code will read header and parse automatically the version.
b) Old VCF codec is deprecated. Reader goes now direct from parsing VCF lines into producing VariantContext objects, with no intermediate VCF records. If anyone can't resist the urge to still input files using the old method, a new VCF3Codec is in place with the old code, but it will be eventually deleted.
c) VCF headers and VCF info fields no longer keep track of the version. They are parsed into an internal representation and will be output only in VCF4.0 format.
d) As a consequence, the existing GATK bug where files are produced with VCF4 body but VCF3.3 headers is solved.
e) Several VCF 4.0 writer bugs are now solved.
f) Integration test MD5's are changed, mostly because of corrected VCF4.0 headers and because validation data mostly uses now VCF4.0.
g) Several VCF files in the ValidationData/ directory have been converted to VCF 4.0 format. I kept the old versions, and the new versions have a .vcf4 extension.
Pending issues:
a) We are still not dealing with indels consistently or correctly when representing them. This will be a second part of the changes.
b) The VCF writer doesn't use VCFRecord but it does still use a lot of leftovers like VCFGenotypeEncoding, VCFGenotypeRecord, etc. This needs to be simplified and cleaned.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3813 348d0f76-0448-11de-a6fe-93d51630548a
2010-07-17 06:49:16 +08:00
|
|
|
VCFCodec vcf4codec = new VCFCodec();
|
2010-06-29 07:54:38 +08:00
|
|
|
|
2010-07-02 23:19:42 +08:00
|
|
|
|
2010-06-29 07:54:38 +08:00
|
|
|
/**
|
|
|
|
|
* Initialize the number of loci processed to zero.
|
|
|
|
|
*
|
|
|
|
|
* @return 0
|
|
|
|
|
*/
|
|
|
|
|
public Integer reduceInit() { return 0; }
|
|
|
|
|
|
|
|
|
|
public void initialize() {
|
|
|
|
|
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
|
|
|
|
|
|
2010-07-02 23:19:42 +08:00
|
|
|
|
2010-06-29 07:54:38 +08:00
|
|
|
// Open output file specified by output VCF ROD
|
2010-07-02 23:19:42 +08:00
|
|
|
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
|
|
|
|
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
|
|
|
|
|
2010-06-29 07:54:38 +08:00
|
|
|
|
2010-07-14 12:56:58 +08:00
|
|
|
vcfWriter = new VCFWriter(new File(OUTPUT_FILE));
|
2010-07-02 23:19:42 +08:00
|
|
|
VCFHeader header = null;
|
2010-06-29 07:54:38 +08:00
|
|
|
for( final ReferenceOrderedDataSource source : dataSources ) {
|
|
|
|
|
final RMDTrack rod = source.getReferenceOrderedData();
|
2010-07-02 23:19:42 +08:00
|
|
|
if(rod.getName().equalsIgnoreCase(INPUT_ROD_NAME)) {
|
First step in major cleanup/redo of VCF functionality. Specifically, now:
a) VCF track name can work again with 3.3 or 4.0 VCF's when specifying -B name,VCF,file. Code will read header and parse automatically the version.
b) Old VCF codec is deprecated. Reader goes now direct from parsing VCF lines into producing VariantContext objects, with no intermediate VCF records. If anyone can't resist the urge to still input files using the old method, a new VCF3Codec is in place with the old code, but it will be eventually deleted.
c) VCF headers and VCF info fields no longer keep track of the version. They are parsed into an internal representation and will be output only in VCF4.0 format.
d) As a consequence, the existing GATK bug where files are produced with VCF4 body but VCF3.3 headers is solved.
e) Several VCF 4.0 writer bugs are now solved.
f) Integration test MD5's are changed, mostly because of corrected VCF4.0 headers and because validation data mostly uses now VCF4.0.
g) Several VCF files in the ValidationData/ directory have been converted to VCF 4.0 format. I kept the old versions, and the new versions have a .vcf4 extension.
Pending issues:
a) We are still not dealing with indels consistently or correctly when representing them. This will be a second part of the changes.
b) The VCF writer doesn't use VCFRecord but it does still use a lot of leftovers like VCFGenotypeEncoding, VCFGenotypeRecord, etc. This needs to be simplified and cleaned.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3813 348d0f76-0448-11de-a6fe-93d51630548a
2010-07-17 06:49:16 +08:00
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(rod.getFile().getAbsolutePath()));
|
2010-07-19 12:49:27 +08:00
|
|
|
header = (VCFHeader)vcf4codec.readHeader(lineReader);
|
|
|
|
|
out.printf("Read %d header lines%n", header.getMetaData().size());
|
First step in major cleanup/redo of VCF functionality. Specifically, now:
a) VCF track name can work again with 3.3 or 4.0 VCF's when specifying -B name,VCF,file. Code will read header and parse automatically the version.
b) Old VCF codec is deprecated. Reader goes now direct from parsing VCF lines into producing VariantContext objects, with no intermediate VCF records. If anyone can't resist the urge to still input files using the old method, a new VCF3Codec is in place with the old code, but it will be eventually deleted.
c) VCF headers and VCF info fields no longer keep track of the version. They are parsed into an internal representation and will be output only in VCF4.0 format.
d) As a consequence, the existing GATK bug where files are produced with VCF4 body but VCF3.3 headers is solved.
e) Several VCF 4.0 writer bugs are now solved.
f) Integration test MD5's are changed, mostly because of corrected VCF4.0 headers and because validation data mostly uses now VCF4.0.
g) Several VCF files in the ValidationData/ directory have been converted to VCF 4.0 format. I kept the old versions, and the new versions have a .vcf4 extension.
Pending issues:
a) We are still not dealing with indels consistently or correctly when representing them. This will be a second part of the changes.
b) The VCF writer doesn't use VCFRecord but it does still use a lot of leftovers like VCFGenotypeEncoding, VCFGenotypeRecord, etc. This needs to be simplified and cleaned.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3813 348d0f76-0448-11de-a6fe-93d51630548a
2010-07-17 06:49:16 +08:00
|
|
|
}
|
|
|
|
|
catch (FileNotFoundException e ) {
|
|
|
|
|
throw new StingException(e.getMessage());
|
2010-06-29 07:54:38 +08:00
|
|
|
}
|
|
|
|
|
|
First step in major cleanup/redo of VCF functionality. Specifically, now:
a) VCF track name can work again with 3.3 or 4.0 VCF's when specifying -B name,VCF,file. Code will read header and parse automatically the version.
b) Old VCF codec is deprecated. Reader goes now direct from parsing VCF lines into producing VariantContext objects, with no intermediate VCF records. If anyone can't resist the urge to still input files using the old method, a new VCF3Codec is in place with the old code, but it will be eventually deleted.
c) VCF headers and VCF info fields no longer keep track of the version. They are parsed into an internal representation and will be output only in VCF4.0 format.
d) As a consequence, the existing GATK bug where files are produced with VCF4 body but VCF3.3 headers is solved.
e) Several VCF 4.0 writer bugs are now solved.
f) Integration test MD5's are changed, mostly because of corrected VCF4.0 headers and because validation data mostly uses now VCF4.0.
g) Several VCF files in the ValidationData/ directory have been converted to VCF 4.0 format. I kept the old versions, and the new versions have a .vcf4 extension.
Pending issues:
a) We are still not dealing with indels consistently or correctly when representing them. This will be a second part of the changes.
b) The VCF writer doesn't use VCFRecord but it does still use a lot of leftovers like VCFGenotypeEncoding, VCFGenotypeRecord, etc. This needs to be simplified and cleaned.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3813 348d0f76-0448-11de-a6fe-93d51630548a
2010-07-17 06:49:16 +08:00
|
|
|
final Set<String> vcfSamples = header.getGenotypeSamples();
|
|
|
|
|
samples.addAll(vcfSamples);
|
2010-07-22 10:36:45 +08:00
|
|
|
vcfWriter.writeHeader(header);
|
|
|
|
|
|
First step in major cleanup/redo of VCF functionality. Specifically, now:
a) VCF track name can work again with 3.3 or 4.0 VCF's when specifying -B name,VCF,file. Code will read header and parse automatically the version.
b) Old VCF codec is deprecated. Reader goes now direct from parsing VCF lines into producing VariantContext objects, with no intermediate VCF records. If anyone can't resist the urge to still input files using the old method, a new VCF3Codec is in place with the old code, but it will be eventually deleted.
c) VCF headers and VCF info fields no longer keep track of the version. They are parsed into an internal representation and will be output only in VCF4.0 format.
d) As a consequence, the existing GATK bug where files are produced with VCF4 body but VCF3.3 headers is solved.
e) Several VCF 4.0 writer bugs are now solved.
f) Integration test MD5's are changed, mostly because of corrected VCF4.0 headers and because validation data mostly uses now VCF4.0.
g) Several VCF files in the ValidationData/ directory have been converted to VCF 4.0 format. I kept the old versions, and the new versions have a .vcf4 extension.
Pending issues:
a) We are still not dealing with indels consistently or correctly when representing them. This will be a second part of the changes.
b) The VCF writer doesn't use VCFRecord but it does still use a lot of leftovers like VCFGenotypeEncoding, VCFGenotypeRecord, etc. This needs to be simplified and cleaned.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3813 348d0f76-0448-11de-a6fe-93d51630548a
2010-07-17 06:49:16 +08:00
|
|
|
|
2010-07-02 23:19:42 +08:00
|
|
|
}
|
2010-06-29 07:54:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
*
|
|
|
|
|
* @param tracker the meta-data tracker
|
|
|
|
|
* @param ref the reference base
|
|
|
|
|
* @param context the context for the given locus
|
|
|
|
|
* @return 1 if the locus was successfully processed, 0 if otherwise
|
|
|
|
|
*/
|
|
|
|
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
|
|
|
|
if ( tracker == null )
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
GenomeLoc loc = context.getLocation();
|
2010-07-22 10:36:45 +08:00
|
|
|
VariantContext vc = tracker.getVariantContext(ref,INPUT_ROD_NAME, null, loc, true);
|
2010-06-29 07:54:38 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
if (vc == null)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
// Write directly variant context to VCF4.0 format.
|
2010-07-24 04:24:03 +08:00
|
|
|
vcfWriter.add(vc, ref.getBase());
|
2010-06-29 07:54:38 +08:00
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Increment the number of rods processed.
|
|
|
|
|
*
|
|
|
|
|
* @param value result of the map.
|
|
|
|
|
* @param sum accumulator for the reduce.
|
|
|
|
|
* @return the new number of rods processed.
|
|
|
|
|
*/
|
|
|
|
|
public Integer reduce(Integer value, Integer sum) {
|
|
|
|
|
return sum + value;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void onTraversalDone(Integer result) {}
|
|
|
|
|
}
|