A fix for a bug Eric found: if your first call contains fewer samples than calls at other loci, your VCFHeader was set up incorrectly.

Also moved a bunch of Lists over to Sets for consistency.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1859 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-10-16 04:57:50 +00:00
parent a69ea9b57c
commit 96972c3a5c
9 changed files with 36 additions and 23 deletions

View File

@ -98,10 +98,12 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, Genot
if ( VARIANTS_FILE != null ) if ( VARIANTS_FILE != null )
writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), VARIANTS_FILE, writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), VARIANTS_FILE,
"UnifiedGenotyper", "UnifiedGenotyper",
this.getToolkit().getArguments().referenceFile.getName()); this.getToolkit().getArguments().referenceFile.getName(),
samples);
else else
writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out, "UnifiedGenotyper", writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out, "UnifiedGenotyper",
this.getToolkit().getArguments().referenceFile.getName()); this.getToolkit().getArguments().referenceFile.getName(),
samples);
callsMetrics = new CallMetrics(); callsMetrics = new CallMetrics();
} }

View File

@ -49,7 +49,7 @@ public class VariantsToVCF extends RefWalker<Integer, Integer> {
public static VCFHeader getHeader(GATKArgumentCollection args, Set<String> sampleNames) { public static VCFHeader getHeader(GATKArgumentCollection args, Set<String> sampleNames) {
Map<String, String> metaData = new HashMap<String, String>(); Map<String, String> metaData = new HashMap<String, String>();
List<String> additionalColumns = new ArrayList<String>(); Set<String> additionalColumns = new HashSet<String>();
// Don't output the data for now because it kills our unit test MD5s and is optional // Don't output the data for now because it kills our unit test MD5s and is optional
// TODO - figure out what to do here // TODO - figure out what to do here

View File

@ -9,6 +9,8 @@ import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeWriterAdapter;
import java.io.File; import java.io.File;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.List;
import java.util.Set;
/** /**
@ -31,7 +33,12 @@ public class GenotypeWriterFactory {
* @param destination the destination file * @param destination the destination file
* @return the genotype writer object * @return the genotype writer object
*/ */
public static GenotypeWriter create(GENOTYPE_FORMAT format, SAMFileHeader header, File destination, String source, String referenceName ) { public static GenotypeWriter create(GENOTYPE_FORMAT format,
SAMFileHeader header,
File destination,
String source,
String referenceName,
Set<String> sampleNames ) {
switch (format) { switch (format) {
case GLF: case GLF:
return new GLFWriter(header.toString(), destination); return new GLFWriter(header.toString(), destination);
@ -40,20 +47,25 @@ public class GenotypeWriterFactory {
case GELI_BINARY: case GELI_BINARY:
return new GeliAdapter(destination, header); return new GeliAdapter(destination, header);
case VCF: case VCF:
return new VCFGenotypeWriterAdapter(source, referenceName, destination); return new VCFGenotypeWriterAdapter(source, referenceName, destination, sampleNames);
default: default:
throw new StingException("Genotype writer " + format.toString() + " is not implemented"); throw new StingException("Genotype writer " + format.toString() + " is not implemented");
} }
} }
public static GenotypeWriter create(GENOTYPE_FORMAT format, SAMFileHeader header, PrintStream destination, String source, String referenceName ) { public static GenotypeWriter create(GENOTYPE_FORMAT format,
SAMFileHeader header,
PrintStream destination,
String source,
String referenceName,
Set<String> sampleNames ) {
switch (format) { switch (format) {
case GELI: case GELI:
return new GeliTextWriter(destination); return new GeliTextWriter(destination);
case GLF: case GLF:
return new GLFWriter(header.toString(), destination); return new GLFWriter(header.toString(), destination);
case VCF: case VCF:
return new VCFGenotypeWriterAdapter(source, referenceName, destination); return new VCFGenotypeWriterAdapter(source, referenceName, destination, sampleNames);
default: default:
throw new StingException("Genotype writer to " + format.toString() + " to standard output is not implemented"); throw new StingException("Genotype writer to " + format.toString() + " to standard output is not implemented");
} }

View File

@ -21,23 +21,26 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
private String mSource; private String mSource;
private String mReferenceName; private String mReferenceName;
private boolean mInitialized = false; private boolean mInitialized = false;
private final Set<String> mSampleNames = new HashSet<String>();
private final File mFile; private final File mFile;
private final OutputStream mStream; private final OutputStream mStream;
public VCFGenotypeWriterAdapter(String source, String referenceName, File writeTo) { public VCFGenotypeWriterAdapter(String source, String referenceName, File writeTo, Set<String> sampleNames) {
mReferenceName = referenceName; mReferenceName = referenceName;
mSource = source; mSource = source;
mFile = writeTo; mFile = writeTo;
if (mFile == null) throw new RuntimeException("VCF output file must not be null"); if (mFile == null) throw new RuntimeException("VCF output file must not be null");
mStream = null; mStream = null;
mSampleNames.addAll(sampleNames);
} }
public VCFGenotypeWriterAdapter(String source, String referenceName, OutputStream writeTo) { public VCFGenotypeWriterAdapter(String source, String referenceName, OutputStream writeTo, Set<String> sampleNames) {
mReferenceName = referenceName; mReferenceName = referenceName;
mSource = source; mSource = source;
mFile = null; mFile = null;
mStream = writeTo; mStream = writeTo;
if (mStream == null) throw new RuntimeException("VCF output stream must not be null"); if (mStream == null) throw new RuntimeException("VCF output stream must not be null");
mSampleNames.addAll(sampleNames);
} }
@ -49,7 +52,6 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
*/ */
private void lazyInitialize(List<Genotype> genotypes, File file, OutputStream stream) { private void lazyInitialize(List<Genotype> genotypes, File file, OutputStream stream) {
Map<String, String> hInfo = new HashMap<String, String>(); Map<String, String> hInfo = new HashMap<String, String>();
List<String> sampleNames = getSampleNames(genotypes);
// setup the header fields // setup the header fields
hInfo.put("format", "VCRv3.2"); hInfo.put("format", "VCRv3.2");
@ -57,7 +59,7 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
hInfo.put("reference", mReferenceName); hInfo.put("reference", mReferenceName);
// setup the sample names // setup the sample names
mHeader = new VCFHeader(hInfo, sampleNames); mHeader = new VCFHeader(hInfo, mSampleNames);
if (mFile == null) if (mFile == null)
mWriter = new VCFWriter(mHeader, stream); mWriter = new VCFWriter(mHeader, stream);
else else

View File

@ -23,7 +23,7 @@ public class VCFHeader {
private final Map<String, String> mMetaData = new HashMap<String, String>(); private final Map<String, String> mMetaData = new HashMap<String, String>();
// the list of auxiliary tags // the list of auxiliary tags
private final List<String> mGenotypeSampleNames = new ArrayList<String>(); private final Set<String> mGenotypeSampleNames = new HashSet<String>();
// the character string that indicates meta data // the character string that indicates meta data
public static final String METADATA_INDICATOR = "##"; public static final String METADATA_INDICATOR = "##";
@ -56,7 +56,7 @@ public class VCFHeader {
* @param metaData the meta data associated with this header * @param metaData the meta data associated with this header
* @param genotypeSampleNames the genotype format field, and the sample names * @param genotypeSampleNames the genotype format field, and the sample names
*/ */
public VCFHeader(Map<String, String> metaData, List<String> genotypeSampleNames) { public VCFHeader(Map<String, String> metaData, Set<String> genotypeSampleNames) {
for (String key : metaData.keySet()) mMetaData.put(key, metaData.get(key)); for (String key : metaData.keySet()) mMetaData.put(key, metaData.get(key));
for (String col : genotypeSampleNames) { for (String col : genotypeSampleNames) {
if (!col.equals("FORMAT")) if (!col.equals("FORMAT"))
@ -107,7 +107,7 @@ public class VCFHeader {
* *
* @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false * @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false
*/ */
public List<String> getGenotypeSamples() { public Set<String> getGenotypeSamples() {
return mGenotypeSampleNames; return mGenotypeSampleNames;
} }

View File

@ -126,7 +126,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
protected VCFHeader createHeader(List<String> headerStrings) { protected VCFHeader createHeader(List<String> headerStrings) {
Map<String, String> metaData = new HashMap<String, String>(); Map<String, String> metaData = new HashMap<String, String>();
List<String> auxTags = new ArrayList<String>(); Set<String> auxTags = new HashSet<String>();
// iterate over all the passed in strings // iterate over all the passed in strings
for (String str : headerStrings) { for (String str : headerStrings) {
Matcher matcher = pMeta.matcher(str); Matcher matcher = pMeta.matcher(str);

View File

@ -19,7 +19,7 @@ public class VCFHeaderTest extends BaseTest {
private Set<VCFHeader.HEADER_FIELDS> headerFields = new LinkedHashSet<VCFHeader.HEADER_FIELDS>(); private Set<VCFHeader.HEADER_FIELDS> headerFields = new LinkedHashSet<VCFHeader.HEADER_FIELDS>();
private Map<String, String> metaData = new HashMap(); private Map<String, String> metaData = new HashMap();
private List<String> additionalColumns = new ArrayList<String>(); private Set<String> additionalColumns = new HashSet<String>();
/** /**
* give it fake data, and make sure we get back the right fake data * give it fake data, and make sure we get back the right fake data

View File

@ -4,10 +4,7 @@ import org.broadinstitute.sting.BaseTest;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
import java.util.ArrayList; import java.util.*;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/** /**
@ -126,7 +123,7 @@ public class VCFRecordTest extends BaseTest {
*/ */
public static VCFHeader createFakeHeader() { public static VCFHeader createFakeHeader() {
Map<String, String> metaData = new HashMap(); Map<String, String> metaData = new HashMap();
List<String> additionalColumns = new ArrayList<String>(); Set<String> additionalColumns = new HashSet<String>();
metaData.put("format", "VCRv3.2"); // required metaData.put("format", "VCRv3.2"); // required
metaData.put("two", "2"); metaData.put("two", "2");
additionalColumns.add("FORMAT"); additionalColumns.add("FORMAT");

View File

@ -18,7 +18,7 @@ import java.util.*;
public class VCFWriterTest extends BaseTest { public class VCFWriterTest extends BaseTest {
private Set<VCFHeader.HEADER_FIELDS> headerFields = new LinkedHashSet<VCFHeader.HEADER_FIELDS>(); private Set<VCFHeader.HEADER_FIELDS> headerFields = new LinkedHashSet<VCFHeader.HEADER_FIELDS>();
private Map<String, String> metaData = new HashMap(); private Map<String, String> metaData = new HashMap();
private List<String> additionalColumns = new ArrayList<String>(); private Set<String> additionalColumns = new HashSet<String>();
private File fakeVCFFile = new File("FAKEVCFFILEFORTESTING.vcf"); private File fakeVCFFile = new File("FAKEVCFFILEFORTESTING.vcf");
/** test, using the writer and reader, that we can output and input a VCF file without problems */ /** test, using the writer and reader, that we can output and input a VCF file without problems */
@ -45,7 +45,7 @@ public class VCFWriterTest extends BaseTest {
* create a fake header of known quantity * create a fake header of known quantity
* @return a fake VCF header * @return a fake VCF header
*/ */
public static VCFHeader createFakeHeader(Map<String, String> metaData, List<String> additionalColumns) { public static VCFHeader createFakeHeader(Map<String, String> metaData, Set<String> additionalColumns) {
metaData.put("format", "VCRv3.2"); // required metaData.put("format", "VCRv3.2"); // required
metaData.put("two", "2"); metaData.put("two", "2");
additionalColumns.add("FORMAT"); additionalColumns.add("FORMAT");