VCF output now emits no calls as ./.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1863 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-10-16 18:51:31 +00:00
parent d1a4cd2f73
commit 8aacc43203
4 changed files with 175 additions and 59 deletions

View File

@ -90,7 +90,10 @@ public class VCFGenotypeRecord {
String str = "";
boolean first = true;
for (VCFGenotypeEncoding allele : mGenotypeAlleles) {
str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0);
if (allele.getType() == VCFGenotypeEncoding.TYPE.UNCALLED)
str += VCFGenotypeRecord.EMPTY_GENOTYPE;
else
str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0);
if (first) {
switch (mPhaseType) {
case UNPHASED:

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.utils.genotype.vcf;
import org.broadinstitute.sting.utils.genotype.*;
import org.apache.log4j.Logger;
import java.io.File;
import java.io.OutputStream;
@ -25,6 +26,10 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
private final File mFile;
private final OutputStream mStream;
/** our log, which we want to capture anything from this class */
protected static Logger logger = Logger.getLogger(VCFGenotypeWriterAdapter.class);
public VCFGenotypeWriterAdapter(String source, String referenceName, File writeTo, Set<String> sampleNames) {
mReferenceName = referenceName;
mSource = source;
@ -125,12 +130,32 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
VCFParameters params = new VCFParameters();
params.addFormatItem("GT");
for (Genotype gtype : genotypes) {
// setup the parameters
params.setLocations(gtype.getLocation(), gtype.getReference());
// mapping of our sample names to genotypes
if (genotypes.size() < 1) {
throw new IllegalArgumentException("Unable to parse out the current location: genotype array must at least contain one entry");
}
VCFGenotypeRecord record = createVCFGenotypeRecord(params, gtype);
params.addGenotypeRecord(record);
// get the location and reference
params.setLocations(genotypes.get(0).getLocation(), genotypes.get(0).getReference());
Map<String, Genotype> genotypeMap = genotypeListToSampleNameMap(genotypes);
for (String name : mHeader.getGenotypeSamples()) {
if (genotypeMap.containsKey(name)) {
Genotype gtype = genotypeMap.get(name);
VCFGenotypeRecord record = createVCFGenotypeRecord(params, gtype);
params.addGenotypeRecord(record);
genotypeMap.remove(name);
} else {
VCFGenotypeRecord record = createNoCallRecord(params, name);
params.addGenotypeRecord(record);
}
}
if (genotypeMap.size() > 0) {
for (String name : genotypeMap.keySet())
logger.fatal("Genotype " + name + " was present in the VCFHeader");
throw new IllegalArgumentException("Genotype array passed to VCFGenotypeWriterAdapter contained Genotypes not in the VCF header");
}
Map<String, String> infoFields = getInfoFields(metadata, params);
@ -213,6 +238,29 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
return record;
}
/**
* create a no call record
*
* @param params the VCF parameters object
* @param sampleName the sample name
*
* @return a VCFGenotypeRecord for the no call situation
*/
private VCFGenotypeRecord createNoCallRecord(VCFParameters params, String sampleName) {
Map<String, String> map = new HashMap<String, String>();
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
alleles.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_GENOTYPE));
alleles.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_GENOTYPE));
VCFGenotypeRecord record = new VCFGenotypeRecord(sampleName,
alleles,
VCFGenotypeRecord.PHASE.UNPHASED,
map);
return record;
}
/**
* create the allele array?
*
@ -234,4 +282,21 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
return true;
}
/**
* create a genotype mapping from a list and their sample names
*
* @param list a list of genotype samples
*
* @return a mapping of the sample name to genotype fields
*/
private static Map<String, Genotype> genotypeListToSampleNameMap(List<Genotype> list) {
Map<String, Genotype> map = new HashMap<String, Genotype>();
for (Genotype rec : list) {
if (!(rec instanceof SampleBacked))
throw new IllegalArgumentException("Genotype must be backed by sample information");
map.put(((SampleBacked) rec).getSampleName(), rec);
}
return map;
}
}

View File

@ -25,7 +25,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
private static Pattern pMeta = Pattern.compile("^" + VCFHeader.METADATA_INDICATOR + "\\s*(\\S+)\\s*=\\s*(\\S+)\\s*$");
// our pattern matching for the genotype mFields
private static final Pattern gtPattern = Pattern.compile("([0-9]+)([\\\\|\\/])([0-9]*)");
private static final Pattern gtPattern = Pattern.compile("([0-9\\.]+)([\\\\|\\/])([0-9\\.]*)");
/**
* Create a VCF reader, given a VCF file
@ -48,7 +48,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
lines.add(line);
line = mReader.readLine();
}
mHeader = this.createHeader(lines);
mHeader = this.createHeader(lines);
mNextRecord = createRecord(line, mHeader);
} catch (IOException e) {
throw new RuntimeException("VCFReader: Failed to parse VCF File on line: " + line, e);
@ -236,8 +236,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
phase = VCFGenotypeRecord.determinePhase(m.group(2));
addAllele(m.group(1), altAlleles, referenceBase, bases);
if (m.group(3).length() > 0) addAllele(m.group(3), altAlleles, referenceBase, bases);
}
else {
} else {
tagToValue.put(key, parse);
}
if (nextDivider + 1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1;
@ -263,14 +262,18 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
* @param bases the list of bases for this genotype call
*/
private static void addAllele(String alleleNumber, String[] altAlleles, char referenceBase, List<VCFGenotypeEncoding> bases) {
int alleleValue = Integer.valueOf(alleleNumber);
// check to make sure the allele value is within bounds
if (alleleValue < 0 || alleleValue > altAlleles.length)
throw new IllegalArgumentException("VCFReader: the allele value of " + alleleValue + " is out of bounds given the alternate allele list.");
if (alleleValue == 0)
bases.add(new VCFGenotypeEncoding(String.valueOf(referenceBase)));
else
bases.add(new VCFGenotypeEncoding(altAlleles[alleleValue - 1]));
if (alleleNumber.equals(VCFGenotypeRecord.EMPTY_GENOTYPE)) {
bases.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_GENOTYPE));
} else {
int alleleValue = Integer.valueOf(alleleNumber);
// check to make sure the allele value is within bounds
if (alleleValue < 0 || alleleValue > altAlleles.length)
throw new IllegalArgumentException("VCFReader: the allele value of " + alleleValue + " is out of bounds given the alternate allele list.");
if (alleleValue == 0)
bases.add(new VCFGenotypeEncoding(String.valueOf(referenceBase)));
else
bases.add(new VCFGenotypeEncoding(altAlleles[alleleValue - 1]));
}
}

View File

@ -1,91 +1,136 @@
package org.broadinstitute.sting.utils.genotype.vcf;
import org.junit.Test;
import org.junit.Assert;
import org.broadinstitute.sting.BaseTest;
import org.junit.Assert;
import org.junit.Test;
import java.io.File;
import java.util.List;
import java.util.Map;
/**
* test the VCFReader class test
*/
/** test the VCFReader class test */
public class VCFReaderTest extends BaseTest {
private static File vcfFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample.vcf");
private static File multiSampleVCF = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/MultiSample.vcf");
@Test
public void testVCFInput() {
try {
Thread.sleep(5000);
} catch (InterruptedException e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
}
VCFReader reader = new VCFReader(vcfFile);
int counter = 0;
while (reader.hasNext()) {
counter++;
reader.next();
}
Assert.assertEquals(5,counter);
Assert.assertEquals(5, counter);
}
/**
* test the basic parsing
*/
@Test
public void testMultiSampleVCFInput() {
VCFReader reader = new VCFReader(multiSampleVCF);
int counter = 0;
while (reader.hasNext()) {
counter++;
reader.next();
}
Assert.assertEquals(99, counter);
}
@Test
public void testNoCallSites() {
VCFReader reader = new VCFReader(multiSampleVCF);
if (!reader.hasNext()) Assert.fail("The reader should have a record");
VCFRecord rec = reader.next();
final int numberOfNoCallsTruth = 9;
int noCalls = 0;
for (VCFGenotypeRecord record : rec.getVCFGenotypeRecords()) {
List<VCFGenotypeEncoding> encodings = record.getAlleles();
if (encodings.get(0).getType() == VCFGenotypeEncoding.TYPE.UNCALLED &&
encodings.get(1).getType() == VCFGenotypeEncoding.TYPE.UNCALLED)
noCalls++;
}
Assert.assertEquals(numberOfNoCallsTruth, noCalls);
}
@Test
public void testKnownCallSites() {
VCFReader reader = new VCFReader(multiSampleVCF);
if (!reader.hasNext()) Assert.fail("The reader should have a record");
VCFRecord rec = reader.next();
boolean seenNA11992 = false;
boolean seenNA12287 = false;
for (VCFGenotypeRecord record : rec.getVCFGenotypeRecords()) {
if (record.getSampleName().equals("NA11992")) {
List<VCFGenotypeEncoding> encodings = record.getAlleles();
if (!encodings.get(0).getBases().equals("A") ||
!encodings.get(1).getBases().equals("A")) {
Assert.fail("Sample NA11992 at site 1:10000005 should be AA");
}
seenNA11992 = true;
}
if (record.getSampleName().equals("NA12287")) {
List<VCFGenotypeEncoding> encodings = record.getAlleles();
if (!encodings.get(0).getBases().equals("A") ||
!encodings.get(1).getBases().equals("T")) {
Assert.fail("Sample NA11992 at site 1:10000005 should be AA");
}
seenNA12287 = true;
}
}
Assert.assertTrue(seenNA11992 && seenNA12287);
}
/** test the basic parsing */
@Test
public void testBasicParsing() {
String formatString = "GT:B:C:D";
String genotypeString = "0|1:2:3:4";
String altAlleles[] = {"A","G","T"};
String altAlleles[] = {"A", "G", "T"};
char referenceBase = 'C';
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase);
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType());
Assert.assertEquals("C",rec.getAlleles().get(0).toString());
Assert.assertEquals("A",rec.getAlleles().get(1).toString());
Map<String,String> values = rec.getFields();
Assert.assertEquals(3,values.size());
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test", formatString, genotypeString, altAlleles, referenceBase);
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED, rec.getPhaseType());
Assert.assertEquals("C", rec.getAlleles().get(0).toString());
Assert.assertEquals("A", rec.getAlleles().get(1).toString());
Map<String, String> values = rec.getFields();
Assert.assertEquals(3, values.size());
Assert.assertTrue(values.get("B").equals("2"));
Assert.assertTrue(values.get("C").equals("3"));
Assert.assertTrue(values.get("D").equals("4"));
}
/**
* test the parsing of a genotype field with missing parameters
*/
/** test the parsing of a genotype field with missing parameters */
@Test
public void testMissingFieldParsing() {
String formatString = "GT:B:C:D";
String genotypeString = "0|1:::4";
String altAlleles[] = {"A","G","T"};
String altAlleles[] = {"A", "G", "T"};
char referenceBase = 'C';
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase);
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType());
Assert.assertEquals("C",rec.getAlleles().get(0).toString());
Assert.assertEquals("A",rec.getAlleles().get(1).toString());
Map<String,String> values = rec.getFields();
Assert.assertEquals(3,values.size());
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test", formatString, genotypeString, altAlleles, referenceBase);
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED, rec.getPhaseType());
Assert.assertEquals("C", rec.getAlleles().get(0).toString());
Assert.assertEquals("A", rec.getAlleles().get(1).toString());
Map<String, String> values = rec.getFields();
Assert.assertEquals(3, values.size());
Assert.assertTrue(values.get("B").equals(""));
Assert.assertTrue(values.get("C").equals(""));
Assert.assertTrue(values.get("D").equals("4"));
}
/**
* test the parsing of a genotype field with different missing parameters
*/
/** test the parsing of a genotype field with different missing parameters */
@Test
public void testMissingAllFields() {
String formatString = "GT:B:C:D";
String genotypeString = "0|1:::";
String altAlleles[] = {"A","G","T"};
String altAlleles[] = {"A", "G", "T"};
char referenceBase = 'C';
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase);
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType());
Assert.assertEquals("C",rec.getAlleles().get(0).toString());
Assert.assertEquals("A",rec.getAlleles().get(1).toString());
Map<String,String> values = rec.getFields();
Assert.assertEquals(3,values.size());
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test", formatString, genotypeString, altAlleles, referenceBase);
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED, rec.getPhaseType());
Assert.assertEquals("C", rec.getAlleles().get(0).toString());
Assert.assertEquals("A", rec.getAlleles().get(1).toString());
Map<String, String> values = rec.getFields();
Assert.assertEquals(3, values.size());
Assert.assertTrue(values.get("B").equals(""));
Assert.assertTrue(values.get("C").equals(""));
Assert.assertTrue(values.get("D").equals(""));