2009-09-19 04:19:34 +08:00
|
|
|
package org.broadinstitute.sting.gatk.refdata;
|
|
|
|
|
|
|
|
|
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
|
|
|
|
import org.broadinstitute.sting.utils.StingException;
|
2009-10-16 12:11:34 +08:00
|
|
|
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
2009-09-19 04:19:34 +08:00
|
|
|
import org.broadinstitute.sting.utils.genotype.Genotype;
|
2009-10-16 12:11:34 +08:00
|
|
|
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
|
2009-11-01 13:35:47 +08:00
|
|
|
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
2009-09-19 04:19:34 +08:00
|
|
|
|
|
|
|
|
import java.io.File;
|
|
|
|
|
import java.io.FileNotFoundException;
|
|
|
|
|
import java.io.IOException;
|
2009-12-20 07:22:36 +08:00
|
|
|
import java.util.*;
|
2009-09-19 04:19:34 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @author aaron
|
|
|
|
|
* <p/>
|
|
|
|
|
* Class RodVCF
|
|
|
|
|
* <p/>
|
|
|
|
|
* An implementation of the ROD for VCF.
|
|
|
|
|
*/
|
2009-09-24 02:24:05 +08:00
|
|
|
public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod, VariantBackedByGenotype, Iterator<RodVCF> {
|
2010-01-17 04:22:31 +08:00
|
|
|
public VCFReader getReader() {
|
|
|
|
|
return mReader;
|
|
|
|
|
}
|
|
|
|
|
|
2009-09-19 04:19:34 +08:00
|
|
|
// our VCF related information
|
|
|
|
|
private VCFReader mReader;
|
2010-01-17 04:22:31 +08:00
|
|
|
|
|
|
|
|
public VCFRecord getRecord() {
|
|
|
|
|
return mCurrentRecord;
|
|
|
|
|
}
|
|
|
|
|
|
2009-09-19 04:19:34 +08:00
|
|
|
public VCFRecord mCurrentRecord;
|
|
|
|
|
|
|
|
|
|
public RodVCF(String name) {
|
|
|
|
|
super(name);
|
|
|
|
|
}
|
|
|
|
|
|
2010-01-17 04:22:31 +08:00
|
|
|
public RodVCF(String name, VCFRecord currentRecord, VCFReader reader) {
|
2009-09-30 05:28:21 +08:00
|
|
|
super(name);
|
|
|
|
|
mCurrentRecord = currentRecord;
|
|
|
|
|
mReader = reader;
|
|
|
|
|
}
|
|
|
|
|
|
2009-09-19 04:19:34 +08:00
|
|
|
public void assertNotNull() {
|
2009-12-06 14:48:03 +08:00
|
|
|
if ( mCurrentRecord == null ) {
|
2009-09-19 04:19:34 +08:00
|
|
|
throw new UnsupportedOperationException("The current Record is null");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public boolean parseLine(Object header, String[] parts) throws IOException {
|
2009-12-06 14:48:03 +08:00
|
|
|
throw new UnsupportedOperationException("RodVCF does not support the parseLine method");
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public Object initialize(final File source) throws FileNotFoundException {
|
2009-12-06 14:48:03 +08:00
|
|
|
if ( mReader == null )
|
|
|
|
|
mReader = new VCFReader(source);
|
2009-09-19 04:19:34 +08:00
|
|
|
return mReader.getHeader();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public String toString() {
|
2009-12-06 14:48:03 +08:00
|
|
|
return (mCurrentRecord != null ? mCurrentRecord.toStringEncoding(mReader.getHeader()) : "");
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
2009-09-30 05:28:21 +08:00
|
|
|
public static RodVCF createIterator(String name, File file) {
|
2009-09-19 04:19:34 +08:00
|
|
|
RodVCF vcf = new RodVCF(name);
|
|
|
|
|
try {
|
|
|
|
|
vcf.initialize(file);
|
|
|
|
|
} catch (FileNotFoundException e) {
|
|
|
|
|
throw new StingException("Unable to find file " + file);
|
|
|
|
|
}
|
|
|
|
|
return vcf;
|
|
|
|
|
}
|
|
|
|
|
|
2009-11-09 12:20:35 +08:00
|
|
|
public boolean hasNonRefAlleleFrequency() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getNonRefAlleleFrequency() > 0.0;
|
2009-11-09 12:20:35 +08:00
|
|
|
}
|
|
|
|
|
|
2009-09-19 04:19:34 +08:00
|
|
|
/**
|
|
|
|
|
* get the frequency of this variant
|
|
|
|
|
*
|
|
|
|
|
* @return VariantFrequency with the stored frequency
|
|
|
|
|
*/
|
|
|
|
|
public double getNonRefAlleleFrequency() {
|
|
|
|
|
assertNotNull();
|
2009-12-06 14:48:03 +08:00
|
|
|
return mCurrentRecord.getNonRefAlleleFrequency();
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
2009-11-09 12:20:35 +08:00
|
|
|
public boolean hasStrandBias() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
2009-12-09 05:43:28 +08:00
|
|
|
return this.mCurrentRecord.getInfoValues().containsKey(VCFRecord.STRAND_BIAS_KEY);
|
2009-11-09 12:20:35 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* get the strand bias of this variant
|
|
|
|
|
*
|
|
|
|
|
* @return StrandBias with the stored slod
|
|
|
|
|
*/
|
|
|
|
|
public double getStrandBias() {
|
2009-12-09 05:43:28 +08:00
|
|
|
return hasStrandBias() ? Double.valueOf(this.mCurrentRecord.getInfoValues().get(VCFRecord.STRAND_BIAS_KEY)) : 0.0;
|
2009-11-09 12:20:35 +08:00
|
|
|
}
|
|
|
|
|
|
2009-09-19 04:19:34 +08:00
|
|
|
/** @return the VARIANT_TYPE of the current variant */
|
|
|
|
|
public VARIANT_TYPE getType() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getType();
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
2009-12-17 01:58:41 +08:00
|
|
|
public String getID() {
|
|
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getID();
|
|
|
|
|
}
|
|
|
|
|
|
2009-09-19 04:19:34 +08:00
|
|
|
/**
|
|
|
|
|
* are we a SNP? If not we're a Indel/deletion
|
|
|
|
|
*
|
|
|
|
|
* @return true if we're a SNP
|
|
|
|
|
*/
|
|
|
|
|
public boolean isSNP() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.isSNP();
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* are we an insertion?
|
|
|
|
|
*
|
|
|
|
|
* @return true if we are, false otherwise
|
|
|
|
|
*/
|
|
|
|
|
public boolean isInsertion() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.isInsertion();
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* are we an insertion?
|
|
|
|
|
*
|
|
|
|
|
* @return true if we are, false otherwise
|
|
|
|
|
*/
|
|
|
|
|
public boolean isDeletion() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.isDeletion();
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public GenomeLoc getLocation() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getLocation();
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* get the reference base(s) at this position
|
|
|
|
|
*
|
|
|
|
|
* @return the reference base or bases, as a string
|
|
|
|
|
*/
|
|
|
|
|
public String getReference() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getReference();
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/** are we bi-allelic? */
|
|
|
|
|
public boolean isBiallelic() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.isBiallelic();
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* get the -1 * (log 10 of the error value)
|
|
|
|
|
*
|
|
|
|
|
* @return the log based error estimate
|
|
|
|
|
*/
|
|
|
|
|
public double getNegLog10PError() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getNegLog10PError();
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
2009-12-17 01:58:41 +08:00
|
|
|
public double getQual() {
|
|
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getQual();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public boolean hasAlternateAllele() {
|
|
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.hasAlternateAllele();
|
|
|
|
|
}
|
|
|
|
|
|
2009-09-19 04:19:34 +08:00
|
|
|
/**
|
2009-10-23 14:31:15 +08:00
|
|
|
* gets the alternate alleles. This method should return all the alleles present at the location,
|
|
|
|
|
* NOT including the reference base. This is returned as a string list with no guarantee ordering
|
|
|
|
|
* of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest
|
|
|
|
|
* frequency).
|
2009-09-19 04:19:34 +08:00
|
|
|
*
|
2009-10-23 14:31:15 +08:00
|
|
|
* @return an alternate allele list
|
2009-09-19 04:19:34 +08:00
|
|
|
*/
|
2009-10-23 14:31:15 +08:00
|
|
|
public List<String> getAlternateAlleleList() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getAlternateAlleleList();
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
2009-10-23 14:31:15 +08:00
|
|
|
* gets the alleles. This method should return all the alleles present at the location,
|
|
|
|
|
* including the reference base. The first allele should always be the reference allele, followed
|
|
|
|
|
* by an unordered list of alternate alleles.
|
2009-09-19 04:19:34 +08:00
|
|
|
*
|
2009-10-23 14:31:15 +08:00
|
|
|
* @return an alternate allele list
|
2009-09-19 04:19:34 +08:00
|
|
|
*/
|
2009-10-23 14:31:15 +08:00
|
|
|
public List<String> getAlleleList() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getAlleleList();
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
2009-10-23 14:31:15 +08:00
|
|
|
* are we truely a variant, given a reference
|
2009-09-19 04:19:34 +08:00
|
|
|
*
|
2009-10-23 14:31:15 +08:00
|
|
|
* @return false if we're a variant(indel, delete, SNP, etc), true if we're not
|
2009-09-19 04:19:34 +08:00
|
|
|
*/
|
2009-10-23 14:31:15 +08:00
|
|
|
public boolean isReference() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.isReference();
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* are we an insertion or a deletion? yes, then return true. No? Well, false then.
|
|
|
|
|
*
|
|
|
|
|
* @return true if we're an insertion or deletion
|
|
|
|
|
*/
|
|
|
|
|
public boolean isIndel() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.isIndel();
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* gets the alternate base is the case of a SNP. Throws an IllegalStateException if we're not a SNP
|
|
|
|
|
* of
|
|
|
|
|
*
|
|
|
|
|
* @return a char, representing the alternate base
|
|
|
|
|
*/
|
|
|
|
|
public char getAlternativeBaseForSNP() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getAlternativeBaseForSNP();
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* gets the reference base is the case of a SNP. Throws an IllegalStateException if we're not a SNP
|
|
|
|
|
*
|
|
|
|
|
* @return a char, representing the alternate base
|
|
|
|
|
*/
|
|
|
|
|
public char getReferenceForSNP() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getReferenceForSNP();
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
2009-12-17 01:58:41 +08:00
|
|
|
public boolean hasGenotypeData() {
|
|
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.hasGenotypeData();
|
|
|
|
|
}
|
|
|
|
|
|
2009-09-19 06:25:16 +08:00
|
|
|
/**
|
|
|
|
|
* get the genotype
|
|
|
|
|
*
|
2009-12-20 07:22:36 +08:00
|
|
|
* // todo -- WTF is this? This is a deeply unsafe call
|
|
|
|
|
*
|
2009-09-19 06:25:16 +08:00
|
|
|
* @return a map in lexigraphical order of the genotypes
|
|
|
|
|
*/
|
2009-09-19 06:38:51 +08:00
|
|
|
public Genotype getCalledGenotype() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getCalledGenotype();
|
2009-09-19 06:25:16 +08:00
|
|
|
}
|
|
|
|
|
|
2009-09-19 04:19:34 +08:00
|
|
|
/**
|
|
|
|
|
* get the genotypes
|
|
|
|
|
*
|
|
|
|
|
* @return a list of the genotypes
|
|
|
|
|
*/
|
|
|
|
|
public List<Genotype> getGenotypes() {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getGenotypes();
|
2009-11-15 17:28:25 +08:00
|
|
|
}
|
|
|
|
|
|
2009-12-06 19:43:40 +08:00
|
|
|
/**
|
|
|
|
|
* get the genotypes
|
|
|
|
|
*
|
|
|
|
|
* @return a list of the genotypes
|
|
|
|
|
*/
|
|
|
|
|
public List<VCFGenotypeRecord> getVCFGenotypeRecords() {
|
|
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getVCFGenotypeRecords();
|
|
|
|
|
}
|
|
|
|
|
|
2010-01-06 23:03:07 +08:00
|
|
|
/**
|
|
|
|
|
* Returns the genotype associated with sample, or null if the genotype is missing
|
|
|
|
|
*
|
|
|
|
|
* @param sampleName the name of the sample genotype to fetch
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
|
|
|
|
public Genotype getGenotype(final String sampleName) {
|
|
|
|
|
return mCurrentRecord.getGenotype(sampleName);
|
|
|
|
|
}
|
|
|
|
|
|
2009-09-19 04:19:34 +08:00
|
|
|
/**
|
|
|
|
|
* do we have the specified genotype? not all backedByGenotypes
|
|
|
|
|
* have all the genotype data.
|
|
|
|
|
*
|
|
|
|
|
* @param x the genotype
|
|
|
|
|
*
|
|
|
|
|
* @return true if available, false otherwise
|
|
|
|
|
*/
|
|
|
|
|
public boolean hasGenotype(DiploidGenotype x) {
|
2009-12-06 14:48:03 +08:00
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.hasGenotype(x);
|
|
|
|
|
}
|
|
|
|
|
|
2009-12-17 01:58:41 +08:00
|
|
|
public String[] getSampleNames() {
|
|
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getSampleNames();
|
|
|
|
|
}
|
|
|
|
|
|
2009-12-20 07:22:36 +08:00
|
|
|
// public Map<String, Genotype> getSampleGenotypes() {
|
|
|
|
|
// String[] samples = getSampleNames();
|
|
|
|
|
// List<Genotype> genotypes = getGenotypes();
|
|
|
|
|
// HashMap<String, Genotype> map = new HashMap<String, Genotype>();
|
|
|
|
|
//
|
|
|
|
|
// for ( int i = 0; i < samples.length; i++ ) {
|
|
|
|
|
// map.put(samples[i], genotypes.get(i));
|
|
|
|
|
// }
|
|
|
|
|
//
|
|
|
|
|
// return map;
|
|
|
|
|
// }
|
|
|
|
|
|
2009-12-17 01:58:41 +08:00
|
|
|
public Map<String, String> getInfoValues() {
|
|
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getInfoValues();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String[] getFilteringCodes() {
|
|
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getFilteringCodes();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public boolean isFiltered() {
|
|
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.isFiltered();
|
|
|
|
|
}
|
|
|
|
|
|
2010-01-07 22:50:13 +08:00
|
|
|
// public boolean hasFilteringCodes() {
|
|
|
|
|
// assertNotNull();
|
|
|
|
|
// return mCurrentRecord.hasFilteringCodes();
|
|
|
|
|
// }
|
2009-12-17 01:58:41 +08:00
|
|
|
|
|
|
|
|
public String getFilterString() {
|
|
|
|
|
assertNotNull();
|
|
|
|
|
return mCurrentRecord.getFilterString();
|
|
|
|
|
}
|
|
|
|
|
|
2009-12-06 14:48:03 +08:00
|
|
|
public VCFHeader getHeader() {
|
|
|
|
|
return mReader.getHeader();
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public boolean hasNext() {
|
2009-12-06 14:48:03 +08:00
|
|
|
return mReader.hasNext();
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
2009-12-16 07:04:40 +08:00
|
|
|
/**
|
|
|
|
|
* @return the next element in the iteration.
|
|
|
|
|
* @throws NoSuchElementException - iterator has no more elements.
|
|
|
|
|
*/
|
2009-09-19 04:19:34 +08:00
|
|
|
public RodVCF next() {
|
2009-12-16 07:04:40 +08:00
|
|
|
if (!this.hasNext()) throw new NoSuchElementException("RodVCF next called on iterator with no more elements");
|
2009-12-18 23:24:45 +08:00
|
|
|
|
|
|
|
|
// get the next record
|
|
|
|
|
VCFRecord rec = mReader.next();
|
|
|
|
|
|
|
|
|
|
// make sure the next VCF record isn't before the current record (we'll accept at the same location, the
|
|
|
|
|
// spec doesn't indicate, and it seems like a valid use case)
|
|
|
|
|
GenomeLoc curPosition = null;
|
|
|
|
|
if (mCurrentRecord != null) curPosition = mCurrentRecord.getLocation();
|
|
|
|
|
if (curPosition != null && rec != null && curPosition.compareTo(rec.getLocation()) > 0)
|
|
|
|
|
throw new StingException("The next VCF record appears to be before the current (current location => " + curPosition.toString() +
|
|
|
|
|
", the next record position => " + rec.getLocation().toString() + " with line : " + rec.toStringEncoding(mReader.getHeader()) + "). " +
|
|
|
|
|
"Check to make sure the input VCF file is correctly sorted.");
|
|
|
|
|
|
|
|
|
|
// save off the previous record. This is needed given how iterators are used in the ROD system;
|
|
|
|
|
// we need to save off the last record
|
|
|
|
|
mCurrentRecord = rec;
|
|
|
|
|
return new RodVCF(name, rec, mReader);
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void remove() {
|
2009-12-06 14:48:03 +08:00
|
|
|
throw new UnsupportedOperationException("The remove operation is not supported for a VCF rod");
|
2009-09-19 04:19:34 +08:00
|
|
|
}
|
|
|
|
|
}
|