parent
085588cb04
commit
0799855479
|
|
@ -1,253 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2011, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.broadinstitute.sting.utils.gcf;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter;
|
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
|
||||||
|
|
||||||
import java.io.*;
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* GATK binary VCF record
|
|
||||||
*
|
|
||||||
* @author Your Name
|
|
||||||
* @since Date created
|
|
||||||
*/
|
|
||||||
public class GCF {
|
|
||||||
private final static int RECORD_TERMINATOR = 123456789;
|
|
||||||
private int chromOffset;
|
|
||||||
private int start, stop;
|
|
||||||
private String id;
|
|
||||||
private List<Allele> alleleMap;
|
|
||||||
private int alleleOffsets[];
|
|
||||||
private float qual;
|
|
||||||
private byte refPad;
|
|
||||||
private String info;
|
|
||||||
private int filterOffset;
|
|
||||||
|
|
||||||
private List<GCFGenotype> genotypes = Collections.emptyList();
|
|
||||||
|
|
||||||
public GCF(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc, boolean skipGenotypes) {
|
|
||||||
chromOffset = GCFHeaderBuilder.encodeString(vc.getChr());
|
|
||||||
start = vc.getStart();
|
|
||||||
stop = vc.getEnd();
|
|
||||||
refPad = vc.hasReferenceBaseForIndel() ? vc.getReferenceBaseForIndel() : 0;
|
|
||||||
id = vc.getID();
|
|
||||||
|
|
||||||
// encode alleles
|
|
||||||
alleleMap = new ArrayList<Allele>(vc.getNAlleles());
|
|
||||||
alleleOffsets = new int[vc.getNAlleles()];
|
|
||||||
alleleMap.add(vc.getReference());
|
|
||||||
alleleOffsets[0] = GCFHeaderBuilder.encodeAllele(vc.getReference());
|
|
||||||
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
|
|
||||||
alleleMap.add(vc.getAlternateAllele(i));
|
|
||||||
alleleOffsets[i+1] = GCFHeaderBuilder.encodeAllele(vc.getAlternateAllele(i));
|
|
||||||
}
|
|
||||||
|
|
||||||
qual = (float)vc.getLog10PError(); //qualToByte(vc.getPhredScaledQual());
|
|
||||||
info = infoFieldString(vc, GCFHeaderBuilder);
|
|
||||||
filterOffset = GCFHeaderBuilder.encodeString(StandardVCFWriter.getFilterString(vc));
|
|
||||||
|
|
||||||
if ( ! skipGenotypes ) {
|
|
||||||
genotypes = encodeGenotypes(GCFHeaderBuilder, vc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public GCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException, EOFException {
|
|
||||||
chromOffset = inputStream.readInt();
|
|
||||||
|
|
||||||
// have we reached the footer?
|
|
||||||
if ( chromOffset == GCFHeader.FOOTER_START_MARKER )
|
|
||||||
throw new EOFException();
|
|
||||||
|
|
||||||
start = inputStream.readInt();
|
|
||||||
stop = inputStream.readInt();
|
|
||||||
id = inputStream.readUTF();
|
|
||||||
refPad = inputStream.readByte();
|
|
||||||
alleleOffsets = readIntArray(inputStream);
|
|
||||||
qual = inputStream.readFloat();
|
|
||||||
info = inputStream.readUTF();
|
|
||||||
filterOffset = inputStream.readInt();
|
|
||||||
|
|
||||||
int nGenotypes = inputStream.readInt();
|
|
||||||
int sizeOfGenotypes = inputStream.readInt();
|
|
||||||
if ( skipGenotypes ) {
|
|
||||||
genotypes = Collections.emptyList();
|
|
||||||
inputStream.skipBytes(sizeOfGenotypes);
|
|
||||||
} else {
|
|
||||||
genotypes = new ArrayList<GCFGenotype>(nGenotypes);
|
|
||||||
for ( int i = 0; i < nGenotypes; i++ )
|
|
||||||
genotypes.add(new GCFGenotype(this, inputStream));
|
|
||||||
}
|
|
||||||
|
|
||||||
int recordDone = inputStream.readInt();
|
|
||||||
if ( recordDone != RECORD_TERMINATOR )
|
|
||||||
throw new UserException.MalformedFile("Record not terminated by RECORD_TERMINATOR key");
|
|
||||||
}
|
|
||||||
|
|
||||||
public int write(DataOutputStream outputStream) throws IOException {
|
|
||||||
int startSize = outputStream.size();
|
|
||||||
outputStream.writeInt(chromOffset);
|
|
||||||
outputStream.writeInt(start);
|
|
||||||
outputStream.writeInt(stop);
|
|
||||||
outputStream.writeUTF(id);
|
|
||||||
outputStream.writeByte(refPad);
|
|
||||||
writeIntArray(alleleOffsets, outputStream, true);
|
|
||||||
outputStream.writeFloat(qual);
|
|
||||||
outputStream.writeUTF(info);
|
|
||||||
outputStream.writeInt(filterOffset);
|
|
||||||
|
|
||||||
int nGenotypes = genotypes.size();
|
|
||||||
int expectedSizeOfGenotypes = nGenotypes == 0 ? 0 : genotypes.get(0).sizeInBytes() * nGenotypes;
|
|
||||||
outputStream.writeInt(nGenotypes);
|
|
||||||
outputStream.writeInt(expectedSizeOfGenotypes);
|
|
||||||
int obsSizeOfGenotypes = 0;
|
|
||||||
for ( GCFGenotype g : genotypes )
|
|
||||||
obsSizeOfGenotypes += g.write(outputStream);
|
|
||||||
if ( obsSizeOfGenotypes != expectedSizeOfGenotypes )
|
|
||||||
throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes);
|
|
||||||
|
|
||||||
outputStream.writeInt(RECORD_TERMINATOR);
|
|
||||||
return outputStream.size() - startSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
public VariantContext decode(final String source, final GCFHeader header) {
|
|
||||||
final String contig = header.getString(chromOffset);
|
|
||||||
alleleMap = header.getAlleles(alleleOffsets);
|
|
||||||
|
|
||||||
VariantContextBuilder builder = new VariantContextBuilder(source, contig, start, stop, alleleMap);
|
|
||||||
builder.genotypes(decodeGenotypes(header));
|
|
||||||
builder.log10PError(qual);
|
|
||||||
builder.filters(header.getFilters(filterOffset));
|
|
||||||
builder.attribute("INFO", info);
|
|
||||||
builder.referenceBaseForIndel(refPad == 0 ? null : refPad);
|
|
||||||
return builder.make();
|
|
||||||
}
|
|
||||||
|
|
||||||
private GenotypesContext decodeGenotypes(final GCFHeader header) {
|
|
||||||
if ( genotypes.isEmpty() )
|
|
||||||
return VariantContext.NO_GENOTYPES;
|
|
||||||
else {
|
|
||||||
GenotypesContext map = GenotypesContext.create(genotypes.size());
|
|
||||||
|
|
||||||
for ( int i = 0; i < genotypes.size(); i++ ) {
|
|
||||||
final String sampleName = header.getSample(i);
|
|
||||||
final Genotype g = genotypes.get(i).decode(sampleName, header, this, alleleMap);
|
|
||||||
map.add(g);
|
|
||||||
}
|
|
||||||
|
|
||||||
return map;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<GCFGenotype> encodeGenotypes(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc) {
|
|
||||||
int nGenotypes = vc.getNSamples();
|
|
||||||
if ( nGenotypes > 0 ) {
|
|
||||||
List<GCFGenotype> genotypes = new ArrayList<GCFGenotype>(nGenotypes);
|
|
||||||
for ( int i = 0; i < nGenotypes; i++ ) genotypes.add(null);
|
|
||||||
|
|
||||||
for ( Genotype g : vc.getGenotypes() ) {
|
|
||||||
int i = GCFHeaderBuilder.encodeSample(g.getSampleName());
|
|
||||||
genotypes.set(i, new GCFGenotype(GCFHeaderBuilder, alleleMap, g));
|
|
||||||
}
|
|
||||||
|
|
||||||
return genotypes;
|
|
||||||
} else {
|
|
||||||
return Collections.emptyList();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getNAlleles() { return alleleOffsets.length; }
|
|
||||||
|
|
||||||
|
|
||||||
private final String infoFieldString(VariantContext vc, final GCFHeaderBuilder GCFHeaderBuilder) {
|
|
||||||
StringBuilder s = new StringBuilder();
|
|
||||||
|
|
||||||
boolean first = true;
|
|
||||||
for ( Map.Entry<String, Object> field : vc.getAttributes().entrySet() ) {
|
|
||||||
String key = field.getKey();
|
|
||||||
int stringIndex = GCFHeaderBuilder.encodeString(key);
|
|
||||||
String outputValue = StandardVCFWriter.formatVCFField(field.getValue());
|
|
||||||
if ( outputValue != null ) {
|
|
||||||
if ( ! first ) s.append(";");
|
|
||||||
s.append(stringIndex).append("=").append(outputValue);
|
|
||||||
first = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return s.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
protected final static int BUFFER_SIZE = 1048576; // 2**20
|
|
||||||
|
|
||||||
public static DataInputStream createDataInputStream(final InputStream stream) {
|
|
||||||
return new DataInputStream(new BufferedInputStream(stream, BUFFER_SIZE));
|
|
||||||
}
|
|
||||||
|
|
||||||
public static FileInputStream createFileInputStream(final File file) throws FileNotFoundException {
|
|
||||||
return new FileInputStream(file);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected final static int[] readIntArray(final DataInputStream inputStream) throws IOException {
|
|
||||||
return readIntArray(inputStream, inputStream.readInt());
|
|
||||||
}
|
|
||||||
|
|
||||||
protected final static int[] readIntArray(final DataInputStream inputStream, int size) throws IOException {
|
|
||||||
int[] array = new int[size];
|
|
||||||
for ( int i = 0; i < array.length; i++ )
|
|
||||||
array[i] = inputStream.readInt();
|
|
||||||
return array;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected final static void writeIntArray(int[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException {
|
|
||||||
if ( writeSize ) outputStream.writeInt(array.length);
|
|
||||||
for ( int i : array )
|
|
||||||
outputStream.writeInt(i);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected final static byte[] readByteArray(final DataInputStream inputStream) throws IOException {
|
|
||||||
return readByteArray(inputStream, inputStream.readInt());
|
|
||||||
}
|
|
||||||
|
|
||||||
protected final static byte[] readByteArray(final DataInputStream inputStream, int size) throws IOException {
|
|
||||||
byte[] array = new byte[size];
|
|
||||||
for ( int i = 0; i < array.length; i++ )
|
|
||||||
array[i] = inputStream.readByte();
|
|
||||||
return array;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected final static void writeByteArray(byte[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException {
|
|
||||||
if ( writeSize ) outputStream.writeInt(array.length);
|
|
||||||
for ( byte i : array )
|
|
||||||
outputStream.writeByte(i);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected final static byte qualToByte(double phredScaledQual) {
|
|
||||||
return (byte)Math.round(Math.min(phredScaledQual, 255));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,147 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2011, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.broadinstitute.sting.utils.gcf;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
|
||||||
|
|
||||||
import java.io.DataInputStream;
|
|
||||||
import java.io.DataOutputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* GATK binary VCF record
|
|
||||||
*
|
|
||||||
* @author Your Name
|
|
||||||
* @since Date created
|
|
||||||
*/
|
|
||||||
public class GCFGenotype {
|
|
||||||
private byte gq;
|
|
||||||
private int gt;
|
|
||||||
private int dp;
|
|
||||||
private int ad[];
|
|
||||||
private byte[] pl;
|
|
||||||
|
|
||||||
// todo -- what to do about phasing? Perhaps we shouldn't support it
|
|
||||||
// todo -- is the FL field generic or just a flag? Should we even support per sample filtering?
|
|
||||||
|
|
||||||
public GCFGenotype(final GCFHeaderBuilder GCFHeaderBuilder, final List<Allele> allAlleles, Genotype genotype) {
|
|
||||||
gq = GCF.qualToByte(genotype.getPhredScaledQual());
|
|
||||||
gt = encodeAlleles(genotype.getAlleles(), allAlleles);
|
|
||||||
|
|
||||||
dp = genotype.getAttributeAsInt("DP", 0);
|
|
||||||
|
|
||||||
int nAlleles = allAlleles.size();
|
|
||||||
ad = new int[nAlleles];
|
|
||||||
|
|
||||||
int npls = nAllelesToNPls(nAlleles);
|
|
||||||
pl = new byte[npls];
|
|
||||||
}
|
|
||||||
|
|
||||||
private int nAllelesToNPls( int nAlleles ) {
|
|
||||||
return nAlleles*(nAlleles+1) / 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
public GCFGenotype(GCF GCF, DataInputStream inputStream) throws IOException {
|
|
||||||
int gqInt = inputStream.readUnsignedByte();
|
|
||||||
gq = (byte)gqInt;
|
|
||||||
gt = inputStream.readInt();
|
|
||||||
dp = inputStream.readInt();
|
|
||||||
ad = GCF.readIntArray(inputStream, GCF.getNAlleles());
|
|
||||||
pl = GCF.readByteArray(inputStream, nAllelesToNPls(GCF.getNAlleles()));
|
|
||||||
}
|
|
||||||
|
|
||||||
// 2 alleles => 1 + 8 + 8 + 3 => 20
|
|
||||||
protected int sizeInBytes() {
|
|
||||||
return 1 // gq
|
|
||||||
+ 4 * 2 // gt + dp
|
|
||||||
+ 4 * ad.length // ad
|
|
||||||
+ 1 * pl.length; // pl
|
|
||||||
}
|
|
||||||
|
|
||||||
public Genotype decode(final String sampleName, final GCFHeader header, GCF GCF, List<Allele> alleleIndex) {
|
|
||||||
final List<Allele> alleles = decodeAlleles(gt, alleleIndex);
|
|
||||||
final double log10PError = gq / -10.0;
|
|
||||||
final Set<String> filters = Collections.emptySet();
|
|
||||||
final Map<String, Object> attributes = new HashMap<String, Object>();
|
|
||||||
attributes.put("DP", dp);
|
|
||||||
attributes.put("AD", ad);
|
|
||||||
attributes.put("PL", pl);
|
|
||||||
|
|
||||||
return new Genotype(sampleName, alleles, log10PError, filters, attributes, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static int encodeAlleles(List<Allele> gtList, List<Allele> allAlleles) {
|
|
||||||
final int nAlleles = gtList.size();
|
|
||||||
if ( nAlleles > 4 )
|
|
||||||
throw new IllegalArgumentException("encodeAlleles doesn't support more than 4 alt alleles, but I saw " + gtList);
|
|
||||||
|
|
||||||
int gtInt = 0;
|
|
||||||
for ( int i = 0; i < nAlleles ; i++ ) {
|
|
||||||
final int bitOffset = i * 8;
|
|
||||||
final int allelei = getAlleleIndex(gtList.get(i), allAlleles);
|
|
||||||
final int gti = (allelei + 1) << bitOffset;
|
|
||||||
gtInt = gtInt | gti;
|
|
||||||
}
|
|
||||||
|
|
||||||
return gtInt;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static int getAlleleIndex(Allele q, List<Allele> allAlleles) {
|
|
||||||
if ( q.isNoCall() )
|
|
||||||
return 254;
|
|
||||||
for ( int i = 0; i < allAlleles.size(); i++ )
|
|
||||||
if ( q.equals(allAlleles.get(i)) )
|
|
||||||
return i;
|
|
||||||
throw new IllegalStateException("getAlleleIndex passed allele not in map! allele " + q + " allAlleles " + allAlleles);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static List<Allele> decodeAlleles(int gtInt, List<Allele> alleleIndex) {
|
|
||||||
List<Allele> alleles = new ArrayList<Allele>(4);
|
|
||||||
|
|
||||||
for ( int i = 0; i < 32; i += 8 ) {
|
|
||||||
final int gi = (gtInt & (0x000000FF << i)) >> i;
|
|
||||||
if ( gi != 0 ) {
|
|
||||||
final int allelei = gi - 1;
|
|
||||||
alleles.add( allelei == 254 ? Allele.NO_CALL : alleleIndex.get(allelei) );
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return alleles;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int write(DataOutputStream outputStream) throws IOException {
|
|
||||||
int startSize = outputStream.size();
|
|
||||||
outputStream.writeByte(gq);
|
|
||||||
outputStream.writeInt(gt);
|
|
||||||
outputStream.writeInt(dp);
|
|
||||||
GCF.writeIntArray(ad, outputStream, false);
|
|
||||||
GCF.writeByteArray(pl, outputStream, false);
|
|
||||||
return outputStream.size() - startSize;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,205 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2011, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.broadinstitute.sting.utils.gcf;
|
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
|
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
|
||||||
|
|
||||||
import java.io.*;
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* [Short one sentence description of this walker]
|
|
||||||
* <p/>
|
|
||||||
* <p>
|
|
||||||
* [Functionality of this walker]
|
|
||||||
* </p>
|
|
||||||
* <p/>
|
|
||||||
* <h2>Input</h2>
|
|
||||||
* <p>
|
|
||||||
* [Input description]
|
|
||||||
* </p>
|
|
||||||
* <p/>
|
|
||||||
* <h2>Output</h2>
|
|
||||||
* <p>
|
|
||||||
* [Output description]
|
|
||||||
* </p>
|
|
||||||
* <p/>
|
|
||||||
* <h2>Examples</h2>
|
|
||||||
* <pre>
|
|
||||||
* java
|
|
||||||
* -jar GenomeAnalysisTK.jar
|
|
||||||
* -T $WalkerName
|
|
||||||
* </pre>
|
|
||||||
*
|
|
||||||
* @author Your Name
|
|
||||||
* @since Date created
|
|
||||||
*/
|
|
||||||
public class GCFHeader {
|
|
||||||
final protected static Logger logger = Logger.getLogger(GCFHeader.class);
|
|
||||||
|
|
||||||
public final static int GCF_VERSION = 1;
|
|
||||||
public final static byte[] GCF_FILE_START_MARKER = "GCF\1".getBytes();
|
|
||||||
public final static int FOOTER_START_MARKER = -1;
|
|
||||||
public final static long HEADER_FORWARD_REFERENCE_OFFSET = GCF_FILE_START_MARKER.length + 4; // for the version
|
|
||||||
|
|
||||||
final int version;
|
|
||||||
long footerPosition;
|
|
||||||
final List<Allele> alleles;
|
|
||||||
final List<String> strings;
|
|
||||||
final List<String> samples;
|
|
||||||
final List<Set<String>> filters;
|
|
||||||
|
|
||||||
public GCFHeader(final Map<Allele, Integer> allelesIn, final Map<String, Integer> stringIn, final Map<String, Integer> samplesIn) {
|
|
||||||
version = GCF_VERSION;
|
|
||||||
footerPosition = 0;
|
|
||||||
this.alleles = linearize(allelesIn);
|
|
||||||
this.strings = linearize(stringIn);
|
|
||||||
this.samples = linearize(samplesIn);
|
|
||||||
this.filters = null; // not used with this constructor
|
|
||||||
}
|
|
||||||
|
|
||||||
public GCFHeader(FileInputStream fileInputStream) throws IOException {
|
|
||||||
DataInputStream inputStream = new DataInputStream(fileInputStream);
|
|
||||||
byte[] headerTest = new byte[GCF_FILE_START_MARKER.length];
|
|
||||||
inputStream.read(headerTest);
|
|
||||||
if ( ! Arrays.equals(headerTest, GCF_FILE_START_MARKER) ) {
|
|
||||||
throw new UserException("Could not read GVCF file. GCF_FILE_START_MARKER missing. Saw " + new String(headerTest));
|
|
||||||
} else {
|
|
||||||
version = inputStream.readInt();
|
|
||||||
logger.info("Read GCF version " + version);
|
|
||||||
footerPosition = inputStream.readLong();
|
|
||||||
logger.info("Read footer position of " + footerPosition);
|
|
||||||
long lastPos = fileInputStream.getChannel().position();
|
|
||||||
logger.info(" Last position is " + lastPos);
|
|
||||||
|
|
||||||
// seek to the footer
|
|
||||||
fileInputStream.getChannel().position(footerPosition);
|
|
||||||
if ( inputStream.readInt() != FOOTER_START_MARKER )
|
|
||||||
throw new UserException.MalformedFile("Malformed GCF file: couldn't find the footer marker");
|
|
||||||
alleles = stringsToAlleles(readStrings(inputStream));
|
|
||||||
strings = readStrings(inputStream);
|
|
||||||
samples = readStrings(inputStream);
|
|
||||||
logger.info(String.format("Allele map of %d elements", alleles.size()));
|
|
||||||
logger.info(String.format("String map of %d elements", strings.size()));
|
|
||||||
logger.info(String.format("Sample map of %d elements", samples.size()));
|
|
||||||
filters = initializeFilterCache();
|
|
||||||
fileInputStream.getChannel().position(lastPos);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static int writeHeader(final DataOutputStream outputStream) throws IOException {
|
|
||||||
int startBytes = outputStream.size();
|
|
||||||
outputStream.write(GCF_FILE_START_MARKER);
|
|
||||||
outputStream.writeInt(GCF_VERSION);
|
|
||||||
outputStream.writeLong(0);
|
|
||||||
return outputStream.size() - startBytes;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int writeFooter(final DataOutputStream outputStream) throws IOException {
|
|
||||||
int startBytes = outputStream.size();
|
|
||||||
outputStream.writeInt(FOOTER_START_MARKER); // has to be the same as chrom encoding
|
|
||||||
write(outputStream, allelesToStrings(alleles));
|
|
||||||
write(outputStream, strings);
|
|
||||||
write(outputStream, samples);
|
|
||||||
return outputStream.size() - startBytes;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void write(DataOutputStream outputStream, List<String> l) throws IOException {
|
|
||||||
outputStream.writeInt(l.size());
|
|
||||||
for ( String elt : l ) outputStream.writeUTF(elt);
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<String> allelesToStrings(List<Allele> alleles) {
|
|
||||||
List<String> strings = new ArrayList<String>(alleles.size());
|
|
||||||
for ( Allele allele : alleles ) strings.add(allele.toString());
|
|
||||||
return strings;
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<Set<String>> initializeFilterCache() {
|
|
||||||
// required to allow offset -> set lookup
|
|
||||||
List<Set<String>> l = new ArrayList<Set<String>>(strings.size());
|
|
||||||
for ( int i = 0; i < strings.size(); i++ ) l.add(null);
|
|
||||||
return l;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static List<Allele> stringsToAlleles(final List<String> strings) {
|
|
||||||
final List<Allele> alleles = new ArrayList<Allele>(strings.size());
|
|
||||||
for ( String string : strings ) {
|
|
||||||
boolean isRef = string.endsWith("*");
|
|
||||||
if ( isRef ) string = string.substring(0, string.length() - 1);
|
|
||||||
alleles.add(Allele.create(string, isRef));
|
|
||||||
}
|
|
||||||
return alleles;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static List<String> readStrings(final DataInputStream inputStream) throws IOException {
|
|
||||||
final int nStrings = inputStream.readInt();
|
|
||||||
|
|
||||||
final List<String> strings = new ArrayList<String>(nStrings);
|
|
||||||
for ( int i = 0; i < nStrings; i++ ) {
|
|
||||||
strings.add(inputStream.readUTF());
|
|
||||||
}
|
|
||||||
|
|
||||||
return strings;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <T> List<T> linearize(final Map<T, Integer> map) {
|
|
||||||
final ArrayList<T> l = new ArrayList<T>(map.size());
|
|
||||||
for ( int i = 0; i < map.size(); i++ ) l.add(null);
|
|
||||||
for ( final Map.Entry<T, Integer> elt : map.entrySet() )
|
|
||||||
l.set(elt.getValue(), elt.getKey());
|
|
||||||
return l;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getSample(final int offset) { return samples.get(offset); }
|
|
||||||
public String getString(final int offset) { return strings.get(offset); }
|
|
||||||
public Allele getAllele(final int offset) { return alleles.get(offset); }
|
|
||||||
public List<Allele> getAlleles(final int[] offsets) {
|
|
||||||
final List<Allele> alleles = new ArrayList<Allele>(offsets.length);
|
|
||||||
for ( int i : offsets ) alleles.add(getAllele(i));
|
|
||||||
return alleles;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Set<String> getFilters(final int offset) {
|
|
||||||
Set<String> cached = filters.get(offset);
|
|
||||||
|
|
||||||
if ( cached != null )
|
|
||||||
return cached;
|
|
||||||
else {
|
|
||||||
final String filterString = getString(offset);
|
|
||||||
if ( filterString.equals(VCFConstants.UNFILTERED) )
|
|
||||||
return null; // UNFILTERED records are represented by null
|
|
||||||
else {
|
|
||||||
Set<String> set = VCFCodec.parseFilters(null, -1, filterString);
|
|
||||||
filters.set(offset, set); // remember the result
|
|
||||||
return set;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,80 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2011, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.broadinstitute.sting.utils.gcf;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
|
||||||
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* [Short one sentence description of this walker]
|
|
||||||
* <p/>
|
|
||||||
* <p>
|
|
||||||
* [Functionality of this walker]
|
|
||||||
* </p>
|
|
||||||
* <p/>
|
|
||||||
* <h2>Input</h2>
|
|
||||||
* <p>
|
|
||||||
* [Input description]
|
|
||||||
* </p>
|
|
||||||
* <p/>
|
|
||||||
* <h2>Output</h2>
|
|
||||||
* <p>
|
|
||||||
* [Output description]
|
|
||||||
* </p>
|
|
||||||
* <p/>
|
|
||||||
* <h2>Examples</h2>
|
|
||||||
* <pre>
|
|
||||||
* java
|
|
||||||
* -jar GenomeAnalysisTK.jar
|
|
||||||
* -T $WalkerName
|
|
||||||
* </pre>
|
|
||||||
*
|
|
||||||
* @author Your Name
|
|
||||||
* @since Date created
|
|
||||||
*/
|
|
||||||
public class GCFHeaderBuilder {
|
|
||||||
Map<Allele, Integer> alleles = new HashMap<Allele, Integer>();
|
|
||||||
Map<String, Integer> strings = new HashMap<String, Integer>();
|
|
||||||
Map<String, Integer> samples = new HashMap<String, Integer>();
|
|
||||||
|
|
||||||
public GCFHeader createHeader() {
|
|
||||||
return new GCFHeader(alleles, strings, samples);
|
|
||||||
}
|
|
||||||
|
|
||||||
public int encodeString(final String chr) { return encode(strings, chr); }
|
|
||||||
public int encodeAllele(final Allele allele) { return encode(alleles, allele); }
|
|
||||||
public int encodeSample(final String sampleName) { return encode(samples, sampleName); }
|
|
||||||
|
|
||||||
private <T> int encode(Map<T, Integer> map, T key) {
|
|
||||||
Integer v = map.get(key);
|
|
||||||
if ( v == null ) {
|
|
||||||
v = map.size();
|
|
||||||
map.put(key, v);
|
|
||||||
}
|
|
||||||
return v;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,123 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2011, The Broad Institute
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.broadinstitute.sting.utils.gcf;
|
|
||||||
|
|
||||||
import net.sf.samtools.SAMSequenceDictionary;
|
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.IndexingVCFWriter;
|
|
||||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|
||||||
|
|
||||||
import java.io.*;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* GCFWriter implementing the VCFWriter interface
|
|
||||||
* @author Your Name
|
|
||||||
* @since Date created
|
|
||||||
*/
|
|
||||||
public class GCFWriter extends IndexingVCFWriter {
|
|
||||||
final boolean skipGenotypes;
|
|
||||||
final FileOutputStream fileOutputStream;
|
|
||||||
final DataOutputStream dataOutputStream;
|
|
||||||
final GCFHeaderBuilder gcfHeaderBuilder;
|
|
||||||
int nbytes = 0;
|
|
||||||
VCFHeader header = null;
|
|
||||||
File location;
|
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------
|
|
||||||
//
|
|
||||||
// Constructors
|
|
||||||
//
|
|
||||||
// --------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
public GCFWriter(final File location, final SAMSequenceDictionary refDict, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
|
|
||||||
super(writerName(location, null), location, null, refDict, enableOnTheFlyIndexing);
|
|
||||||
this.location = location;
|
|
||||||
this.skipGenotypes = doNotWriteGenotypes;
|
|
||||||
|
|
||||||
// write the output
|
|
||||||
try {
|
|
||||||
fileOutputStream = new FileOutputStream(location);
|
|
||||||
dataOutputStream = createDataOutputStream(fileOutputStream);
|
|
||||||
gcfHeaderBuilder = new GCFHeaderBuilder();
|
|
||||||
} catch ( FileNotFoundException e ) {
|
|
||||||
throw new UserException.CouldNotCreateOutputFile(location, e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------
|
|
||||||
//
|
|
||||||
// VCFWriter interface functions
|
|
||||||
//
|
|
||||||
// --------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void writeHeader(VCFHeader header) {
|
|
||||||
this.header = header;
|
|
||||||
try {
|
|
||||||
nbytes += GCFHeader.writeHeader(dataOutputStream);
|
|
||||||
} catch ( IOException e ) {
|
|
||||||
throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Couldn't write header", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void add(VariantContext vc) {
|
|
||||||
super.add(vc);
|
|
||||||
GCF gcf = new GCF(gcfHeaderBuilder, vc, skipGenotypes);
|
|
||||||
try {
|
|
||||||
nbytes += gcf.write(dataOutputStream);
|
|
||||||
} catch ( IOException e ) {
|
|
||||||
throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Failed to add gcf record " + gcf + " to stream " + getStreamName(), e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void close() {
|
|
||||||
// todo -- write out VCF header lines
|
|
||||||
GCFHeader gcfHeader = gcfHeaderBuilder.createHeader();
|
|
||||||
try {
|
|
||||||
long headerPosition = nbytes;
|
|
||||||
nbytes += gcfHeader.writeFooter(dataOutputStream);
|
|
||||||
dataOutputStream.close();
|
|
||||||
//System.out.println("Writing forward reference to " + headerPosition);
|
|
||||||
|
|
||||||
RandomAccessFile raFile = new RandomAccessFile(location, "rw");
|
|
||||||
raFile.seek(GCFHeader.HEADER_FORWARD_REFERENCE_OFFSET);
|
|
||||||
raFile.writeLong(headerPosition);
|
|
||||||
raFile.close();
|
|
||||||
} catch ( IOException e ) {
|
|
||||||
throw new ReviewedStingException("Failed to close GCFWriter " + getStreamName(), e);
|
|
||||||
}
|
|
||||||
|
|
||||||
super.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static final DataOutputStream createDataOutputStream(final OutputStream stream) {
|
|
||||||
return new DataOutputStream(new BufferedOutputStream(stream, GCF.BUFFER_SIZE));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
Loading…
Reference in New Issue