gatk-3.8/java/lib/edu/mit/broad/picard/variation/DbSnpFileGenerator.java

172 lines
7.1 KiB
Java
Raw Normal View History

/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.variation;
import java.io.*;
import java.util.*;
import edu.mit.broad.sam.SAMSequenceRecord;
import edu.mit.broad.sam.SAMFileReader;
import edu.mit.broad.sam.util.BinaryCodec;
import edu.mit.broad.picard.io.IoUtil;
import edu.mit.broad.picard.util.TabbedTextFileParser;
import edu.mit.broad.picard.util.Log;
/**
* Generates a binary version of the data for all dbSnps from a UCSC snp###.txt file. Files with SNP data
* can be downloaded here: http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/. See KnownVariantCodec.java
* for binary file format.
*/
public class DbSnpFileGenerator {
    // Codes from the DbSnp file that we will handle. All others are ignored.
    // Package visibility for testing purposes.
    static final String snp = "single"; // code in DbSnp file for a SNP
    static final String insertion = "insertion"; // code in DbSnp file for an insertion
    static final String deletion = "deletion"; // code in DbSnp file for a deletion
    static final String indel = "in-del"; // code in DbSnp file for an insertion/deletion

    // Zero-based positions of the tab-separated columns read from each dbSnp row.
    private static final int SEQUENCE_COLUMN = 1;
    private static final int START_COLUMN = 2;
    private static final int END_COLUMN = 3;
    private static final int NAME_COLUMN = 4;
    private static final int VARIANT_CODE_COLUMN = 11;
    private static final int VALIDATION_COLUMN = 12;

    private final File snpFile;
    private final File seqDictionaryFile;
    private final Map<String, Integer> sequenceToIndex = new HashMap<String, Integer>();
    private List<SAMSequenceRecord> dictionary;
    private final BinaryCodec codec;
    private final KnownVariantCodec kvCodec = new KnownVariantCodec();
    private Map<String, SortedSet<KnownVariant>> sequenceToSnps;
    private final Log log = Log.getInstance(DbSnpFileGenerator.class);

    /**
     * Package-private constructor so we can use a temporary file during testing.
     * Note that the output file is opened for writing here, not in {@link #writeDbSnpFile()}.
     *
     * @param snpFile The UCSC dbSnp file
     * @param seqDictionaryFile The Sequence Dictionary
     * @param tempOutputFile The binary file to write to
     */
    DbSnpFileGenerator(File snpFile, File seqDictionaryFile, File tempOutputFile) {
        this.snpFile = snpFile;
        this.seqDictionaryFile = seqDictionaryFile;
        this.codec = new BinaryCodec(new DataOutputStream(IoUtil.openFileForWriting(tempOutputFile)));
    }

    /**
     * Writes the full binary dbSnp file (magic number, reference sequences, then the
     * variant records) and calls close on the BinaryCodec. The codec is closed even
     * when one of the write steps throws, so the output stream is not leaked.
     */
    public void writeDbSnpFile() {
        try {
            kvCodec.encode(KnownVariantCodec.MAGIC_NUMBER, codec);
            writeReferenceSequences();
            writeDbSnpRecords();
        } finally {
            codec.close();
        }
    }

    /**
     * Reads the sequence dictionary from the header of the sequence dictionary file,
     * remembers it for later lookups, and encodes it to the output.
     */
    private void writeReferenceSequences() {
        // NOTE(review): the SAMFileReader is never closed here; confirm whether it holds a file handle.
        SAMFileReader sam = new SAMFileReader(this.seqDictionaryFile);
        this.dictionary = sam.getFileHeader().getSequences();
        kvCodec.encode(this.dictionary, codec);
    }

    /**
     * Writes all the dbSnp records to the file in the order of the reference sequences
     * in the sequence dictionary file.
     *
     * The whole text file is read first and the variants are collected into one sorted
     * set per sequence; then the record count is written, followed by the records.
     * The count reflects only variants that were actually stored, so it always matches
     * the number of records encoded after it.
     */
    private void writeDbSnpRecords() {
        sequenceToSnps = new HashMap<String, SortedSet<KnownVariant>>();
        int count = 0;
        final Map<String, Integer> indexBySequence = getSequenceToIndex();

        // NOTE(review): the parser is never closed; confirm whether TabbedTextFileParser needs it.
        TabbedTextFileParser parser = new TabbedTextFileParser(true, snpFile);
        while (parser.hasNext()) {
            final String[] parts = parser.next();
            final String sequence = parts[SEQUENCE_COLUMN];

            // If we don't have this sequence in our dictionary, ignore it
            final Integer sequenceIndex = indexBySequence.get(sequence);
            if (sequenceIndex == null) {
                continue;
            }

            int start = Integer.parseInt(parts[START_COLUMN]) + 1; // We go from a zero-based to a 1-based system.
            final int rawEnd = Integer.parseInt(parts[END_COLUMN]);
            int end = rawEnd;
            final String var = parts[VARIANT_CODE_COLUMN];
            final boolean isIndel = var.equals(indel);

            // We only care about SNPs, insertions, and deletions; otherwise skip it
            final VariantType type;
            if (var.equals(snp)) {
                type = VariantType.SNP;
                end = start; // For SNPs, we mark the start and end as the same location
            }
            // For insertions and deletions, we mark the base on either side of the affected reference sequence
            else if (var.equals(insertion)) {
                type = VariantType.insertion;
                // NOTE(review): unlike the in-del branch below, start is not decremented here;
                // confirm this matches the intended flanking-base convention.
                end = start + 1; // Insertions are always length 1
            }
            else if (var.equals(deletion)) {
                type = VariantType.deletion;
                start = start - 1;
                end++;
            }
            else if (isIndel) { // For indels, we do one each of an insertion (here) and a deletion (below)
                type = VariantType.insertion;
                start = start - 1;
                end = start + 1;
            }
            else {
                continue;
            }

            SortedSet<KnownVariant> sequenceVars = sequenceToSnps.get(sequence);
            if (sequenceVars == null) {
                sequenceVars = new TreeSet<KnownVariant>();
                sequenceToSnps.put(sequence, sequenceVars);
            }

            final boolean validated = !parts[VALIDATION_COLUMN].equals("unknown");
            final String name = parts[NAME_COLUMN];

            // A sorted set silently rejects an element it considers a duplicate, so only
            // count variants that were really added; otherwise the count written below
            // could exceed the number of records that follow it.
            if (sequenceVars.add(new KnownVariant(name, sequenceIndex, start, end, type, validated))) {
                count++;
            }

            // If it's an in-del, we add it as a deletion (in addition to the insertion we also added) so we
            // will have two records in our binary format for the one record in the text file
            if (isIndel) {
                if (sequenceVars.add(new KnownVariant(name, sequenceIndex, start,
                        rawEnd + 1, VariantType.deletion, validated))) {
                    count++;
                }
            }
        }

        codec.writeInt(count);

        // Loop through the sequences from the sequence dictionary in order
        for (SAMSequenceRecord sequenceRecord : dictionary) {
            // And write their known variants in order
            final SortedSet<KnownVariant> variants = sequenceToSnps.get(sequenceRecord.getSequenceName());
            if (variants != null) {
                for (KnownVariant variant : variants) {
                    kvCodec.encode(variant, codec);
                }
            }
        }
        log.info("Wrote " + count + " dbSnp records.");
    }

    /**
     * Returns the map of sequences to their index in the reference dictionary,
     * creating it if it does not already exist. The dictionary must already have
     * been loaded by {@link #writeReferenceSequences()}.
     *
     * @return the map of sequences to their index in the reference dictionary
     */
    private Map<String, Integer> getSequenceToIndex() {
        if (sequenceToIndex.isEmpty()) {
            for (int i = 0; i < dictionary.size(); i++) {
                sequenceToIndex.put(dictionary.get(i).getSequenceName(), i);
            }
        }
        return sequenceToIndex;
    }
}