358 lines
13 KiB
Java
358 lines
13 KiB
Java
/*
|
|
* The Broad Institute
|
|
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
|
* This software and its documentation are copyright 2009 by the
|
|
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
|
*
|
|
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
|
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
|
*/
|
|
package edu.mit.broad.picard.aligner.maq;
|
|
|
|
import edu.mit.broad.sam.*;
|
|
import edu.mit.broad.sam.util.CloseableIterator;
|
|
import edu.mit.broad.sam.util.BinaryCodec;
|
|
import edu.mit.broad.sam.util.StringUtil;
|
|
import edu.mit.broad.picard.io.IoUtil;
|
|
import edu.mit.broad.picard.PicardException;
|
|
import edu.mit.broad.picard.util.SamPairUtil;
|
|
|
|
import java.io.File;
|
|
import java.io.BufferedInputStream;
|
|
import java.util.*;
|
|
|
|
/**
|
|
* Reads a Maq map file and returns an an iterator of SAMRecords and a populated header
|
|
*
|
|
* IMPORTANT! Even though the reads in the map file are in coordinate order, this iterator
|
|
* will not necessarily return them in that order. For paired reads, both will be
|
|
* returned only after *both* records have been seen.
|
|
*
|
|
* @author Kathleen Tibbetts
|
|
*/
|
|
public class MapFileIterator implements CloseableIterator<SAMRecord> {
|
|
|
|
public static final int MATE_UNMAPPED_FLAG = 64;
|
|
public static final int READ_UNMAPPED_FLAG = 192;
|
|
|
|
private static final int READ_NAME_LENGTH = 36;
|
|
private static final int MAP_FORMAT = -1;
|
|
private static final int MAX_READ_LENGTH = 128;
|
|
|
|
private static final byte ACGT[] = {'A', 'C', 'G', 'T'};
|
|
|
|
public static final String PROGRAM_RECORD = "0";
|
|
|
|
private long recordCount = 0L;
|
|
private int recordsRead = 0;
|
|
private BinaryCodec mapCodec;
|
|
private final SAMFileHeader header;
|
|
private final boolean pairedReads;
|
|
private final boolean jumpingLibrary;
|
|
private final List<SAMRecord> next = new ArrayList<SAMRecord>();
|
|
private final Map<String, SAMRecord> pending = new HashMap<String, SAMRecord>();
|
|
private final List<File> mapFiles = new LinkedList<File>();
|
|
|
|
/**
|
|
* Constructor. Opens the map file, reads the record count and header from it,
|
|
* creates the SAMFileHeader, and queues up the first read
|
|
*
|
|
* @param mapFile The Maq map file to read
|
|
* @param commandLine The command line used to invoke Maq (for the header)
|
|
* @param pairedReads Whether this is a paired-end run
|
|
*/
|
|
public MapFileIterator(String commandLine, boolean pairedReads, boolean jumpingLibrary, File... mapFile) {
|
|
if (mapFile.length == 0) {
|
|
throw new IllegalArgumentException("At least one map file must be provided.");
|
|
}
|
|
mapFiles.addAll(Arrays.asList(mapFile));
|
|
|
|
this.pairedReads = pairedReads;
|
|
this.jumpingLibrary = jumpingLibrary;
|
|
|
|
header = new SAMFileHeader();
|
|
header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
|
|
SAMProgramRecord program = new SAMProgramRecord(PROGRAM_RECORD);
|
|
program.setProgramVersion(MaqConstants.getProgramVersion());
|
|
program.setCommandLine(commandLine);
|
|
header.addProgramRecord(program);
|
|
|
|
queueNextMapFile();
|
|
}
|
|
|
|
/**
|
|
* Queues up the next map file
|
|
*
|
|
* @return true if there's another map file to iterate over
|
|
*/
|
|
private boolean queueNextMapFile() {
|
|
|
|
// Close the old file
|
|
if (mapCodec != null) {
|
|
mapCodec.close();
|
|
}
|
|
|
|
// If there are no more map files, return fales
|
|
if (mapFiles.size() == 0) {
|
|
return false;
|
|
}
|
|
|
|
// Otherwise, open the next file and reset the recordsRead count
|
|
mapCodec = new BinaryCodec(new BufferedInputStream(IoUtil.openFileForReading(mapFiles.remove(0))));
|
|
int format = mapCodec.readInt();
|
|
if (format != MAP_FORMAT) {
|
|
mapCodec.close();
|
|
throw new PicardException("Unrecognized Maq map file format: " + format);
|
|
}
|
|
recordsRead = 0;
|
|
|
|
|
|
// Read the sequences out of the map file and set them on the header
|
|
int sequenceCount = mapCodec.readInt();
|
|
List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>();
|
|
for (int i = 0; i < sequenceCount; i++) {
|
|
int length = mapCodec.readInt();
|
|
// Write the sequence name, trimming off the null terminator
|
|
sequences.add(new SAMSequenceRecord(mapCodec.readString(length).substring(0, length-1)));
|
|
}
|
|
if (header.getSequences() == null || header.getSequences().size() == 0) {
|
|
header.setSequences(sequences);
|
|
}
|
|
else {
|
|
// TODO: Check that the sequences match and throw and exception if they don't
|
|
}
|
|
recordCount = mapCodec.readLong();
|
|
|
|
readNext();
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Closes the BinaryCodec reading the map file
|
|
*/
|
|
public void close() {
|
|
mapCodec.close();
|
|
}
|
|
|
|
/**
|
|
* @return true if the iteration has more elements
|
|
*/
|
|
public boolean hasNext() {
|
|
return next.size() > 0;
|
|
}
|
|
|
|
/**
|
|
* @return the next SAMRecord in the iteration
|
|
* @throws NoSuchElementException if this is called when hasNext() returns false
|
|
*/
|
|
public SAMRecord next() {
|
|
if (!hasNext()) {
|
|
throw new NoSuchElementException("No more elements in this iteration");
|
|
}
|
|
SAMRecord result = next.remove(0);
|
|
readNext();
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* Reads the next element from the map file. If we are done with it, we put it in the <code>next</code>
|
|
* list; if we are waiting to see its mate, we put it in the <code>pending</code> map. Calls itself
|
|
* repeatedly until there is at least one element in <code>next</code>.
|
|
*/
|
|
private void readNext() {
|
|
|
|
// If there's already a record queued up, just return
|
|
if (next.size() > 0) {
|
|
return;
|
|
}
|
|
|
|
// If we've read all there is, then any remaining records in the pending map should be returned.
|
|
// If this is not a PE run, then the pending map will be empty and we're done.
|
|
if (recordsRead == recordCount) {
|
|
if (pending.size() > 0) {
|
|
StringBuffer sb = new StringBuffer();
|
|
for (String item : pending.keySet()) {
|
|
sb.append(item).append("\n");
|
|
}
|
|
throw new PicardException("MapFileIterator pending map should have been empty but contained " +
|
|
"the following records: " + sb.toString());
|
|
}
|
|
queueNextMapFile();
|
|
return;
|
|
}
|
|
|
|
// Otherwise, we read until there is at least one record in the <code>next</code> list
|
|
readMapRecord();
|
|
if (next.size() == 0) {
|
|
readNext();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Reads one record from the map file and throws it onto the pending map or the next list,
|
|
* depending on whether we have already seen its mate
|
|
*/
|
|
private void readMapRecord() {
|
|
|
|
// Now that we've got all the data from the binary file, write a SAMRecord and add it to
|
|
// the new BAM file
|
|
SAMRecord record = new SAMRecord();
|
|
record.setAttribute(SAMTag.PG.toString(), PROGRAM_RECORD);
|
|
record.setReadPairedFlag(this.pairedReads);
|
|
|
|
// the last base is the single-end mapping quality.
|
|
byte seqsAndQuals[] = new byte[MAX_READ_LENGTH-1];
|
|
mapCodec.readBytes(seqsAndQuals);
|
|
|
|
byte singleEndMappingQualityOrIndelLength = mapCodec.readByte();
|
|
|
|
// the length of the read
|
|
int readLength = mapCodec.readUByte();
|
|
setSeqsAndQuals(seqsAndQuals, readLength, record);
|
|
|
|
// the final mapping quality (unless <code>flag</code> below is 130, then it is the
|
|
// position of the indel (or 0 if no indel)
|
|
int mappingQuality = mapCodec.readUByte();
|
|
|
|
// mismatches in the 28bp (higher 4 bits) and mismatches (lower 4 bits)
|
|
mapCodec.readUByte();
|
|
// sum of errors of the best hit
|
|
mapCodec.readUByte();
|
|
// counts of all 0- and 1-mismatch hits on the reference
|
|
mapCodec.readUByte();
|
|
mapCodec.readUByte();
|
|
|
|
// A bitwise flag. See the Maq docs for its full meaning
|
|
int flag = mapCodec.readUByte();
|
|
|
|
// the lower mapQ of the two ends (equals map_qual if unpaired); if flag is 130: mapQ of its mate
|
|
int altQual = mapCodec.readUByte();
|
|
|
|
// Index of the sequence for this read
|
|
record.setReferenceIndex((int)mapCodec.readUInt(), getHeader());
|
|
|
|
// Start position and strand
|
|
long pos = mapCodec.readUInt();
|
|
int startPos = ((int)((pos>>1)& 0x7FFFFFFF)) + 1;
|
|
record.setAlignmentStart(startPos);
|
|
record.setReadNegativeStrandFlag((pos&1) == 1);
|
|
|
|
// offset of the mate (zero if unpaired, or two ends mapped to different chr)
|
|
mapCodec.readInt();
|
|
|
|
// The read name
|
|
byte nameBytes[] = new byte[READ_NAME_LENGTH];
|
|
mapCodec.readBytes(nameBytes);
|
|
String name = StringUtil.bytesToString(nameBytes).trim();
|
|
if (this.pairedReads) {
|
|
if (name.endsWith("/1")) {
|
|
record.setFirstOfPairFlag(true);
|
|
record.setSecondOfPairFlag(false);
|
|
}
|
|
else if (name.endsWith("/2")) {
|
|
record.setFirstOfPairFlag(false);
|
|
record.setSecondOfPairFlag(true);
|
|
}
|
|
else {
|
|
throw new PicardException("Unrecognized ending for paired read name: " + name);
|
|
}
|
|
name = name.substring(0, name.length()-2);
|
|
}
|
|
record.setReadName(name);
|
|
|
|
|
|
if (flag != 130 || singleEndMappingQualityOrIndelLength == 0) { // No indel
|
|
record.setCigarString(readLength + "M");
|
|
record.setMappingQuality(mappingQuality);
|
|
}
|
|
else { // Indel
|
|
int indelPos = mappingQuality;
|
|
String cigar = indelPos + "M" + Math.abs(singleEndMappingQualityOrIndelLength);
|
|
int remaining = readLength - indelPos;
|
|
if (singleEndMappingQualityOrIndelLength > 0) {
|
|
cigar += "I" + (remaining - singleEndMappingQualityOrIndelLength) + "M";
|
|
}
|
|
else {
|
|
cigar += "D" + remaining + "M";
|
|
}
|
|
record.setCigarString(cigar);
|
|
// In the docs, it look like there is a mapping quality for the mate, do we use that?
|
|
record.setMappingQuality(altQual);
|
|
}
|
|
|
|
if (!pairedReads) {
|
|
record.setProperPairFlag(false);
|
|
next.add(record);
|
|
}
|
|
else {
|
|
record.setMateUnmappedFlag(flag == MATE_UNMAPPED_FLAG);
|
|
SAMRecord mate = pending.remove(record.getReadName());
|
|
|
|
if (mate != null) {
|
|
boolean proper = SamPairUtil.isProperPair(record, mate, jumpingLibrary);
|
|
record.setProperPairFlag(proper);
|
|
mate.setProperPairFlag(proper);
|
|
|
|
SamPairUtil.setMateInfo(record, mate);
|
|
|
|
int insertSize = SamPairUtil.computeInsertSize(record, mate);
|
|
record.setInferredInsertSize(insertSize);
|
|
mate.setInferredInsertSize(insertSize);
|
|
|
|
if (!mate.getMateUnmappedFlag()) {
|
|
next.add(record);
|
|
}
|
|
if (!record.getMateUnmappedFlag()) {
|
|
next.add(mate);
|
|
}
|
|
}
|
|
else {
|
|
pending.put(record.getReadName(), record);
|
|
}
|
|
}
|
|
|
|
// TODO: Figure out what do do about noise reads long-term
|
|
// Note that it is possible that we have lost a "Noise read" annotation at this point. Since
|
|
// we try to map a pair if only one of the reads is classified as "noise", then for any paired
|
|
// reads where one was a noise read and one was not, we will lose the noise annotation on the
|
|
// one noisy read. We have discussed either re-doing the noise evaluation here, modifying the
|
|
// read name to carry the noise flag through Maq, or changing what reads we give to Maq.
|
|
|
|
recordsRead++;
|
|
|
|
}
|
|
|
|
/**
|
|
* Decodes the sequence and the qualities and sets them on the SAMrecords
|
|
*
|
|
* @param seqsAndQuals the list of seqs and quals
|
|
* @param readLength the length of the read
|
|
* @param sam the SAMRecord to populate
|
|
*/
|
|
private void setSeqsAndQuals(byte seqsAndQuals[], int readLength, SAMRecord sam) {
|
|
byte sequence[] = new byte[readLength];
|
|
byte qualities[] = new byte[readLength];
|
|
for (int i = 0; i < readLength; i++) {
|
|
byte b = seqsAndQuals[i];
|
|
qualities[i] = (byte)(b & 0x3F);
|
|
if (b == 0) {
|
|
sequence[i] = 'N';
|
|
}
|
|
else {
|
|
sequence[i] = ACGT[(seqsAndQuals[i] >> 6) & 3];
|
|
}
|
|
}
|
|
sam.setReadBases(sequence);
|
|
sam.setBaseQualities(qualities);
|
|
}
|
|
|
|
/**
|
|
* @throws UnsupportedOperationException -- not implemented
|
|
*/
|
|
public void remove() {
|
|
throw new UnsupportedOperationException("remove() not supported in MapFileIterator");
|
|
}
|
|
|
|
public SAMFileHeader getHeader() { return header; }
|
|
}
|