/*
 * The Broad Institute
 * SOFTWARE COPYRIGHT NOTICE AGREEMENT
 * This software and its documentation are copyright 2008 by the
 * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
 *
 * This software is supplied without any warranty or guaranteed support whatsoever.
 * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
 * or functionality.
 */
|
|
package edu.mit.broad.sam;
|
|
|
|
|
|
import edu.mit.broad.sam.util.AsciiLineReader;
|
|
import edu.mit.broad.sam.util.CloseableIterator;
|
|
import edu.mit.broad.sam.util.StringUtil;
|
|
|
|
import java.io.File;
|
|
import java.io.InputStream;
|
|
import java.util.Map;
|
|
|
|
/**
|
|
* Internal class for reading SAM text files.
|
|
*/
|
|
class SAMTextReader
|
|
extends SAMFileReader.ReaderImplementation
|
|
{
|
|
private static final int QNAME_COL = 0;
|
|
private static final int FLAG_COL = 1;
|
|
private static final int RNAME_COL = 2;
|
|
private static final int POS_COL = 3;
|
|
private static final int MAPQ_COL = 4;
|
|
private static final int CIGAR_COL = 5;
|
|
private static final int MRNM_COL = 6;
|
|
private static final int MPOS_COL = 7;
|
|
private static final int ISIZE_COL = 8;
|
|
private static final int SEQ_COL = 9;
|
|
private static final int QUAL_COL = 10;
|
|
|
|
private static final int NUM_REQUIRED_FIELDS = 11;
|
|
|
|
private AsciiLineReader mReader;
|
|
private SAMFileHeader mFileHeader = null;
|
|
private String mCurrentLine = null;
|
|
private RecordIterator mIterator = null;
|
|
private File mFile = null;
|
|
private final TextTagCodec tagCodec = new TextTagCodec();
|
|
private SAMFileReader.ValidationStringency validationStringency = SAMFileReader.ValidationStringency.DEFAULT_STRINGENCY;
|
|
|
|
SAMTextReader(final InputStream stream) {
|
|
mReader = new AsciiLineReader(stream);
|
|
readHeader();
|
|
}
|
|
|
|
SAMTextReader(final InputStream stream, final File file) {
|
|
this(stream);
|
|
mFile = file;
|
|
}
|
|
|
|
void close() {
|
|
if (mReader != null) {
|
|
try {
|
|
mReader.close();
|
|
} finally {
|
|
mReader = null;
|
|
}
|
|
}
|
|
}
|
|
|
|
SAMFileHeader getFileHeader() {
|
|
return mFileHeader;
|
|
}
|
|
|
|
public SAMFileReader.ValidationStringency getValidationStringency() {
|
|
return validationStringency;
|
|
}
|
|
|
|
public void setValidationStringency(final SAMFileReader.ValidationStringency lenientValidation) {
|
|
this.validationStringency = lenientValidation;
|
|
}
|
|
|
|
CloseableIterator<SAMRecord> getIterator() {
|
|
if (mReader == null) {
|
|
throw new IllegalStateException("File reader is closed");
|
|
}
|
|
if (mIterator != null) {
|
|
throw new IllegalStateException("Iteration in progress");
|
|
}
|
|
mIterator = new RecordIterator();
|
|
return mIterator;
|
|
}
|
|
|
|
CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) {
|
|
throw new UnsupportedOperationException("Cannot query SAM text files");
|
|
}
|
|
|
|
private void readHeader() {
|
|
final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
|
|
mFileHeader = headerCodec.decode(mReader, mFile);
|
|
mCurrentLine = headerCodec.getCurrentLine();
|
|
}
|
|
|
|
private String advanceLine() {
|
|
mCurrentLine = mReader.readLine();
|
|
return mCurrentLine;
|
|
}
|
|
|
|
private String makeErrorString(final String reason) {
|
|
String fileMessage = "";
|
|
if (mFile != null) {
|
|
fileMessage = "File " + mFile + "; ";
|
|
}
|
|
return "Error parsing text SAM file. " + reason + "; " + fileMessage +
|
|
"Line " + mReader.getLineNumber() + "\nLine: " + mCurrentLine;
|
|
}
|
|
|
|
private RuntimeException reportFatalErrorParsingLine(final String reason) {
|
|
return new SAMFormatException(makeErrorString(reason));
|
|
}
|
|
|
|
private void reportErrorParsingLine(final String reason) {
|
|
final String errorMessage = makeErrorString(reason);
|
|
|
|
if (validationStringency == SAMFileReader.ValidationStringency.STRICT) {
|
|
throw new SAMFormatException(errorMessage);
|
|
} else if (validationStringency == SAMFileReader.ValidationStringency.LENIENT) {
|
|
System.err.println("Ignoring SAM validation error due to lenient parsing:");
|
|
System.err.println(errorMessage);
|
|
}
|
|
}
|
|
|
|
private void reportErrorParsingLine(final Exception e) {
|
|
final String errorMessage = makeErrorString(e.getMessage());
|
|
if (validationStringency == SAMFileReader.ValidationStringency.STRICT) {
|
|
throw new SAMFormatException(errorMessage);
|
|
} else if (validationStringency == SAMFileReader.ValidationStringency.LENIENT) {
|
|
System.err.println("Ignoring SAM validation error due to lenient parsing:");
|
|
System.err.println(errorMessage);
|
|
}
|
|
}
|
|
|
|
private class RecordIterator implements CloseableIterator<SAMRecord> {
|
|
|
|
/**
|
|
* Allocate this once rather than for every line as a performance optimization.
|
|
* The size is arbitrary -- merely large enough to handle the maximum number
|
|
* of fields we might expect from a reasonable SAM file.
|
|
*/
|
|
private final String[] mFields = new String[10000];
|
|
|
|
private SAMRecord mCurrentRecord;
|
|
|
|
private RecordIterator() {
|
|
assert(mReader != null);
|
|
if (mCurrentLine != null) {
|
|
parseLine();
|
|
}
|
|
|
|
}
|
|
|
|
public void close() {
|
|
mCurrentRecord = null;
|
|
SAMTextReader.this.close();
|
|
}
|
|
|
|
public boolean hasNext() {
|
|
return mCurrentRecord != null;
|
|
}
|
|
|
|
public SAMRecord next() {
|
|
if (!hasNext()) {
|
|
throw new IllegalStateException("Cannot call next() on exhausted iterator");
|
|
}
|
|
final SAMRecord ret = mCurrentRecord;
|
|
mCurrentRecord = null;
|
|
advanceLine();
|
|
if (mCurrentLine != null) {
|
|
parseLine();
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
public void remove() {
|
|
throw new UnsupportedOperationException("Not supported: remove");
|
|
}
|
|
|
|
int parseInt(final String s, final String fieldName) {
|
|
final int ret;
|
|
try {
|
|
ret = Integer.parseInt(s);
|
|
} catch (NumberFormatException e) {
|
|
throw reportFatalErrorParsingLine("Non-numeric value in " + fieldName + " column");
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
void validateReferenceName(final String rname, final String fieldName) {
|
|
if (fieldName.equals("MRNM") && rname.equals("=")) {
|
|
return;
|
|
}
|
|
if (getFileHeader().getSequences().size() != 0) {
|
|
if (getFileHeader().getSequence(rname) == null) {
|
|
reportErrorParsingLine(fieldName + " '" + rname + "' not found in any SQ record");
|
|
}
|
|
}
|
|
}
|
|
|
|
private void parseLine() {
|
|
final int numFields = StringUtil.split(mCurrentLine, mFields, '\t');
|
|
if (numFields < NUM_REQUIRED_FIELDS) {
|
|
reportErrorParsingLine("Not enough fields");
|
|
}
|
|
if (numFields == mFields.length) {
|
|
reportErrorParsingLine("Too many fields in SAM text record.");
|
|
}
|
|
for (int i = 0; i < numFields; ++i) {
|
|
if (mFields[i].length() == 0) {
|
|
reportErrorParsingLine("Empty field at position " + i + " (zero-based)");
|
|
}
|
|
}
|
|
mCurrentRecord = new SAMRecord();
|
|
mCurrentRecord.setReadName(mFields[QNAME_COL]);
|
|
|
|
final int flags = parseInt(mFields[FLAG_COL], "FLAG");
|
|
mCurrentRecord.setFlags(flags);
|
|
|
|
final String rname = mFields[RNAME_COL];
|
|
if (!rname.equals("*")) {
|
|
validateReferenceName(rname, "RNAME");
|
|
mCurrentRecord.setReferenceName(rname);
|
|
} else if (!mCurrentRecord.getReadUnmappedFlag()) {
|
|
reportErrorParsingLine("RNAME is not specified but flags indicate mapped");
|
|
}
|
|
|
|
final int pos = parseInt(mFields[POS_COL], "POS");
|
|
final int mapq = parseInt(mFields[MAPQ_COL], "MAPQ");
|
|
final String cigar = mFields[CIGAR_COL];
|
|
if (!SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(mCurrentRecord.getReferenceName())) {
|
|
if (pos == 0) {
|
|
reportErrorParsingLine("POS must be non-zero if RNAME is specified");
|
|
}
|
|
if (!mCurrentRecord.getReadUnmappedFlag() && cigar.equals("*")) {
|
|
reportErrorParsingLine("CIGAR must not be '*' if RNAME is specified");
|
|
}
|
|
} else {
|
|
if (pos != 0) {
|
|
reportErrorParsingLine("POS must be zero if RNAME is not specified");
|
|
}
|
|
if (mapq != 0) {
|
|
reportErrorParsingLine("MAPQ must be zero if RNAME is not specified");
|
|
}
|
|
if (!cigar.equals("*")) {
|
|
reportErrorParsingLine("CIGAR must be '*' if RNAME is not specified");
|
|
}
|
|
}
|
|
mCurrentRecord.setAlignmentStart(pos);
|
|
mCurrentRecord.setMappingQuality(mapq);
|
|
mCurrentRecord.setCigarString(cigar);
|
|
|
|
final String mateRName = mFields[MRNM_COL];
|
|
if (mateRName.equals("*")) {
|
|
if (mCurrentRecord.getReadPairedFlag() && !mCurrentRecord.getMateUnmappedFlag()) {
|
|
reportErrorParsingLine("MRNM not specified but flags indicate mate mapped");
|
|
}
|
|
}
|
|
else {
|
|
if (!mCurrentRecord.getReadPairedFlag()) {
|
|
reportErrorParsingLine("MRNM specified but flags indicate unpaired");
|
|
}
|
|
if (mCurrentRecord.getMateUnmappedFlag()) {
|
|
reportErrorParsingLine("MRNM specified but flags indicate mate unmapped");
|
|
}
|
|
|
|
validateReferenceName(mateRName, "MRNM");
|
|
if (mateRName.equals("=")) {
|
|
if (mCurrentRecord.getReferenceName() == null) {
|
|
reportErrorParsingLine("MRNM is '=', but RNAME is not set");
|
|
}
|
|
mCurrentRecord.setMateReferenceName(mCurrentRecord.getReferenceName());
|
|
} else {
|
|
mCurrentRecord.setMateReferenceName(mateRName);
|
|
}
|
|
}
|
|
|
|
final int matePos = parseInt(mFields[MPOS_COL], "MPOS");
|
|
final int isize = parseInt(mFields[ISIZE_COL], "ISIZE");
|
|
if (!mCurrentRecord.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) {
|
|
if (matePos == 0) {
|
|
reportErrorParsingLine("MPOS must be non-zero if MRNM is specified");
|
|
}
|
|
if (isize == 0 && mCurrentRecord.getReferenceName().equals(mCurrentRecord.getMateReferenceName())) {
|
|
reportErrorParsingLine("ISIZE must be non-zero if RNAME == MRNM");
|
|
}
|
|
} else {
|
|
if (matePos != 0) {
|
|
reportErrorParsingLine("MPOS must be zero if MRNM is not specified");
|
|
}
|
|
if (isize != 0) {
|
|
reportErrorParsingLine("ISIZE must be zero if MRNM is not specified");
|
|
}
|
|
}
|
|
mCurrentRecord.setMateAlignmentStart(matePos);
|
|
mCurrentRecord.setInferredInsertSize(isize);
|
|
if (!mFields[SEQ_COL].equals("*")) {
|
|
mCurrentRecord.setReadString(mFields[SEQ_COL]);
|
|
}
|
|
if (!mFields[QUAL_COL].equals("*")) {
|
|
if (mCurrentRecord.getReadString() == null) {
|
|
reportErrorParsingLine("QUAL should not be specified if SEQ is not specified");
|
|
}
|
|
if (mCurrentRecord.getReadString().length() != mFields[QUAL_COL].length()) {
|
|
reportErrorParsingLine("length(QUAL) != length(SEQ)");
|
|
}
|
|
mCurrentRecord.setBaseQualityString(mFields[QUAL_COL]);
|
|
}
|
|
|
|
for (int i = NUM_REQUIRED_FIELDS; i < numFields; ++i) {
|
|
parseTag(mFields[i]);
|
|
}
|
|
|
|
}
|
|
|
|
private void parseTag(final String tag) {
|
|
Map.Entry<String, Object> entry = null;
|
|
try {
|
|
entry = tagCodec.decode(tag);
|
|
} catch (SAMFormatException e) {
|
|
reportErrorParsingLine(e);
|
|
}
|
|
if (entry != null) {
|
|
mCurrentRecord.setAttribute(entry.getKey(), entry.getValue());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|