349 lines
15 KiB
Java
349 lines
15 KiB
Java
/*
|
|
* The Broad Institute
|
|
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
|
* This software and its documentation are copyright 2008 by the
|
|
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
|
*
|
|
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
|
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
|
*/
|
|
package edu.mit.broad.picard.illumina;
|
|
|
|
import java.io.File;
|
|
import java.util.ArrayList;
|
|
import java.util.Date;
|
|
import java.util.List;
|
|
import java.util.Iterator;
|
|
|
|
import edu.mit.broad.picard.util.*;
|
|
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
|
import edu.mit.broad.picard.cmdline.Usage;
|
|
import edu.mit.broad.picard.cmdline.Option;
|
|
import edu.mit.broad.picard.cmdline.CommandLineParser;
|
|
import edu.mit.broad.sam.SAMFileHeader;
|
|
import edu.mit.broad.sam.SAMFileReader;
|
|
import edu.mit.broad.sam.SAMFileWriter;
|
|
import edu.mit.broad.sam.SAMFileWriterFactory;
|
|
import edu.mit.broad.sam.SAMProgramRecord;
|
|
import edu.mit.broad.sam.SAMReadGroupRecord;
|
|
import edu.mit.broad.sam.SAMRecord;
|
|
|
|
/**
|
|
* Read alignments for a lane (paired or unpaired) from Gerald directory and write to SAM file.
|
|
*/
|
|
public class GeraldToSam extends CommandLineProgram {
|
|
|
|
// These are all written to the SAM header
|
|
private static final String DEFAULT_CN = "broad";
|
|
private static final String DEFAULT_PL = "illumina";
|
|
private static final String PROGRAM_VERSION = "1.0";
|
|
private static final String READ_GROUP_ID = "0";
|
|
private static final String PROGRAM_RECORD_ID = "0";
|
|
private static final String UNKNOWN_SAMPLE = "N/A";
|
|
|
|
private static final Log log = Log.getInstance(GeraldToSam.class);
|
|
|
|
// The following attributes define the command-line arguments
|
|
@Usage(programVersion=PROGRAM_VERSION)
|
|
public String USAGE =
|
|
getStandardUsagePreamble() +
|
|
"Read Gerald alignments for the given lane, and write in SAM format, coordinate sorted.\n";
|
|
|
|
@Option(shortName = "G", doc = "Location of Gerald files.")
|
|
public File GERALD_DIR;
|
|
|
|
@Option(shortName = "L")
|
|
public Integer LANE;
|
|
|
|
@Option(shortName = "M", doc = "Translates from Gerald alignment coordinates to genomic coordinates.")
|
|
public File SQUASHED_MAP;
|
|
|
|
@Option(shortName = "D", doc = "Input SAM or BAM file defining the names, sizes and order of the reference contig, " +
|
|
"and other reference metadata.")
|
|
public File SEQUENCE_DICT;
|
|
|
|
@Option(shortName = "O", doc = "SAM or BAM file to be written (file extension determines format).")
|
|
public File OUTPUT;
|
|
|
|
@Option(doc = "Populates SM field of read group. Use pool name when a pool is being sequenced. " +
|
|
"If any other read group fields are specified, then this is required.")
|
|
public String SAMPLE = UNKNOWN_SAMPLE;
|
|
|
|
@Option(doc = "Populates LB field of read group.")
|
|
public String LIBRARY;
|
|
|
|
@Option(doc = "Populates DS field of read group.", optional = true)
|
|
public String DESCRIPTION;
|
|
|
|
@Option(doc = "Flowcell.lane. Populates PU field of read group.")
|
|
public String RUN;
|
|
|
|
@Option(doc = "Predicted median insert size (may be different from the actual median insert size. " +
|
|
"Populates the PI field of read group.", optional = true)
|
|
public Integer PI;
|
|
|
|
@Option(doc = "Sequencing center that produced the reads. Populates CN field of read group.")
|
|
public String CN = DEFAULT_CN;
|
|
|
|
@Option(doc = "Date the run was produced. Populates the DT field of read group.")
|
|
public Date RUN_DATE;
|
|
|
|
@Option(doc = "Platform/technology used to produce the reads. Populates the PL field of read group")
|
|
public String PL = DEFAULT_PL;
|
|
|
|
@Option(shortName = "JUMPING", doc = "True if this is a jumping library")
|
|
public Boolean JUMPING_LIBRARY = Boolean.FALSE;
|
|
|
|
@Option(doc = "String to put in the PG:CL header field. If not present, the GeraldToSam command line is put there",
|
|
optional = true)
|
|
public String ALIGNMENT_COMMAND;
|
|
|
|
@Option(doc = "Write no more than this number of alignment records. Default: Write all the alignment records",
|
|
optional = true)
|
|
public Integer MAX_ALIGNMENTS;
|
|
|
|
private SAMFileWriter writer;
|
|
SAMFileHeader header;
|
|
private boolean paired;
|
|
|
|
|
|
public static void main(final String[] argv) {
|
|
System.exit(new GeraldToSam().instanceMain(argv));
|
|
}
|
|
|
|
@Override
|
|
public int doWork() {
|
|
makeHeader(clp.getArgv());
|
|
writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, OUTPUT);
|
|
writeAlignments();
|
|
writer.close();
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* If any of the read group options are specified on the command line, then SAMPLE must be specified.
|
|
* This is currently not doing anything because SAMPLE has a non-null default value.
|
|
* @return false if there is a problem with the command line
|
|
*/
|
|
@Override
|
|
protected boolean customCommandLineValidation() {
|
|
if (SAMPLE == null &&
|
|
(LIBRARY != null || DESCRIPTION != null || RUN != null || PI != null || !CN.equals(DEFAULT_CN)
|
|
|| RUN_DATE != null || !PL.equals(DEFAULT_PL)
|
|
)) {
|
|
System.err.println("SAMPLE must be specified if any read group options are used.");
|
|
clp.usage(System.err);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Create the SAMFileHeader given the cmd-line args
|
|
* @param argv
|
|
*/
|
|
private void makeHeader(final String[] argv) {
|
|
header = new SAMFileHeader();
|
|
header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
|
|
final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_ID);
|
|
programRecord.setProgramVersion(PROGRAM_VERSION);
|
|
String commandLine = ALIGNMENT_COMMAND;
|
|
if (commandLine == null) {
|
|
commandLine = StringUtil.join(" ", argv);
|
|
}
|
|
programRecord.setCommandLine(commandLine);
|
|
header.addProgramRecord(programRecord);
|
|
|
|
final SAMFileReader sequenceDictionary = new SAMFileReader(SEQUENCE_DICT);
|
|
final SAMFileHeader sequenceDictionaryHeader = sequenceDictionary.getFileHeader();
|
|
header.setSequences(sequenceDictionaryHeader.getSequences());
|
|
|
|
if (SAMPLE != null) {
|
|
final SAMReadGroupRecord readGroup = new SAMReadGroupRecord(READ_GROUP_ID);
|
|
final List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>();
|
|
readGroups.add(readGroup);
|
|
readGroup.setSample(SAMPLE);
|
|
if (LIBRARY != null) {
|
|
readGroup.setLibrary(LIBRARY);
|
|
}
|
|
setRGAttributeIfNotNull(readGroup, DESCRIPTION, "DS");
|
|
setRGAttributeIfNotNull(readGroup, RUN, "PU");
|
|
setRGAttributeIfNotNull(readGroup, PI, SAMReadGroupRecord.PREDICTED_MEDIAN_INSERT_SIZE_TAG);
|
|
setRGAttributeIfNotNull(readGroup, CN, "CN");
|
|
setRGAttributeIfNotNull(readGroup, RUN_DATE, SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG);
|
|
setRGAttributeIfNotNull(readGroup, PL, "PL");
|
|
header.setReadGroups(readGroups);
|
|
}
|
|
}
|
|
|
|
private void setRGAttributeIfNotNull(final SAMReadGroupRecord readGroup, final Object value, final String key) {
|
|
if (value == null) {
|
|
return;
|
|
}
|
|
readGroup.setAttribute(key, value);
|
|
}
|
|
|
|
/**
|
|
* Iterate through the Gerald output and write alignments. eland_extended.txt and export.txt are
|
|
* iterated together using PasteParser. If paired end lane, then two PasteParsers are iterated in tandem,
|
|
* so that mate info is available when a SAMRecord is created.
|
|
*/
|
|
private void writeAlignments() {
|
|
final GeraldParserFactory geraldParserFactory = new GeraldParserFactory(GERALD_DIR, LANE, SQUASHED_MAP);
|
|
paired = geraldParserFactory.isPairedRun();
|
|
final GeraldParser firstEndIterator = geraldParserFactory.makeParser(paired ? 1: null);
|
|
GeraldParser secondEndIterator = null;
|
|
if (paired) {
|
|
secondEndIterator = geraldParserFactory.makeParser(2);
|
|
}
|
|
int numAlignmentsOrPairsWritten = 0;
|
|
while (firstEndIterator.hasNext()) {
|
|
final GeraldParser.GeraldAlignment firstEnd = firstEndIterator.next();
|
|
GeraldParser.GeraldAlignment secondEnd = null;
|
|
if (paired) {
|
|
hasNextAssert(secondEndIterator);
|
|
secondEnd = secondEndIterator.next();
|
|
}
|
|
final SAMRecord firstEndAlignment = createSAMRecordFromGerald(firstEnd);
|
|
SAMRecord secondEndAlignment = null;
|
|
if (paired) {
|
|
secondEndAlignment = createSAMRecordFromGerald(secondEnd);
|
|
setMateInfo(secondEndAlignment, firstEnd);
|
|
setMateInfo(firstEndAlignment, secondEnd);
|
|
secondEndAlignment.setSecondOfPairFlag(true);
|
|
firstEndAlignment.setFirstOfPairFlag(true);
|
|
final boolean properPair = SamPairUtil.isProperPair(firstEndAlignment, secondEndAlignment, JUMPING_LIBRARY);
|
|
firstEndAlignment.setProperPairFlag(properPair);
|
|
secondEndAlignment.setProperPairFlag(properPair);
|
|
int insertSize = SamPairUtil.computeInsertSize(firstEndAlignment, secondEndAlignment);
|
|
firstEndAlignment.setInferredInsertSize(insertSize);
|
|
secondEndAlignment.setInferredInsertSize(-insertSize);
|
|
}
|
|
|
|
writer.addAlignment(firstEndAlignment);
|
|
if (secondEndAlignment != null) {
|
|
writer.addAlignment(secondEndAlignment);
|
|
}
|
|
++numAlignmentsOrPairsWritten;
|
|
if (MAX_ALIGNMENTS != null && numAlignmentsOrPairsWritten >= MAX_ALIGNMENTS) {
|
|
break;
|
|
}
|
|
if (numAlignmentsOrPairsWritten % 500000 == 0) {
|
|
log.info("Loaded " + numAlignmentsOrPairsWritten + " reads");
|
|
}
|
|
}
|
|
if (MAX_ALIGNMENTS == null) {
|
|
noMoreAssert(firstEndIterator);
|
|
if (paired) {
|
|
noMoreAssert(secondEndIterator);
|
|
}
|
|
}
|
|
log.info("Done loading " + numAlignmentsOrPairsWritten + " reads");
|
|
}
|
|
|
|
/**
|
|
* Write into the samRecord the mate info from the mate gerald alignment
|
|
*/
|
|
private void setMateInfo(final SAMRecord samRecord, final GeraldParser.GeraldAlignment mateGeraldAlignment) {
|
|
final boolean isMapped = mateGeraldAlignment.getPrimaryChrom() != null;
|
|
if (isMapped) {
|
|
samRecord.setMateReferenceName(mateGeraldAlignment.getPrimaryChrom());
|
|
samRecord.setMateAlignmentStart((int)mateGeraldAlignment.getPrimaryStart());
|
|
samRecord.setMateNegativeStrandFlag(isNegativeStrand(mateGeraldAlignment));
|
|
} else {
|
|
samRecord.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
|
|
samRecord.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
|
|
samRecord.setMateUnmappedFlag(true);
|
|
}
|
|
}
|
|
|
|
private boolean isNegativeStrand(final GeraldParser.GeraldAlignment alignment) {
|
|
final String orientation = alignment.getOrientation();
|
|
if (orientation.equals("F")) {
|
|
return false;
|
|
} else if (orientation.equals("R")) {
|
|
return true;
|
|
} else {
|
|
throw new RuntimeException("Strange orientation in eland_extended file");
|
|
}
|
|
}
|
|
|
|
private SAMRecord createSAMRecordFromGerald(final GeraldParser.GeraldAlignment alignment) {
|
|
final SAMRecord samRecord = new SAMRecord();
|
|
// Consider an alignment with a negative start (i.e. that hangs off the beginning of the contig)
|
|
// to be unmapped.
|
|
final boolean isMapped = alignment.getPrimaryChrom() != null && alignment.getPrimaryStart() >= 0;
|
|
|
|
String readName = alignment.getReadName();
|
|
if (readName.endsWith("/1") || readName.endsWith("/2")) {
|
|
readName = readName.substring(0, readName.length() - 2);
|
|
}
|
|
samRecord.setReadName(readName);
|
|
|
|
// Set all the flags
|
|
samRecord.setReadPairedFlag(paired);
|
|
samRecord.setReadUmappedFlag(!isMapped);
|
|
if (isMapped) {
|
|
samRecord.setReadNegativeStrandFlag(isNegativeStrand(alignment));
|
|
}
|
|
// For now we are only taking the primary alignment
|
|
samRecord.setNotPrimaryAlignmentFlag(false);
|
|
String readBases = alignment.getReadBases();
|
|
if (samRecord.getReadNegativeStrandFlag()) {
|
|
readBases = SequenceUtil.reverseComplement(readBases);
|
|
}
|
|
samRecord.setReadString(readBases);
|
|
final byte[] phredQualities = alignment.getPhredQualities();
|
|
if (isMapped && samRecord.getReadNegativeStrandFlag()) {
|
|
ArrayUtil.reverseArray(phredQualities);
|
|
}
|
|
samRecord.setBaseQualities(phredQualities);
|
|
if (isMapped) {
|
|
/*
|
|
if ("23".equals(geraldReferenceName)) {
|
|
geraldReferenceName = "X";
|
|
} else if ("24".equals(geraldReferenceName)) {
|
|
geraldReferenceName = "Y";
|
|
}
|
|
return REFERENCE_PREFIX + geraldReferenceName;
|
|
*/
|
|
samRecord.setReferenceName(alignment.getPrimaryChrom());
|
|
samRecord.setAlignmentStart((int)alignment.getPrimaryStart());
|
|
samRecord.setMappingQuality(SAMRecord.UNKNOWN_MAPPING_QUALITY);
|
|
// CIGAR is trivial because there are no indels or clipping in Gerald
|
|
final String cigar = Integer.toString(alignment.getReadLength()) + "M";
|
|
samRecord.setCigarString(cigar);
|
|
// We've decided not to bother with this, and just load the reference
|
|
// if we want to determine mismatches.
|
|
// samRecord.setAttribute("MD", alignment.getMismatchString());
|
|
} else {
|
|
samRecord.setReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
|
|
samRecord.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
|
|
samRecord.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY);
|
|
samRecord.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR);
|
|
}
|
|
|
|
if (SAMPLE != null) {
|
|
// There is a read group (id = READ_GROUP_ID)
|
|
samRecord.setAttribute("RG", READ_GROUP_ID);
|
|
}
|
|
|
|
samRecord.setAttribute("PG", PROGRAM_RECORD_ID);
|
|
return samRecord;
|
|
}
|
|
|
|
private void hasNextAssert(final Iterator iterator) {
|
|
if (!iterator.hasNext()) {
|
|
throw new RuntimeException("gerald output file ends unexpectedly.");
|
|
|
|
}
|
|
}
|
|
|
|
private void noMoreAssert(final Iterator iterator) {
|
|
if (iterator.hasNext()) {
|
|
throw new RuntimeException("gerald output file has more lines than expected.");
|
|
}
|
|
}
|
|
|
|
}
|