231 lines
8.1 KiB
Java
Executable File
231 lines
8.1 KiB
Java
Executable File
package org.broadinstitute.sting.playground.indels;
|
|
|
|
import java.io.File;
|
|
|
|
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
|
import net.sf.picard.reference.ReferenceSequenceFileWalker;
|
|
import net.sf.picard.cmdline.CommandLineProgram;
|
|
import net.sf.picard.cmdline.Option;
|
|
import net.sf.picard.cmdline.Usage;
|
|
import net.sf.samtools.Cigar;
|
|
import net.sf.samtools.CigarElement;
|
|
import net.sf.samtools.SAMFileReader;
|
|
import net.sf.samtools.SAMRecord;
|
|
|
|
public class ShowMSA extends CommandLineProgram {
|
|
|
|
// Usage and parameters
|
|
@Usage(programVersion="1.0") public String USAGE = "Prints MSA into stdout\n";
|
|
@Option(shortName="I", doc="SAM or BAM file with alignment data") public File INPUT_FILE;
|
|
@Option(shortName="L", doc="Contig:Start-Stop or Contig:poslocation of the window to draw") public String LOCATION;
|
|
@Option(shortName="W", doc="Number of bases on each side of specified position if LOCATION is in Contig:pos format; ignored otherwise", optional=true) public Integer WINDOW;
|
|
@Option(shortName="R", doc="Reference fastb file") public File REF_FILE;
|
|
@Option(shortName="P", doc="If true, then any read (partially) overlapping with the specified region will be shown. "+
|
|
"Otherwise (default), only reads fully contained in the specified interval are shown", optional=true) public Boolean PARTIAL;
|
|
@Option(doc="Error counting mode: MM - count mismatches only, ERR - count errors (arachne style), MG - count mismatches and gaps as one error each") public String ERR_MODE;
|
|
@Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS;
|
|
@Option(shortName="F",doc="Format: PILE - show alignment, FASTA - print sequences in fasta",optional=true) public String OUT_FORMAT;
|
|
|
|
/** Required main method implementation. */
|
|
public static void main(final String[] argv) {
|
|
System.exit(new ShowMSA().instanceMain(argv));
|
|
}
|
|
|
|
protected int doWork() {
|
|
|
|
if ( ! ERR_MODE.equals("MM") && ! ERR_MODE.equals("MG") && ! ERR_MODE.equals("ERR") ) {
|
|
System.out.println("Unknown value specified for ERR_MODE");
|
|
return 1;
|
|
}
|
|
|
|
if ( PARTIAL == null ) PARTIAL = new Boolean(false);
|
|
if ( OUT_FORMAT == null ) OUT_FORMAT=new String("PILE");
|
|
|
|
if ( ! OUT_FORMAT.equals("PILE") && ! OUT_FORMAT.equals("FASTA")) {
|
|
System.out.println("OUT_FORMAT can only have values PILE or FASTA");
|
|
return 1;
|
|
}
|
|
|
|
if ( ! INPUT_FILE.exists() ) {
|
|
System.out.println("Specified INPUT_FILE does not exist");
|
|
return 1;
|
|
}
|
|
|
|
if ( ! REF_FILE.exists() ) {
|
|
System.out.println("Specified REF_FILE does not exist");
|
|
return 1;
|
|
}
|
|
|
|
if ( LOCATION.indexOf(':') == -1 ) {
|
|
System.out.println("LOCATION should follow Contig:Start-Stop or Contig:Pos format");
|
|
return 1;
|
|
}
|
|
String[] s1 = LOCATION.split(":");
|
|
int contig;
|
|
try {
|
|
contig = Integer.valueOf(s1[0]);
|
|
} catch (NumberFormatException e) {
|
|
System.out.println("LOCATION: contig must be specified as an integer");
|
|
return 1;
|
|
}
|
|
|
|
if ( s1.length != 2 ) {
|
|
System.out.println("LOCATION should follow Contig:Start-Stop or Contig:Pos format");
|
|
return 1;
|
|
}
|
|
|
|
String s2[] = s1[1].split("-");
|
|
if ( s2.length > 2 ) {
|
|
System.out.println("LOCATION should follow Contig:Start-Stop or Contig:Pos format");
|
|
return 1;
|
|
}
|
|
int left, right;
|
|
if ( s2.length == 2 ) {
|
|
try {
|
|
left = Integer.valueOf(s2[0]);
|
|
right = Integer.valueOf(s2[1]);
|
|
} catch (NumberFormatException e) {
|
|
System.out.println("LOCATION: window boundaries should be specified as integers");
|
|
return 1;
|
|
}
|
|
} else {
|
|
int pos = 0;
|
|
try {
|
|
pos = Integer.valueOf(s2[0]);
|
|
} catch (NumberFormatException e) {
|
|
System.out.println("LOCATION: position on the contig should be specified as an integer");
|
|
return 1;
|
|
}
|
|
if (WINDOW == null ) {
|
|
System.out.println("WINDOW must be specified when LOCATION specifies a single poisiton (Contig:Pos)");
|
|
return 1;
|
|
}
|
|
left = pos - WINDOW.intValue();
|
|
right = pos+WINDOW.intValue();
|
|
}
|
|
|
|
|
|
String ref_contig ;
|
|
|
|
try {
|
|
ReferenceSequenceFileWalker mRefReader =
|
|
new ReferenceSequenceFileWalker(ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE));
|
|
ref_contig = mRefReader.get(contig).toString(); // reload ref
|
|
} catch (Exception e) {
|
|
System.out.println("Failed to read reference sequence from " + REF_FILE);
|
|
return 1;
|
|
}
|
|
|
|
SAMFileReader reader ;
|
|
try {
|
|
reader = new SAMFileReader(INPUT_FILE);
|
|
} catch ( Exception e) {
|
|
System.out.println(e.getMessage());
|
|
return 1;
|
|
}
|
|
|
|
SequencePile msa=null;
|
|
|
|
if ( OUT_FORMAT.equals("PILE")) {
|
|
msa = new SequencePile(ref_contig.substring(left-1, right));
|
|
} else {
|
|
System.out.println(">reference "+contig+":"+left+"-"+right);
|
|
System.out.println(ref_contig.substring(left-1, right));
|
|
}
|
|
|
|
for( SAMRecord r : reader ) {
|
|
if ( r.getReadUnmappedFlag() ) continue;
|
|
if ( r.getReferenceIndex() < contig ) continue;
|
|
if ( r.getReferenceIndex() > contig ) break;
|
|
if ( r.getAlignmentEnd() < left ) continue;
|
|
if ( r.getAlignmentStart() >= right ) break;
|
|
if ( ! PARTIAL && ( r.getAlignmentStart() < left || r.getAlignmentEnd() >= right ) ) continue;
|
|
|
|
int err = -1;
|
|
if ( ERR_MODE.equals("MM")) err = numMismatches(r);
|
|
else if ( ERR_MODE.equals("ERR")) err = numErrors(r);
|
|
else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r);
|
|
if ( err > MAX_ERRS ) continue;
|
|
|
|
if ( OUT_FORMAT.equals("PILE") ) {
|
|
msa.addAlignedSequence(r.getReadString(), r.getReadNegativeStrandFlag(), r.getCigar(), r.getAlignmentStart() - left);
|
|
} else {
|
|
System.out.print(">read "+r.getReadName());
|
|
if ( r.getReadNegativeStrandFlag() ) System.out.println("(rc)");
|
|
else System.out.println("(fw)");
|
|
System.out.println(r.getReadString());
|
|
}
|
|
}
|
|
|
|
if ( OUT_FORMAT.equals("PILE") ) msa.colorprint();
|
|
//// System.out.println(msa.format());
|
|
|
|
return 0;
|
|
}
|
|
|
|
/** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD
|
|
*
|
|
* @param r SAM record that must specify an alignment
|
|
* @return number of errors (number of mismatches plus total length of all insertions/deletions
|
|
* @throws RuntimeException if cigar contains any elements other than M,I,D
|
|
*/
|
|
private static int numErrors(SAMRecord r) throws RuntimeException {
|
|
|
|
// NM currently stores the total number of mismatches in all blocks + 1
|
|
int errs = numMismatches(r);
|
|
|
|
// now we have to add the total length of all indels:
|
|
Cigar c = r.getCigar();
|
|
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
|
|
CigarElement ce = c.getCigarElement(i);
|
|
switch( ce.getOperator()) {
|
|
case M : break; // we already have correct number of mismatches
|
|
case I :
|
|
case D :
|
|
errs += ce.getLength();
|
|
break;
|
|
default: throw new RuntimeException("Unrecognized cigar element");
|
|
}
|
|
}
|
|
return errs;
|
|
}
|
|
|
|
/** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD
|
|
*
|
|
* @param r SAM record that must specify an alignment
|
|
* @return number of errors (number of mismatches plus total number of all insertions/deletions (each insertion or
|
|
* deletion will be counted as a single error regardless of the length)
|
|
* @throws RuntimeException if cigar contains any elements other than M,I,D
|
|
*/
|
|
private static int numMismatchesGaps(SAMRecord r) throws RuntimeException {
|
|
|
|
// NM currently stores the total number of mismatches in all blocks + 1
|
|
int errs = numMismatches(r);
|
|
|
|
// now we have to add the total length of all indels:
|
|
Cigar c = r.getCigar();
|
|
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
|
|
CigarElement ce = c.getCigarElement(i);
|
|
switch( ce.getOperator()) {
|
|
case M : break; // we already have correct number of mismatches
|
|
case I :
|
|
case D :
|
|
errs++;
|
|
break;
|
|
default: throw new RuntimeException("Unrecognized cigar element");
|
|
}
|
|
}
|
|
return errs;
|
|
}
|
|
|
|
|
|
/** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD */
|
|
private static int numMismatches(SAMRecord r) throws RuntimeException {
|
|
|
|
// NM currently stores the total number of mismatches in all blocks + 1
|
|
return ((Integer)r.getAttribute("NM")).intValue() - 1;
|
|
|
|
}
|
|
|
|
}
|