A standalone picard-level tool for breaking individual reads into "pairs" of first/last N bases. Supports:

* splitting off only start or end of the read, or both; the output will contain chopped sequences AND corresponding base qualities * splitting arbitrary number of bases off each end (different numbers for left and right segments can be specified; segments can overlap) * splitting only unmapped reads, ignoring mapped ones * writing splitted ends into separate sam/bam files, or into a single output file * decorating original read names with user-specified suffixes for each end (e.g. _1 and _2 for left and right parts of the read); default: no decoration, original read names are used * when mapped reads are split, the alignment cigars are chopped appropriately and the alignment start positions are adjusted (for the right end) to correctly specify the alignment of the selected part of the read git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1402 348d0f76-0448-11de-a6fe-93d51630548a
2009-08-10 20:42:49 +00:00 · 2009-08-10 20:42:49 +00:00 · 3208eaabcc
parent 36312ae4b2
commit 3208eaabcc
1 changed files with 202 additions and 0 deletions
--- a/java/src/org/broadinstitute/sting/playground/tools/SplitReads.java
+++ b/java/src/org/broadinstitute/sting/playground/tools/SplitReads.java
@ -0,0 +1,202 @@
+package org.broadinstitute.sting.playground.tools;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.io.File;
+
+import net.sf.samtools.*;
+
+import net.sf.picard.cmdline.CommandLineProgram;
+import net.sf.picard.cmdline.Usage;
+import net.sf.picard.cmdline.Option;
+
+public class SplitReads extends CommandLineProgram {
+    @Usage(programVersion="1.0") public String USAGE = "Splits reads: extracts sub-sequences of the specified length(s) from left "+
+            "and/or right ends of all the reads into the specified output bam file(s). For the reads in the input that are mapped, "+
+            "the subsequences in the output bam(s) will have appropriately adjusted alignment positions and chopped cigars.";
+    @Option(shortName="I",
+            doc="Input file (bam or sam) with read sequences to split.",
+            optional=false)
+    public File IN = null;
+    @Option(shortName="E", doc="Read end to select, 1=left, 2=right; default: select both ends.",
+    		optional=true) public List<Integer> READ_ENDS = new ArrayList<Integer>();
+    @Option(shortName="N", doc="Number of bases to keep in the corresponding segment of the read. "+
+    			"Synchronized with READ_ENDS argument; if single number is given, all selected segments (ends) will have specified length.",
+    		optional=false) public List<Integer> LENGTH = new ArrayList<Integer>();
+    @Option(shortName="S", doc="Read name for each segment (read end) will be set as original read name followed by the corresponding suffix." +
+    			"Synchronized with READ_ENDS argument and must have the same number of entries if specified (note that default READ_ENDS is a list of (1,2). "+
+    			"By default, suffixes are empty strings, i.e. all segments have the same name(s) as the original read." , optional=true) public List<String> SUFFIXES = new ArrayList<String>();
+    @Option(shortName="O",optional=false, doc="Each read end will be sent into the corresponding file " +
+    		"(synchronized with READ_ENDS). If only one file name is specified, all read segments will be printed into that file."
+    		) public List<File> OUTPUT_BAMS = new ArrayList<File>();
+    @Option(shortName="U", doc="Split and output only unmapped reads; mapped reads will be ignored.",
+    		optional=true) public boolean UNMAPPED = false;
+
+
+    /** Required main method implementation. */
+    public static void main(final String[] argv) {
+        System.exit(new SplitReads().instanceMain(argv));
+    }
+
+    protected int doWork() {
+
+    	// if read ends are not specified explicitly on the cmd line, set default 1,2 (both ends)
+    	if ( READ_ENDS.size() == 0 ) {
+    		READ_ENDS.add(1);
+    		READ_ENDS.add(2);
+    	}
+
+    	for ( Integer i : READ_ENDS) {
+    		if ( ! i.equals(1) && ! i.equals(2)) throw new RuntimeException("Unknown value specified for READ_ENDS: "+i);
+    	}
+    	
+    	// if suffixes are not specified, set them to "", ""
+    	if ( SUFFIXES.size() == 0 ) {
+    		for ( Integer i : READ_ENDS) {
+    			SUFFIXES.add( "" );
+    		}
+    	} else {
+    		// or make sure that the number of suffixes matches the number of ends
+    		if ( SUFFIXES.size() != READ_ENDS.size() ) throw new RuntimeException("Number of suffixes specified must be equal to the number of read ends requested."+
+    				"Passed: "+ READ_ENDS.size() +" READ_ENDS and " + SUFFIXES.size() + " SUFFIXES arguments.");
+    	}
+    	
+    	if ( LENGTH.size() == 1 ) {
+    		// if only one length is specified, apply it to all ends:
+    		LENGTH = Collections.nCopies(READ_ENDS.size(), LENGTH.get(0));
+    	}
+    	
+    	if ( LENGTH.size() != READ_ENDS.size() ) throw new RuntimeException("Number of lengths specified must be equal to the number of read ends requested."+
+				"Passed: "+ READ_ENDS.size() +" READ_ENDS and " + LENGTH.size() + " LENGTH arguments.");
+    	
+    	if ( READ_ENDS.size() != OUTPUT_BAMS.size() && OUTPUT_BAMS.size() != 1 )
+            throw new RuntimeException("Number of output files must be either one, or equal to the number of read ends requested."+
+				"Passed: "+ READ_ENDS.size() +" READ_ENDS and " + OUTPUT_BAMS.size() + " OUTPUT_BAMS arguments.");
+
+        SAMFileReader inReader = new SAMFileReader(IN);
+
+        List<SAMFileWriter> outWriters = new ArrayList<SAMFileWriter>(OUTPUT_BAMS.size());
+        for ( File outName : OUTPUT_BAMS ) {
+            outWriters.add(new SAMFileWriterFactory().makeSAMOrBAMWriter(inReader.getFileHeader(), true, outName)) ;
+        }
+
+
+        for ( SAMRecord read : inReader ) {
+
+            if ( UNMAPPED && ! read.getReadUnmappedFlag() ) continue;
+            
+            for ( int i = 0 ; i < READ_ENDS.size(); i++ ) {
+
+                SAMRecord newRecord = null;
+                try {
+                    newRecord = (SAMRecord)read.clone();
+                } catch (CloneNotSupportedException e) {
+                    throw new RuntimeException("Clone not supported by SAMRecord implementation");
+                }
+
+                final int whichEnd = READ_ENDS.get(i);
+                final int length = LENGTH.get(i);
+                String name = read.getReadName();
+                if ( length > read.getReadLength() ) throw new RuntimeException("Read "+name+" is shorter than the specified length ("+read.getReadLength()+"<"+length+")");
+                int start = 0 , stop = 0; // [start, stop) : segment of the read to be selected; coordinates are wrt read sequence; half-open 0 based
+                switch ( whichEnd ) {
+                case 1: start = 0 ; stop = start + LENGTH.get(i); break;
+                case 2: stop = read.getReadLength() ; start = stop - LENGTH.get(i); break;
+                }
+
+                newRecord.setReadBases(Arrays.copyOfRange(read.getReadBases(),start,stop));
+                newRecord.setBaseQualities(Arrays.copyOfRange(read.getBaseQualities(), start, stop));
+                newRecord.setReadName(name+ SUFFIXES.get(i));
+                if ( read.getReadUnmappedFlag() ) {
+                    //newRecord.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
+                } else {
+                    newRecord.setAlignmentStart(read.getAlignmentStart()+start);
+                    newRecord.setCigar( chopCigar(read.getCigar(), start, length ));
+                }
+
+                if ( outWriters.size() > 1 ) outWriters.get(i).addAlignment(newRecord);
+                else outWriters.get(0).addAlignment(newRecord);
+            }
+
+        }
+
+        for ( SAMFileWriter w : outWriters ) w.close();
+
+        return 0;
+    }
+    
+
+	/**
+	 * Returns new cigar representing segment of the alignment that starts at position <code>start</code> (0-based)
+	 * with respect to the start of the original cigar and covers <code>length</code> bases on the original read the
+	 * <code>origCigar</code> corresponds to (i.e. I elements count, but D do not).
+	 * @param origCigar
+	 * @param start
+	 * @param length
+	 * @return
+	 */
+	private Cigar chopCigar( Cigar origCigar, int start, int length ) {
+
+		int elementEnd = 0; // next base after the end of the current cigar element on the read
+		
+		Cigar newCigar = new Cigar();
+		
+		Iterator<CigarElement> elements = origCigar.getCigarElements().iterator();
+
+        if ( ! elements.hasNext() ) System.out.println("CIGAR HAS NO ELEMENTS!");
+
+		CigarElement ce = null;
+		
+		while ( elementEnd <= start ) { // if we did not reach the start of selected segment yet:
+//            System.out.println("INIT: start="+start+"; length="+length+"; elementEnd="+elementEnd);
+			ce = elements.next();
+			switch ( ce.getOperator() ) {
+			case N:  //
+			case D : // read misses bases wrt the ref, nothing to count on the read
+				break;
+			case I:
+			case M:
+			case S:
+			case H: // all these elements are real bases on the read. Skip them completely if 
+				    // 'start' is past them, or crop if it is inside:
+				elementEnd += ce.getLength(); // 1 base past end of the current element on the read
+
+			}
+		}
+		// at this point we are guaranteed that ce is the element that contains 'start' position;
+		// now we start adding cigar elements:
+
+		// add manually first element, since we need only a part of it after 'start':
+		newCigar.add( new CigarElement(Math.min(elementEnd-start, length), ce.getOperator()) );
+
+		int selectionEnd = start + length;
+//		    System.out.println(origCigar.toString()+": start="+start+"; length="+length+"; selectionEnd="+selectionEnd+"; elementEnd="+elementEnd);
+		while ( elementEnd < selectionEnd ) {
+			ce = elements.next();
+			switch ( ce.getOperator() ) {
+			case N:  //
+			case D : // read misses bases wrt the ref, nothing to count on the read, but the element has to be added:
+				newCigar.add( new CigarElement(ce.getLength(), ce.getOperator()) );				
+				break;
+			case I:
+			case M:
+			case S:
+			case H: // all these elements are real bases on the read. Add them and count them 
+				    // making sure that the last element gets cropped if needed:
+				elementEnd += ce.getLength(); // 1 base past end of the current element on the read
+				if ( elementEnd > selectionEnd ) { // this is the last element we have to consider and it needs to be cropped:
+					newCigar.add( new CigarElement(ce.getLength() -  elementEnd + selectionEnd , ce.getOperator()) );									
+				} else {
+					newCigar.add( new CigarElement(ce.getLength(), ce.getOperator()) );									
+				}
+			}
+			
+		}
+		return newCigar;
+		
+	}
+	
+}