diff --git a/java/src/org/broadinstitute/sting/playground/tools/PairMaker.java b/java/src/org/broadinstitute/sting/playground/tools/PairMaker.java
new file mode 100644
index 000000000..59518276e
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/playground/tools/PairMaker.java
@@ -0,0 +1,512 @@
+package org.broadinstitute.sting.playground.tools;
+
+import net.sf.picard.cmdline.CommandLineProgram;
+import net.sf.picard.cmdline.Usage;
+import net.sf.picard.cmdline.Option;
+import net.sf.samtools.*;
+
+import java.io.File;
+import java.util.List;
+import java.util.ArrayList;
+
+import org.broadinstitute.sting.utils.Pair;
+import org.broadinstitute.sting.utils.AlignmentUtils;
+import org.broadinstitute.sting.playground.utils.ParallelSAMIterator;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: asivache
+ * Date: Aug 27, 2009
+ * Time: 3:53:48 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public class PairMaker extends CommandLineProgram {
+ @Usage(programVersion="1.0") public String USAGE = "Reconstitutes mate pairs from alignments"+
+ " for individual fragment ends. Individual end alignment files are expected to be sorted by"+
+ " read name. Multiple alignments are allowed and in this case the single best pairing will be selected.";
+ @Option(shortName="I1",
+ doc="Input file (bam or sam) with alignments for end 1 (first mate in a pair).",
+ optional=false)
+ public File IN1 = null;
+ @Option(shortName="I2",
+ doc="Input file (bam or sam) with alignments for end 2 (second mate in a pair).",
+ optional=false)
+ public File IN2 = null;
+ @Option(shortName="O",optional=false, doc="Output file to write found/selected pairs into.")
+ public File OUTPUT = null;
+ @Option(shortName="F",doc="Turns on a 'filter' mode: only records/pairs passing the filter will be written "+
+ "into the output file. Filter condition is a logical combination (using parentheses for grouping, "+
+ "& for AND, | for OR, = for specifying values) of the primitives listed below. Primitives that end with "+
+ " 1 or 2 apply specifically to pair end 1 or 2, respectively; when, in addition to primitives
1 and
2, " +
+ " a primitive
is also defined, it is always interpreted as
1 &
2. Primitives: PAIRSONLY "+
+ "(print only pairs=both ends mapped), MINQ1=, MINQ2=, MINQ= (minimum alignment quality if read is mapped) "+
+ " for MINQ1= & MINQ2=), ERRDIFF1=, ERRDIFF2= " +
+ "(when next-best alignment(s) are available",
+ optional = true)
+ public String FILTER = null;
+ @Option(shortName="Q", optional=true, doc="Minimum mapping quality required on both ends in order to accept the pair.")
+ public Integer MINQ = 20;
+
+ public static int INFINITY = 1000000000;
+
+ // we will collect some stats along the way:
+ private int fragments_seen = 0;
+ private int end1_missing = 0;
+ private int end2_missing = 0;
+ private int end1_unmapped = 0;
+ private int end2_unmapped = 0;
+ private int both_unmapped = 0;
+ private int both_mapped = 0;
+ private int both_unique = 0;
+ private int proper_pair = 0;
+
+ /** Required main method implementation. */
+ public static void main(final String[] argv) {
+ System.exit(new PairMaker().instanceMain(argv));
+ }
+
+ protected int doWork() {
+
+ SAMFileReader end1Reader = new SAMFileReader(IN1);
+ SAMFileReader end2Reader = new SAMFileReader(IN2);
+
+ SAMFileHeader h = checkHeaders(end1Reader.getFileHeader(), end2Reader.getFileHeader() );
+
+ SAMFileWriter outWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(h, true, OUTPUT);
+
+ ParallelSAMIterator pi = new ParallelSAMIterator(end1Reader, end2Reader);
+
+ List end1 ;
+ List end2 ;
+
+ SAMRecord r1 = null, r2 = null; // alignments selected for ends 1 and 2, respectively, out of (possibly) multiple alternative placements
+
+ while ( pi.hasNext() ) {
+
+ fragments_seen++;
+
+ Pair< List, List > ends = pi.next();
+
+ end1 = ends.getFirst();
+ end2 = ends.getSecond();
+
+ if ( end1.size() == 0 ) {
+ // nothing at all for end1, choose best from end2 and save
+ end1_missing++;
+ r1 = null;
+ r2 = selectBestSingleEnd(end2);
+ if ( AlignmentUtils.isReadUnmapped(r2) ) both_unmapped++;
+ else end1_unmapped++;
+ // setPairingInformation(r1,r2);
+ // outWriter.addAlignment(r2);
+ continue;
+ }
+ if ( end2.size() == 0 ) {
+ // nothing at all for end2, choose best from end1 and save
+ end2_missing++;
+ r1 = selectBestSingleEnd(end1);
+ r2 = null;
+ if ( AlignmentUtils.isReadUnmapped(r1) ) both_unmapped++;
+ else end2_unmapped++;
+// setPairingInformation(r1,r2);
+// outWriter.addAlignment(r1);
+ continue;
+ }
+
+ if ( end1.size() == 1 && end2.size() == 1 ) {
+ // unique alignments on both ends: not much to do, just save as a pair
+ r1 = end1.get(0);
+ r2 = end2.get(0);
+ if ( AlignmentUtils.isReadUnmapped(r1) ) {
+ if ( AlignmentUtils.isReadUnmapped(r2) ) both_unmapped++;
+ else end1_unmapped++;
+ } else {
+ if ( AlignmentUtils.isReadUnmapped(r2)) end2_unmapped++;
+ else {
+ // both mapped
+ both_mapped++;
+
+ if ( r1.getMappingQuality() >= MINQ.intValue() &&
+ r2.getMappingQuality() >= MINQ.intValue() ) {
+ both_unique++;
+ if ( r1.getReferenceIndex() == r2.getReferenceIndex() &&
+ orientation(r1,r2) == PairOrientation.INNER ) {
+ proper_pair++;
+ setPairingInformation(r1,r2);
+ outWriter.addAlignment(r1);
+ outWriter.addAlignment(r2);
+ }
+ }
+ }
+ }
+// setPairingInformation(r1,r2);
+// outWriter.addAlignment(r1);
+// outWriter.addAlignment(r2);
+ continue;
+ }
+
+ if ( end1.size() == 1 && AlignmentUtils.isReadUnmapped(end1.get(0)) ) {
+ // special case: multiple alignments for end2 but end1 is unmapped: just select best for end2
+ r1 = end1.get(0);
+ r2 = selectBestSingleEnd(end2);
+ if ( AlignmentUtils.isReadUnmapped(r2) ) both_unmapped++;
+ else end1_unmapped++;
+// setPairingInformation(r1,r2);
+// outWriter.addAlignment(r1);
+// outWriter.addAlignment(r2);
+ continue;
+ }
+
+ if ( end2.size() == 1 && AlignmentUtils.isReadUnmapped(end2.get(0)) ) {
+ // special case: multiple alignments for end1 but end2 is unmapped: just select best for end1
+ r1 = selectBestSingleEnd(end1);
+ r2 = end2.get(0);
+ if ( AlignmentUtils.isReadUnmapped(r1) ) both_unmapped++;
+ else end2_unmapped++;
+ // setPairingInformation(r1,r2);
+ // outWriter.addAlignment(r1);
+ // outWriter.addAlignment(r2);
+ continue;
+ }
+
+ // ok, if we are here then we got both ends and multiple alignments in at least one end.
+ // Let's loop through candidates and choose the best pair:
+/*
+ List good = new ArrayList();
+ List bad = new ArrayList();
+ double best_good = INFINITY;
+
+ for ( SAMRecord candidate1 : end1 ) {
+ for ( SAMRecord candidate2 : end2 ) {
+ if ( candidate1.getReferenceIndex() == candidate2.getReferenceIndex()
+ && orientation(candidate1,candidate2)==PairOrientation.INNER ) {
+ double score = pairingScore(candidate1, candidate2);
+ }
+ }
+ }
+*/
+ r1 = selectUniqueSingleEnd(end1,MINQ.intValue());
+ r2 = selectUniqueSingleEnd(end2,MINQ.intValue());
+ if ( r1 == null || r2 == null ) continue;
+
+ both_mapped++;
+ both_unique++;
+ if ( r1.getReferenceIndex() == r2.getReferenceIndex() &&
+ orientation(r1,r2) == PairOrientation.INNER ) {
+ proper_pair++;
+ setPairingInformation(r1,r2);
+ outWriter.addAlignment(r1);
+ outWriter.addAlignment(r2);
+ }
+
+
+ }
+
+ pi.close();
+ outWriter.close();
+
+ System.out.println("Pairs with end1 missing: "+end1_missing);
+ System.out.println("Pairs with end2 missing: "+end2_missing);
+ System.out.println("Pairs with only end1 unmapped (including missing): "+end1_unmapped);
+ System.out.println("Pairs with only end2 unmapped (including missing): "+end2_unmapped);
+ System.out.println("Pairs with both ends unmapped (including one missing): "+both_unmapped);
+ System.out.println("Pairs with both ends mapped: "+both_mapped);
+ System.out.println("Pairs with both ends mapped uniquely (based on MINQ="+MINQ+"): "+both_unique);
+ System.out.println("Pairs with both ends mapped uniquely and properly (written into output): "+proper_pair);
+ return 0;
+ }
+
+ private Query > parseConditions(String filter) {
+
+ filter = filter.trim();
+ Query> result1, result2;
+
+ int level = 0; // parentheses level
+
+ for ( int i = 0 ; i < filter.length() ; i++ ) {
+ switch ( filter.charAt(i) ) {
+ case '(': level++; break;
+ case ')': level--;
+ if ( level < 0 ) throw new RuntimeException("Too many closing parentheses in the expression.");
+ break;
+ case '&': if ( level > 0 ) break; // parenthised expression - not now!
+ // we are at level 0: parse expressions to the left and to the right of '&' operator
+ return new CompositeQuery >(parseConditions(filter.substring(0,i)),
+ parseConditions(filter.substring(i+1)),
+ Query.Operator.AND);
+ case '|': if ( level > 0 ) break; // inside parenthised expression - keep scanning, we'll process the whole expression later
+ // we are at level 0: parse expressions to the left and to the right of '|' operator
+ return new CompositeQuery >(parseConditions(filter.substring(0,i)),
+ parseConditions(filter.substring(i+1)),
+ Query.Operator.OR);
+ default: break;
+ }
+ }
+
+ if ( level > 0 ) throw new RuntimeException("Too many opening parentheses in the expression.");
+
+ // if we ended up here, this is either a single parenthized expression or a primitive.
+ // filter was trimmed; if it is a parenthized expression, ( and ) should be first/last symbols:
+ if ( filter.charAt(0) == '(' && filter.charAt(filter.length()-1) == ')')
+ return parseConditions(filter.substring(1,filter.length()-1));
+
+ // ok, it's a primitive:
+ int equal_pos = filter.indexOf('=');
+ if ( equal_pos < 0 ) { // it's not a = expression, but a logical primitive
+ if ( "PAIRSONLY".equals(filter) ) return new BothEndsMappedQuery();
+ }
+
+ return null;
+ }
+
+ /**
+ * Utility method: checks if the two headers are the same. Returns the first one if they are,
+ * a non-NULL one if the other one is NULL, or NULL if both headers are NULL. If headers are
+ * both not NULL and are not the same, a RuntimeException is thrown.
+ * @param h1
+ * @param h2
+ * @return true if the headers are the same
+ */
+ private SAMFileHeader checkHeaders(SAMFileHeader h1, SAMFileHeader h2) {
+
+ if ( h1 == null ) return h2;
+ if ( h2 == null ) return h1;
+
+// if ( ! h1.getReadGroups().equals(h2.getReadGroups())) throw new RuntimeException("Read groups in the two input files do not match");
+// if ( ! h1.getSequenceDictionary().equals(h2.getSequenceDictionary()) ) throw new RuntimeException("Sequence dictionaries in the two input files do not match");
+// if ( ! h1.getProgramRecords().equals(h2.getProgramRecords()) ) throw new RuntimeException("Program records in the two input files do not match");
+ if ( ! h1.equals(h2) ) throw new RuntimeException("Headers in the two input files do not match");
+ return h1;
+ }
+
+ /** Given a list of alignments, returns the one with the best mapping quality score.
+ * If there is more than one alignment with the same score (got to be 0), one of
+ * these best-scoring alignments will be returned at random.
+ * @param l
+ * @return
+ */
+ private SAMRecord selectBestSingleEnd(List l) {
+ if ( l.size() == 0 ) return null; // should not happen; just don not want to crash here, but somewhere else
+ if ( l.size() == 1 ) return l.get(0); // not much choice...
+ int best_qual = -1;
+ int n_unmapped = 0;
+ List best = new ArrayList();
+
+ for ( SAMRecord r : l ) {
+ if ( r.getReadUnmappedFlag() ) {
+ // paranoid; if there are ANY alignments available, there should not be any "unmapped" records among them;
+ // and if the read is "unmapped" indeed, then there should be no other alignments reported
+ n_unmapped++;
+ continue;
+ }
+ if ( r.getMappingQuality() > best_qual) {
+ best_qual = r.getMappingQuality();
+ best.clear();
+ best.add(r);
+ continue;
+ }
+ if ( r.getMappingQuality() == best_qual ) best.add(r);
+ }
+ if ( best.size() == 0 ) throw new RuntimeException("Currently Unsupported: SAM file might be not fully compliant. "+
+ "Multiple 'unmapped' records found for read "+l.get(0).getReadName());
+ if ( best.size() == 1 ) return best.get(0);
+ if ( best_qual != 0 ) throw new RuntimeException("Multiple alignments for the same read found with non-zero score. "+
+ "Read: "+l.get(0).getReadName()+" best score: "+best_qual);
+ return best.get((int)(Math.random()*best.size()));
+ }
+
+ private SAMRecord selectUniqueSingleEnd(List l, int minq) {
+ if ( l.size() == 0 ) return null; // should not happen; just don not want to crash here, but somewhere else
+ if ( l.size() == 1 ) {
+ if ( l.get(0).getMappingQuality() >= minq ) return l.get(0);
+ else return null; // not unique enough
+ }
+
+ int n_unmapped = 0;
+ List best = new ArrayList();
+
+ for ( SAMRecord r : l ) {
+ if ( AlignmentUtils.isReadUnmapped(r) ) {
+ // paranoid; if there are ANY alignments available, there should not be any "unmapped" records among them;
+ // and if the read is "unmapped" indeed, then there should be no other alignments reported
+ n_unmapped++;
+ continue;
+ }
+ if ( r.getMappingQuality() >= minq ) {
+ best.add(r);
+ continue;
+ }
+ }
+ if ( best.size() == 0 ) return null; // no unique alignment
+ if ( best.size() > 1 ) throw new RuntimeException("Multiple alignments for read "+l.get(0).getReadName()+", all with Q>="+minq);
+
+ return best.get(0);
+ }
+
+ /**
+ * Assumes that alignments r1 and r2 are the two ends of a selected mate pair, and sets pairing flags and reciprocal mate
+ * mapping values for each of them (so that e.g. r2.getMateAlignmentStart() is properly set to
+ * r1.getAlignmentStart(), etc). Any one of the two alignments can be null, in which case it is assumed
+ * that the corresponding end is unmapped, and the flags/values in the other end will be set accordingly.
+ * If both r1 and r2 are null, an exception will be thrown.
+ * @param r1 first end in a mate pair
+ * @param r2 second end in a mate pair
+ */
+ private void setPairingInformation(SAMRecord r1, SAMRecord r2) {
+ // set mate information (note that r1 and r2 can not be 'null' simultaneously):
+
+ if ( r1 == null && r2 == null ) throw new RuntimeException("Both ends of the mate pair are passed as 'null'");
+
+ if ( r1 != null ) {
+ r1.setMateUnmappedFlag( r2==null ? true : AlignmentUtils.isReadUnmapped(r2) );
+ r1.setMateReferenceIndex( r2==null ? SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX : r2.getReferenceIndex() );
+ r1.setMateNegativeStrandFlag(r2==null ? false : r2.getReadNegativeStrandFlag() );
+ r1.setMateAlignmentStart( r2 == null ? SAMRecord.NO_ALIGNMENT_START : r2.getAlignmentStart());
+
+ r1.setFirstOfPairFlag(true);
+ r1.setSecondOfPairFlag(false);
+ r1.setReadPairedFlag(true);
+ }
+ if ( r2 != null ) {
+ r2.setMateUnmappedFlag( r1==null ? true : AlignmentUtils.isReadUnmapped(r1) );
+ r2.setMateReferenceIndex( r1==null ? SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX : r1.getReferenceIndex() );
+ r2.setMateNegativeStrandFlag(r1==null ? false : r1.getReadNegativeStrandFlag() );
+ r2.setMateAlignmentStart( r1 == null ? SAMRecord.NO_ALIGNMENT_START : r1.getAlignmentStart());
+ r2.setFirstOfPairFlag(false);
+ r2.setSecondOfPairFlag(true);
+ r2.setReadPairedFlag(true);
+ }
+ }
+
+ /**
+ * Returns fragment length inferred from the two alignments, or 1,000,000,000 if reads align
+ * on different chromosomes or are not mapped at all. NOTE: the returned fragment length is
+ * start+read length of the rightmost alignment minus start of the leftmost one; it makes
+ * sense only for "proper" pairs (leftmost forward, rightmost reverse); for all other pairs
+ * the returned number does not allow for any meaningful interpretation.
+ * @param r1
+ * @param r2
+ * @return
+ */
+ private int fragmentSize(final SAMRecord r1, final SAMRecord r2) {
+ if ( r1 == null || AlignmentUtils.isReadUnmapped(r1) ||
+ r2 == null || AlignmentUtils.isReadUnmapped(r2) ||
+ r1.getReferenceIndex() != r2.getReferenceIndex() ) return INFINITY;
+ if ( r1.getAlignmentStart() <= r2.getAlignmentStart() )
+ return ( r2.getAlignmentStart() + r2.getReadLength() - r1.getAlignmentStart());
+ else return ( r1.getAlignmentStart() + r1.getReadLength() - r2.getAlignmentStart());
+ }
+
+ enum PairOrientation {
+ INNER, OUTER, LEFT, RIGHT, NONE
+ }
+
+ /**
+ * Returns orientation of the pair: INNER for "--> <--", OUTER for "<-- -->",
+ * LEFT for "<-- <--" and RIGHT for "--> -->" (regardless of which read in a pair, 1 or 2,
+ * actually maps to the left and which to the right). If any of the reads is null or unmapped, returns NONE.
+ * If reads are on different contigs, they are still ordered according to the underlying contig order (by reference
+ * index), and the returned value reflects their relative orientation as described above (however it does not seem
+ * to be very useful in that case).
+ * @param r1
+ * @param r2
+ * @return
+ */
+ private PairOrientation orientation(SAMRecord r1, SAMRecord r2) {
+
+ if ( r1 == null || r2 == null || AlignmentUtils.isReadUnmapped(r1) || AlignmentUtils.isReadUnmapped(r2))
+ return PairOrientation.NONE;
+
+ SAMRecord left, right;
+
+ if ( r1.getReferenceIndex() == r2.getReferenceIndex() ) {
+ if ( r1.getAlignmentStart() <= r2.getAlignmentStart() ) {
+ left = r1;
+ right = r2;
+ } else {
+ left = r2;
+ right = r1;
+ }
+ } else {
+ if ( r1.getReferenceIndex() < r2.getReferenceIndex() ) {
+ left = r1;
+ right = r2;
+ } else {
+ left = r2;
+ right = r1;
+ }
+ }
+
+ if ( ! left.getReadNegativeStrandFlag() ) { // left is forward
+ if ( right.getReadNegativeStrandFlag() ) return PairOrientation.INNER; // left is forward, right is reverse
+ else return PairOrientation.RIGHT; // left is forward, right is forward
+ } else { // left is reverse
+ if ( right.getReadNegativeStrandFlag() ) return PairOrientation.LEFT; // left is reverse, right is reverse
+ else return PairOrientation.OUTER; // left is reverse, right is forward
+ }
+ }
+
+ class Pairing {
+ SAMRecord r1;
+ SAMRecord r2;
+ double score;
+
+ Pairing() {
+ this(null,null,INFINITY);
+ }
+
+ Pairing(SAMRecord r1, SAMRecord r2) {
+ this(r1,r2,INFINITY);
+ }
+ Pairing(SAMRecord r1, SAMRecord r2, double score) {
+ this.r1 = r1;
+ this.r2 = r2;
+ this.score = score;
+ }
+
+ SAMRecord getFirst() { return r1; }
+ SAMRecord getSecond() { return r2; }
+ double getScore() { return score; }
+
+ void setFirst(SAMRecord r) { r1 = r; }
+ void setSecond(SAMRecord r) { r2 = r; }
+ void setScore(double score) { this.score = score; }
+ }
+
+ private double pairingScore(final SAMRecord r1, final SAMRecord r2) {
+
+ return ( ( r1.getMappingQuality() + r2.getMappingQuality() ) * Math.exp(1)) ;
+ }
+
+ interface Query {
+ boolean isSatisfied(T record) ;
+ enum Operator { OR, AND };
+ }
+
+ class CompositeQuery implements Query {
+ private Query q1;
+ private Query q2;
+ private Query.Operator type ; // 1 for 'and', 0 for 'or'
+
+ CompositeQuery(Query q1, Query q2, Query.Operator type) {
+ this.q1 = q1;
+ this.q2 = q2;
+ this.type = type;
+ }
+
+ public boolean isSatisfied(T record) {
+ switch ( type ) {
+ case AND: return q1.isSatisfied(record) && q2.isSatisfied(record);
+ case OR: return q1.isSatisfied(record) || q2.isSatisfied(record);
+ default: throw new IllegalStateException("Unknown composite query operator");
+ }
+ }
+ }
+
+ class BothEndsMappedQuery implements Query< Pair > {
+ public boolean isSatisfied(Pair p) {
+ return ( ! AlignmentUtils.isReadUnmapped(p.getFirst()) && ! AlignmentUtils.isReadUnmapped(p.getSecond())) ;
+ }
+ }
+
+}