gatk-3.8/java/lib/edu/mit/broad/picard/sam/MarkDuplicates.java

package edu.mit.broad.picard.sam;

import edu.mit.broad.picard.cmdline.CommandLineProgram;
import edu.mit.broad.picard.cmdline.Option;
import edu.mit.broad.picard.cmdline.Usage;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.picard.metrics.MetricsFile;
import edu.mit.broad.picard.util.Log;
import edu.mit.broad.sam.util.SortingCollection;
import edu.mit.broad.sam.*;

import java.io.*;
import java.util.*;

/**
 * A better duplication marking algorithm that handles all cases including clipped
 * and gapped alignments.
 *
 * @author Tim Fennell
 */
public class MarkDuplicates extends CommandLineProgram {
    private static final Log log = Log.getInstance(MarkDuplicates.class);

    @Usage public final String USAGE =
            "Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules. " +
            "All records are then written to the output file with the duplicate records flagged.";
    @Option(shortName="I", doc="The input SAM or BAM file to analyze") public File INPUT;
    @Option(shortName="O", doc="The output file to right marked records to") public File OUTPUT;
    @Option(shortName="M", doc="File to write duplication metrics to") public File METRICS_FILE;

    private SortingCollection<ReadEnds> pairSort;
    private SortingCollection<ReadEnds> fragSort;
    private long[] duplicateIndexes = new long[1000000];
    private int nextIndex = 0; // The next offset into duplicateIndexes to use


    /** Stock main method. */
    public static void main(String[] args) {
        new MarkDuplicates().instanceMain(args);
    }

    /** Little struct-like class to hold read pair (and fragment) end data. */
    private static class ReadEnds {
        public static final int SIZE_OF = (1*1) + (2*1) + (4*4) + (8*2) + 8; // last 8 == reference overhead
        public static final byte F=0, R=1, FF=2, FR=3, RR=4, RF=5;

        short score = 0;
        byte orientation;
        int read1Sequence     = -1;
        int read1Coordinate   = -1;
        long read1IndexInFile = -1;
        int read2Sequence     = -1;
        int read2Coordinate   = -1;
        long read2IndexInFile = -1;

        boolean isPaired() { return this.read2Sequence != -1; }
    }

    /** Comparator for ReadEnds that orders by read1 position then pair orientation then read2 position. */
    private static class ReadEndsComparator implements Comparator<ReadEnds> {
        public int compare(ReadEnds lhs, ReadEnds rhs) {
            int retval = lhs.read1Sequence - rhs.read1Sequence;
            if (retval == 0) retval = lhs.read1Coordinate - rhs.read1Coordinate;
            if (retval == 0) retval = lhs.orientation - rhs.orientation;
            if (retval == 0) retval = lhs.read2Sequence   - rhs.read2Sequence;
            if (retval == 0) retval = lhs.read2Coordinate - rhs.read2Coordinate;
            if (retval == 0) retval = (int) (lhs.read1IndexInFile - rhs.read1IndexInFile);
            if (retval == 0) retval = (int) (lhs.read2IndexInFile - rhs.read2IndexInFile);

            return retval;
        }
    }

    /** Coded for ReadEnds that just outputs the primitive fields and reads them back. */
    private static class ReadEndsCodec implements SortingCollection.Codec<ReadEnds> {
        private DataInputStream in;
        private DataOutputStream out;

        public SortingCollection.Codec<ReadEnds> clone() {
            return new ReadEndsCodec();
        }

        public void setOutputStream(OutputStream os) { this.out = new DataOutputStream(os); }
        public void setInputStream(InputStream is) { this.in = new DataInputStream(is); }

        public void encode(ReadEnds read) {
            try {
                this.out.writeShort(read.score);
                this.out.writeByte(read.orientation);
                this.out.writeInt(read.read1Sequence);
                this.out.writeInt(read.read1Coordinate);
                this.out.writeLong(read.read1IndexInFile);
                this.out.writeInt(read.read2Sequence);

                if (read.orientation > ReadEnds.R) {
                    this.out.writeInt(read.read2Coordinate);
                    this.out.writeLong(read.read2IndexInFile);
                }
                this.out.flush();
            }
            catch (IOException ioe) {
                throw new PicardException("Exception writing ReadEnds to file.", ioe);
            }
        }

        public ReadEnds decode() {
            ReadEnds read = new ReadEnds();
            try {
                // If the first read results in an EOF we've exhausted the stream
                try { read.score = this.in.readShort(); }
                catch (EOFException eof) { return null; }

                read.orientation      = this.in.readByte();
                read.read1Sequence    = this.in.readInt();
                read.read1Coordinate  = this.in.readInt();
                read.read1IndexInFile = this.in.readLong();
                read.read2Sequence    = this.in.readInt();

                if (read.orientation > ReadEnds.R) {
                    read.read2Coordinate  = this.in.readInt();
                    read.read2IndexInFile = this.in.readLong();
                }
                return read;
            }
            catch (IOException ioe) {
                throw new PicardException("Exception writing ReadEnds to file.", ioe);
            }
        }
    }

    /**
     * Main work method.  Reads the BAM file once and collects sorted information about
     * the 5' ends of both ends of each read (or just one end in the case of pairs).
     * Then makes a pass through those determining duplicates before re-reading the
     * input file and writing it out with duplication flags set correctly.
     */
    protected int doWork() {
        log.info("Reading input file and constructing read end information.");
        buildSortedReadEndLists();
        generateDuplicateIndexes();
        log.info("Marking " + this.duplicateIndexes.length + " records as duplicates.");
        DuplicationMetrics metrics = new DuplicationMetrics();
        SAMFileReader in  = new SAMFileReader(INPUT);
        SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(),
                                                                          true,
                                                                          OUTPUT);

        // Now copy over the file while marking all the necessary indexes as duplicates
        long recordInFileIndex = 0;
        long nextDuplicateIndex = (this.duplicateIndexes.length == 0 ? -1 :  this.duplicateIndexes[0]);
        int  arrayIndex = 1;

        for (SAMRecord rec : in) {
            // First bring the simple metrics up to date
            if (rec.getReadUnmappedFlag()) {
                ++metrics.UNMAPPED_READS;
            }
            else if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) {
                ++metrics.UNPAIRED_READS_EXAMINED;
            }
            else if (rec.getFirstOfPairFlag()){
                ++metrics.READ_PAIRS_EXAMINED;
            }


            if (recordInFileIndex++ == nextDuplicateIndex) {
                rec.setDuplicateReadFlag(true);

                // Update the duplication metrics
                if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) {
                    ++metrics.UNPAIRED_READ_DUPLICATES;
                }
                else if (rec.getFirstOfPairFlag()) {
                    ++metrics.READ_PAIR_DUPLICATES;
                }

                // Now try and figure out the next duplicate index
                try {
                    nextDuplicateIndex = this.duplicateIndexes[arrayIndex++];
                }
                catch (ArrayIndexOutOfBoundsException e) {
                    // Only happens once we've marked all the duplicates
                    nextDuplicateIndex = -1;
                    arrayIndex = -1;
                }
            }

            out.addAlignment(rec);
        }

        out.close();


        // Write out the metrics
        metrics.calculateDerivedMetrics();
        MetricsFile<DuplicationMetrics,Double> file = getMetricsFile();
        file.addMetric(metrics);
        file.setHistogram(metrics.calculateRoiHistogram());
        file.write(METRICS_FILE);        

        return 0;
    }

    /**
     * Goes through all the records in a file and generates a set of ReadEnds objects that
     * hold the necessary information (reference sequence, 5' read coordinate) to do
     * duplication, caching to disk as necssary to sort them.
     */
    private void buildSortedReadEndLists() {
        // TODO: take into account clipping/padding?
        int maxInMemory = (int) ((Runtime.getRuntime().maxMemory() * 0.25) / ReadEnds.SIZE_OF);
        this.pairSort = SortingCollection.newInstance(ReadEnds.class,
                                                      new ReadEndsCodec(),
                                                      new ReadEndsComparator(),
                                                      maxInMemory);

        this.fragSort = SortingCollection.newInstance(ReadEnds.class,
                                                      new ReadEndsCodec(),
                                                      new ReadEndsComparator(),
                                                      maxInMemory);

        Map<String, ReadEnds> tmp = new HashMap<String, ReadEnds>();
        SAMFileReader sam = new SAMFileReader(INPUT);
        SAMFileHeader header = sam.getFileHeader();
        long index = 0;

        for (SAMRecord rec : sam) {
            if (rec.getReadUnmappedFlag()) {
                continue;
            }

            ReadEnds fragmentEnd = buildReadEnds(header, index, rec);
            this.fragSort.add(fragmentEnd);

            if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
                String key = rec.getAttribute(ReservedTagConstants.READ_GROUP_ID) + ":" + rec.getReadName();
                ReadEnds pairedEnds = tmp.remove(key);

                // See if we've already seen the first end or not
                if (pairedEnds == null) {
                    pairedEnds = buildReadEnds(header, index, rec);
                    tmp.put(key, pairedEnds);
                }
                else {
                    int sequence = fragmentEnd.read1Sequence;
                    int coordinate = fragmentEnd.read1Coordinate;

                    // If the second read is actually later, just add the second read data, else flip the reads
                    if (sequence > pairedEnds.read1Sequence || (sequence == pairedEnds.read1Sequence && coordinate >= pairedEnds.read1Coordinate)) {
                        pairedEnds.read2Sequence    = sequence;
                        pairedEnds.read2Coordinate  = coordinate;
                        pairedEnds.read2IndexInFile = index;
                        pairedEnds.orientation = getOrientationByte(pairedEnds.orientation == ReadEnds.R, rec.getReadNegativeStrandFlag());
                    }
                    else {
                        pairedEnds.read2Sequence    = pairedEnds.read1Sequence;
                        pairedEnds.read2Coordinate  = pairedEnds.read1Coordinate;
                        pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
                        pairedEnds.read1Sequence    = sequence;
                        pairedEnds.read1Coordinate  = coordinate;
                        pairedEnds.read1IndexInFile = index;
                        pairedEnds.orientation = getOrientationByte(rec.getReadNegativeStrandFlag(), pairedEnds.orientation == ReadEnds.R);
                    }

                    pairedEnds.score += getScore(rec);
                    this.pairSort.add(pairedEnds);
                }
            }

            ++index;
        }
    }

    /** Builds a read ends object that represents a single read. */
    private ReadEnds buildReadEnds(SAMFileHeader header, long index, SAMRecord rec) {
        ReadEnds ends = new ReadEnds();
        ends.read1Sequence    = rec.getReferenceIndex(header);
        ends.read1Coordinate  = rec.getReadNegativeStrandFlag() ? rec.getUnclippedEnd() : rec.getUnclippedStart();
        ends.orientation = rec.getReadNegativeStrandFlag() ? ReadEnds.R : ReadEnds.F;
        ends.read1IndexInFile = index;
        ends.score = getScore(rec);

        // Doing this lets the ends object know that it's part of a pair
        if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
            ends.read2Sequence = rec.getMateReferenceIndex(header);
        }

        return ends;
    }

    /**
     * Returns a single byte that encodes the orientation of the two reads in a pair.
     */
    private byte getOrientationByte(boolean read1NegativeStrand, boolean read2NegativeStrand) {
        if (read1NegativeStrand) {
            if (read2NegativeStrand)  return ReadEnds.RR;
            else return ReadEnds.RF;
        }
        else {
            if (read2NegativeStrand)  return ReadEnds.FR;
            else return ReadEnds.FF;
        }
    }


    /** Calculates a score for the read which is the sum of scores over Q20. */
    private short getScore(SAMRecord rec) {
        short score = 0;
        for (byte b : rec.getBaseQualities()) {
            if (b >= 15) score += b;
        }

        return score;
    }

    /**
     * Goes through the accumulated ReadEnds objects and determines which of them are
     * to be marked as duplicates.
     *
     * @return an array with an ordered list of indexes into the source file
     */
    private void generateDuplicateIndexes() {
        ReadEnds firstOfNextChunk = null;
        List<ReadEnds> nextChunk  = new ArrayList<ReadEnds>(200);

        // First just do the pairs
        log.info("Traversing read pair information and detecting duplicates.");
        for (ReadEnds next : this.pairSort) {
            if (firstOfNextChunk == null) {
                firstOfNextChunk = next;
                nextChunk.add(firstOfNextChunk);
            }
            else if (areComparableForDuplicates(firstOfNextChunk, next, true)) {
                nextChunk.add(next);
            }
            else {
                if (nextChunk.size() > 1) {
                    markDuplicatePairs(nextChunk);
                }

                nextChunk.clear();
                nextChunk.add(next);
                firstOfNextChunk = next;
            }
        }
        markDuplicatePairs(nextChunk);
        this.pairSort = null;

        // Now deal with the fragments
        log.info("Traversing fragment information and detecting duplicates.");
        boolean containsPairs = false;
        boolean containsFrags = false;

        for (ReadEnds next : this.fragSort) {
            if (firstOfNextChunk != null && areComparableForDuplicates(firstOfNextChunk, next, false)) {
                nextChunk.add(next);
                containsPairs = containsPairs || next.isPaired();
                containsFrags = containsFrags || !next.isPaired();
            }
            else {
                if (nextChunk.size() > 1 && containsFrags) {
                    markDuplicateFragments(nextChunk, containsPairs);
                }

                nextChunk.clear();
                nextChunk.add(next);
                firstOfNextChunk = next;
                containsPairs = next.isPaired();
                containsFrags = !next.isPaired();
            }
        }
        markDuplicateFragments(nextChunk, containsPairs);
        this.fragSort = null;

        // Now shrink down the array and sort it
        log.info("Sorting list of duplicate records.");
        long[] tmp = new long[this.nextIndex];
        System.arraycopy(this.duplicateIndexes, 0, tmp, 0, this.nextIndex);
        this.duplicateIndexes = tmp;
        Arrays.sort(this.duplicateIndexes);
    }

    private boolean areComparableForDuplicates(final ReadEnds lhs, final ReadEnds rhs, final boolean compareRead2) {
        boolean retval =  lhs.read1Sequence   == rhs.read1Sequence &&
                          lhs.read1Coordinate == rhs.read1Coordinate &&
                          lhs.orientation     == rhs.orientation;

        if (compareRead2) {
            retval = lhs.read2Sequence   == rhs.read2Sequence &&
                     lhs.read2Coordinate == rhs.read2Coordinate;
        }

        return retval;
    }

    private void addIndexAsDuplicate(final long bamIndex) {
        if (this.nextIndex > this.duplicateIndexes.length - 1) {
            long[] tmp = new long[this.duplicateIndexes.length * 2];
            System.arraycopy(this.duplicateIndexes, 0, tmp, 0, this.nextIndex);
            this.duplicateIndexes = tmp;
        }

        this.duplicateIndexes[this.nextIndex++] = bamIndex;
    }

    /**
     * Takes a list of ReadEnds objects and removes from it all objects that should
     * not be marked as duplicates.
     *
     * @param list
     */
    private void markDuplicatePairs(final List<ReadEnds> list) {
        short maxScore = 0;
        ReadEnds best = null;

        for (final ReadEnds end : list) {
            if (end.score > maxScore || best == null) {
                maxScore = end.score;
                best = end;
            }
        }

        for (final ReadEnds end : list) {
            if (end != best) {
                addIndexAsDuplicate(end.read1IndexInFile);
                addIndexAsDuplicate(end.read2IndexInFile);
            }
        }
    }

    /**
     * Takes a list of ReadEnds objects and removes from it all objects that should
     * not be marked as duplicates.
     *
     * @param list
     */
    private void markDuplicateFragments(final List<ReadEnds> list, final boolean containsPairs) {
        if (containsPairs) {
            for (final ReadEnds end : list) {
                if (!end.isPaired()) addIndexAsDuplicate(end.read1IndexInFile);
            }
        }
        else {
            short maxScore = 0;
            ReadEnds best = null;
            for (final ReadEnds end : list) {
                if (end.score > maxScore || best == null) {
                    maxScore = end.score;
                    best = end;
                }
            }

            for (final ReadEnds end : list) {
                if (end != best) {
                    addIndexAsDuplicate(end.read1IndexInFile);
                }
            }
        }
    }
}
Initial Analysis toolkit git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4 348d0f76-0448-11de-a6fe-93d51630548a 2009-02-27 05:50:29 +08:00			`package edu.mit.broad.picard.sam;`

			`import edu.mit.broad.picard.cmdline.CommandLineProgram;`
			`import edu.mit.broad.picard.cmdline.Option;`
			`import edu.mit.broad.picard.cmdline.Usage;`
			`import edu.mit.broad.picard.PicardException;`
			`import edu.mit.broad.picard.metrics.MetricsFile;`
			`import edu.mit.broad.picard.util.Log;`
			`import edu.mit.broad.sam.util.SortingCollection;`
			`import edu.mit.broad.sam.*;`

			`import java.io.*;`
			`import java.util.*;`

			`/**`
			`* A better duplication marking algorithm that handles all cases including clipped`
			`* and gapped alignments.`
			`*`
			`* @author Tim Fennell`
			`*/`
			`public class MarkDuplicates extends CommandLineProgram {`
			`private static final Log log = Log.getInstance(MarkDuplicates.class);`

			`@Usage public final String USAGE =`
			`"Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules. " +`
			`"All records are then written to the output file with the duplicate records flagged.";`
			`@Option(shortName="I", doc="The input SAM or BAM file to analyze") public File INPUT;`
			`@Option(shortName="O", doc="The output file to right marked records to") public File OUTPUT;`
			`@Option(shortName="M", doc="File to write duplication metrics to") public File METRICS_FILE;`

			`private SortingCollection<ReadEnds> pairSort;`
			`private SortingCollection<ReadEnds> fragSort;`
			`private long[] duplicateIndexes = new long[1000000];`
			`private int nextIndex = 0; // The next offset into duplicateIndexes to use`


			`/** Stock main method. */`
			`public static void main(String[] args) {`
			`new MarkDuplicates().instanceMain(args);`
			`}`

			`/** Little struct-like class to hold read pair (and fragment) end data. */`
			`private static class ReadEnds {`
			`public static final int SIZE_OF = (11) + (21) + (44) + (82) + 8; // last 8 == reference overhead`
			`public static final byte F=0, R=1, FF=2, FR=3, RR=4, RF=5;`

			`short score = 0;`
			`byte orientation;`
			`int read1Sequence = -1;`
			`int read1Coordinate = -1;`
			`long read1IndexInFile = -1;`
			`int read2Sequence = -1;`
			`int read2Coordinate = -1;`
			`long read2IndexInFile = -1;`

			`boolean isPaired() { return this.read2Sequence != -1; }`
			`}`

			`/** Comparator for ReadEnds that orders by read1 position then pair orientation then read2 position. */`
			`private static class ReadEndsComparator implements Comparator<ReadEnds> {`
			`public int compare(ReadEnds lhs, ReadEnds rhs) {`
			`int retval = lhs.read1Sequence - rhs.read1Sequence;`
			`if (retval == 0) retval = lhs.read1Coordinate - rhs.read1Coordinate;`
			`if (retval == 0) retval = lhs.orientation - rhs.orientation;`
			`if (retval == 0) retval = lhs.read2Sequence - rhs.read2Sequence;`
			`if (retval == 0) retval = lhs.read2Coordinate - rhs.read2Coordinate;`
			`if (retval == 0) retval = (int) (lhs.read1IndexInFile - rhs.read1IndexInFile);`
			`if (retval == 0) retval = (int) (lhs.read2IndexInFile - rhs.read2IndexInFile);`

			`return retval;`
			`}`
			`}`

			`/** Coded for ReadEnds that just outputs the primitive fields and reads them back. */`
			`private static class ReadEndsCodec implements SortingCollection.Codec<ReadEnds> {`
			`private DataInputStream in;`
			`private DataOutputStream out;`

			`public SortingCollection.Codec<ReadEnds> clone() {`
			`return new ReadEndsCodec();`
			`}`

			`public void setOutputStream(OutputStream os) { this.out = new DataOutputStream(os); }`
			`public void setInputStream(InputStream is) { this.in = new DataInputStream(is); }`

			`public void encode(ReadEnds read) {`
			`try {`
			`this.out.writeShort(read.score);`
			`this.out.writeByte(read.orientation);`
			`this.out.writeInt(read.read1Sequence);`
			`this.out.writeInt(read.read1Coordinate);`
			`this.out.writeLong(read.read1IndexInFile);`
			`this.out.writeInt(read.read2Sequence);`

			`if (read.orientation > ReadEnds.R) {`
			`this.out.writeInt(read.read2Coordinate);`
			`this.out.writeLong(read.read2IndexInFile);`
			`}`
			`this.out.flush();`
			`}`
			`catch (IOException ioe) {`
			`throw new PicardException("Exception writing ReadEnds to file.", ioe);`
			`}`
			`}`

			`public ReadEnds decode() {`
			`ReadEnds read = new ReadEnds();`
			`try {`
			`// If the first read results in an EOF we've exhausted the stream`
			`try { read.score = this.in.readShort(); }`
			`catch (EOFException eof) { return null; }`

			`read.orientation = this.in.readByte();`
			`read.read1Sequence = this.in.readInt();`
			`read.read1Coordinate = this.in.readInt();`
			`read.read1IndexInFile = this.in.readLong();`
			`read.read2Sequence = this.in.readInt();`

			`if (read.orientation > ReadEnds.R) {`
			`read.read2Coordinate = this.in.readInt();`
			`read.read2IndexInFile = this.in.readLong();`
			`}`
			`return read;`
			`}`
			`catch (IOException ioe) {`
			`throw new PicardException("Exception writing ReadEnds to file.", ioe);`
			`}`
			`}`
			`}`

			`/**`
			`* Main work method. Reads the BAM file once and collects sorted information about`
			`* the 5' ends of both ends of each read (or just one end in the case of pairs).`
			`* Then makes a pass through those determining duplicates before re-reading the`
			`* input file and writing it out with duplication flags set correctly.`
			`*/`
			`protected int doWork() {`
			`log.info("Reading input file and constructing read end information.");`
			`buildSortedReadEndLists();`
			`generateDuplicateIndexes();`
			`log.info("Marking " + this.duplicateIndexes.length + " records as duplicates.");`
			`DuplicationMetrics metrics = new DuplicationMetrics();`
			`SAMFileReader in = new SAMFileReader(INPUT);`
			`SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(),`
			`true,`
			`OUTPUT);`

			`// Now copy over the file while marking all the necessary indexes as duplicates`
			`long recordInFileIndex = 0;`
			`long nextDuplicateIndex = (this.duplicateIndexes.length == 0 ? -1 : this.duplicateIndexes[0]);`
			`int arrayIndex = 1;`

			`for (SAMRecord rec : in) {`
			`// First bring the simple metrics up to date`
			`if (rec.getReadUnmappedFlag()) {`
			`++metrics.UNMAPPED_READS;`
			`}`
			`else if (!rec.getReadPairedFlag() \|\| rec.getMateUnmappedFlag()) {`
			`++metrics.UNPAIRED_READS_EXAMINED;`
			`}`
			`else if (rec.getFirstOfPairFlag()){`
			`++metrics.READ_PAIRS_EXAMINED;`
			`}`


			`if (recordInFileIndex++ == nextDuplicateIndex) {`
			`rec.setDuplicateReadFlag(true);`

			`// Update the duplication metrics`
			`if (!rec.getReadPairedFlag() \|\| rec.getMateUnmappedFlag()) {`
			`++metrics.UNPAIRED_READ_DUPLICATES;`
			`}`
			`else if (rec.getFirstOfPairFlag()) {`
			`++metrics.READ_PAIR_DUPLICATES;`
			`}`

			`// Now try and figure out the next duplicate index`
			`try {`
			`nextDuplicateIndex = this.duplicateIndexes[arrayIndex++];`
			`}`
			`catch (ArrayIndexOutOfBoundsException e) {`
			`// Only happens once we've marked all the duplicates`
			`nextDuplicateIndex = -1;`
			`arrayIndex = -1;`
			`}`
			`}`

			`out.addAlignment(rec);`
			`}`

			`out.close();`


			`// Write out the metrics`
			`metrics.calculateDerivedMetrics();`
			`MetricsFile<DuplicationMetrics,Double> file = getMetricsFile();`
			`file.addMetric(metrics);`
			`file.setHistogram(metrics.calculateRoiHistogram());`
			`file.write(METRICS_FILE);`

			`return 0;`
			`}`

			`/**`
			`* Goes through all the records in a file and generates a set of ReadEnds objects that`
			`* hold the necessary information (reference sequence, 5' read coordinate) to do`
			`* duplication, caching to disk as necssary to sort them.`
			`*/`
			`private void buildSortedReadEndLists() {`
			`// TODO: take into account clipping/padding?`
			`int maxInMemory = (int) ((Runtime.getRuntime().maxMemory() * 0.25) / ReadEnds.SIZE_OF);`
			`this.pairSort = SortingCollection.newInstance(ReadEnds.class,`
			`new ReadEndsCodec(),`
			`new ReadEndsComparator(),`
			`maxInMemory);`

			`this.fragSort = SortingCollection.newInstance(ReadEnds.class,`
			`new ReadEndsCodec(),`
			`new ReadEndsComparator(),`
			`maxInMemory);`

			`Map<String, ReadEnds> tmp = new HashMap<String, ReadEnds>();`
			`SAMFileReader sam = new SAMFileReader(INPUT);`
			`SAMFileHeader header = sam.getFileHeader();`
			`long index = 0;`

			`for (SAMRecord rec : sam) {`
			`if (rec.getReadUnmappedFlag()) {`
			`continue;`
			`}`

			`ReadEnds fragmentEnd = buildReadEnds(header, index, rec);`
			`this.fragSort.add(fragmentEnd);`

			`if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {`
			`String key = rec.getAttribute(ReservedTagConstants.READ_GROUP_ID) + ":" + rec.getReadName();`
			`ReadEnds pairedEnds = tmp.remove(key);`

			`// See if we've already seen the first end or not`
			`if (pairedEnds == null) {`
			`pairedEnds = buildReadEnds(header, index, rec);`
			`tmp.put(key, pairedEnds);`
			`}`
			`else {`
			`int sequence = fragmentEnd.read1Sequence;`
			`int coordinate = fragmentEnd.read1Coordinate;`

			`// If the second read is actually later, just add the second read data, else flip the reads`
			`if (sequence > pairedEnds.read1Sequence \|\| (sequence == pairedEnds.read1Sequence && coordinate >= pairedEnds.read1Coordinate)) {`
			`pairedEnds.read2Sequence = sequence;`
			`pairedEnds.read2Coordinate = coordinate;`
			`pairedEnds.read2IndexInFile = index;`
			`pairedEnds.orientation = getOrientationByte(pairedEnds.orientation == ReadEnds.R, rec.getReadNegativeStrandFlag());`
			`}`
			`else {`
			`pairedEnds.read2Sequence = pairedEnds.read1Sequence;`
			`pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;`
			`pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;`
			`pairedEnds.read1Sequence = sequence;`
			`pairedEnds.read1Coordinate = coordinate;`
			`pairedEnds.read1IndexInFile = index;`
			`pairedEnds.orientation = getOrientationByte(rec.getReadNegativeStrandFlag(), pairedEnds.orientation == ReadEnds.R);`
			`}`

			`pairedEnds.score += getScore(rec);`
			`this.pairSort.add(pairedEnds);`
			`}`
			`}`

			`++index;`
			`}`
			`}`

			`/** Builds a read ends object that represents a single read. */`
			`private ReadEnds buildReadEnds(SAMFileHeader header, long index, SAMRecord rec) {`
			`ReadEnds ends = new ReadEnds();`
			`ends.read1Sequence = rec.getReferenceIndex(header);`
			`ends.read1Coordinate = rec.getReadNegativeStrandFlag() ? rec.getUnclippedEnd() : rec.getUnclippedStart();`
			`ends.orientation = rec.getReadNegativeStrandFlag() ? ReadEnds.R : ReadEnds.F;`
			`ends.read1IndexInFile = index;`
			`ends.score = getScore(rec);`

			`// Doing this lets the ends object know that it's part of a pair`
			`if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {`
			`ends.read2Sequence = rec.getMateReferenceIndex(header);`
			`}`

			`return ends;`
			`}`

			`/**`
			`* Returns a single byte that encodes the orientation of the two reads in a pair.`
			`*/`
			`private byte getOrientationByte(boolean read1NegativeStrand, boolean read2NegativeStrand) {`
			`if (read1NegativeStrand) {`
			`if (read2NegativeStrand) return ReadEnds.RR;`
			`else return ReadEnds.RF;`
			`}`
			`else {`
			`if (read2NegativeStrand) return ReadEnds.FR;`
			`else return ReadEnds.FF;`
			`}`
			`}`



			`/** Calculates a score for the read which is the sum of scores over Q20. */`
			`private short getScore(SAMRecord rec) {`
			`short score = 0;`
			`for (byte b : rec.getBaseQualities()) {`
			`if (b >= 15) score += b;`
			`}`

			`return score;`
			`}`

			`/**`
			`* Goes through the accumulated ReadEnds objects and determines which of them are`
			`* to be marked as duplicates.`
			`*`
			`* @return an array with an ordered list of indexes into the source file`
			`*/`
			`private void generateDuplicateIndexes() {`
			`ReadEnds firstOfNextChunk = null;`
			`List<ReadEnds> nextChunk = new ArrayList<ReadEnds>(200);`

			`// First just do the pairs`
			`log.info("Traversing read pair information and detecting duplicates.");`
			`for (ReadEnds next : this.pairSort) {`
			`if (firstOfNextChunk == null) {`
			`firstOfNextChunk = next;`
			`nextChunk.add(firstOfNextChunk);`
			`}`
			`else if (areComparableForDuplicates(firstOfNextChunk, next, true)) {`
			`nextChunk.add(next);`
			`}`
			`else {`
			`if (nextChunk.size() > 1) {`
			`markDuplicatePairs(nextChunk);`
			`}`

			`nextChunk.clear();`
			`nextChunk.add(next);`
			`firstOfNextChunk = next;`
			`}`
			`}`
			`markDuplicatePairs(nextChunk);`
			`this.pairSort = null;`

			`// Now deal with the fragments`
			`log.info("Traversing fragment information and detecting duplicates.");`
			`boolean containsPairs = false;`
			`boolean containsFrags = false;`

			`for (ReadEnds next : this.fragSort) {`
			`if (firstOfNextChunk != null && areComparableForDuplicates(firstOfNextChunk, next, false)) {`
			`nextChunk.add(next);`
			`containsPairs = containsPairs \|\| next.isPaired();`
			`containsFrags = containsFrags \|\| !next.isPaired();`
			`}`
			`else {`
			`if (nextChunk.size() > 1 && containsFrags) {`
			`markDuplicateFragments(nextChunk, containsPairs);`
			`}`

			`nextChunk.clear();`
			`nextChunk.add(next);`
			`firstOfNextChunk = next;`
			`containsPairs = next.isPaired();`
			`containsFrags = !next.isPaired();`
			`}`
			`}`
			`markDuplicateFragments(nextChunk, containsPairs);`
			`this.fragSort = null;`

			`// Now shrink down the array and sort it`
			`log.info("Sorting list of duplicate records.");`
			`long[] tmp = new long[this.nextIndex];`
			`System.arraycopy(this.duplicateIndexes, 0, tmp, 0, this.nextIndex);`
			`this.duplicateIndexes = tmp;`
			`Arrays.sort(this.duplicateIndexes);`
			`}`

			`private boolean areComparableForDuplicates(final ReadEnds lhs, final ReadEnds rhs, final boolean compareRead2) {`
			`boolean retval = lhs.read1Sequence == rhs.read1Sequence &&`
			`lhs.read1Coordinate == rhs.read1Coordinate &&`
			`lhs.orientation == rhs.orientation;`

			`if (compareRead2) {`
			`retval = lhs.read2Sequence == rhs.read2Sequence &&`
			`lhs.read2Coordinate == rhs.read2Coordinate;`
			`}`

			`return retval;`
			`}`

			`private void addIndexAsDuplicate(final long bamIndex) {`
			`if (this.nextIndex > this.duplicateIndexes.length - 1) {`
			`long[] tmp = new long[this.duplicateIndexes.length * 2];`
			`System.arraycopy(this.duplicateIndexes, 0, tmp, 0, this.nextIndex);`
			`this.duplicateIndexes = tmp;`
			`}`

			`this.duplicateIndexes[this.nextIndex++] = bamIndex;`
			`}`

			`/**`
			`* Takes a list of ReadEnds objects and removes from it all objects that should`
			`* not be marked as duplicates.`
			`*`
			`* @param list`
			`*/`
			`private void markDuplicatePairs(final List<ReadEnds> list) {`
			`short maxScore = 0;`
			`ReadEnds best = null;`

			`for (final ReadEnds end : list) {`
			`if (end.score > maxScore \|\| best == null) {`
			`maxScore = end.score;`
			`best = end;`
			`}`
			`}`

			`for (final ReadEnds end : list) {`
			`if (end != best) {`
			`addIndexAsDuplicate(end.read1IndexInFile);`
			`addIndexAsDuplicate(end.read2IndexInFile);`
			`}`
			`}`
			`}`

			`/**`
			`* Takes a list of ReadEnds objects and removes from it all objects that should`
			`* not be marked as duplicates.`
			`*`
			`* @param list`
			`*/`
			`private void markDuplicateFragments(final List<ReadEnds> list, final boolean containsPairs) {`
			`if (containsPairs) {`
			`for (final ReadEnds end : list) {`
			`if (!end.isPaired()) addIndexAsDuplicate(end.read1IndexInFile);`
			`}`
			`}`
			`else {`
			`short maxScore = 0;`
			`ReadEnds best = null;`
			`for (final ReadEnds end : list) {`
			`if (end.score > maxScore \|\| best == null) {`
			`maxScore = end.score;`
			`best = end;`
			`}`
			`}`

			`for (final ReadEnds end : list) {`
			`if (end != best) {`
			`addIndexAsDuplicate(end.read1IndexInFile);`
			`}`
			`}`
			`}`
			`}`
			`}`