From f75ad0dee34ffdedb05ac861cb98bcff89c6d063 Mon Sep 17 00:00:00 2001 From: depristo Date: Mon, 14 Mar 2011 19:36:56 +0000 Subject: [PATCH] Now in Picard, and released to the public git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5439 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/playground/tools/ReorderSam.java | 191 ------------------ .../playground/tools/ReplaceReadGroups.java | 78 ------- 2 files changed, 269 deletions(-) delete mode 100644 java/src/org/broadinstitute/sting/playground/tools/ReorderSam.java delete mode 100644 java/src/org/broadinstitute/sting/playground/tools/ReplaceReadGroups.java diff --git a/java/src/org/broadinstitute/sting/playground/tools/ReorderSam.java b/java/src/org/broadinstitute/sting/playground/tools/ReorderSam.java deleted file mode 100644 index 535fe2762..000000000 --- a/java/src/org/broadinstitute/sting/playground/tools/ReorderSam.java +++ /dev/null @@ -1,191 +0,0 @@ -package org.broadinstitute.sting.playground.tools; - -import net.sf.picard.PicardException; -import net.sf.picard.cmdline.CommandLineProgram; -import net.sf.picard.cmdline.Option; -import net.sf.picard.cmdline.Usage; -import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.reference.ReferenceSequenceFileFactory; -import net.sf.samtools.*; - -import java.io.File; -import java.util.HashMap; -import java.util.Map; - -/** - * User: mdepristo - * - * Reorders a SAM/BAM input file according to the order of contigs in a second reference sequence - */ -public class ReorderSam extends CommandLineProgram { - @Usage(programVersion="1.0") public String USAGE = "Reorders reads in a SAM/BAM file to match the contig ordering in a provided reference file, as determined by exact name matching of contigs. Reads mapped to contigs absent in the new reference are dropped"; - @Option(shortName="I", doc="Input file (bam or sam) to extract reads from.", optional=false) - public File IN = null; - @Option(shortName="O",doc="Output file (bam or sam) to write extracted reads to.", optional=false) - public File OUT = null; - @Option(shortName="R", doc="Reference sequence to reorder reads to match", optional=false) - public File REFERENCE = null; - @Option(shortName="S", doc="If true, then allows only a partial overlap of the BAM contigs with the new reference sequence contigs. By default, this tool requires a corresponding contig in the new reference for each read contig", optional=true) - public Boolean ALLOW_INCOMPLETE_DICT_CONCORDANCE = false; - @Option(shortName="U", doc="If true, then permits mapping from a read contig to a new reference contig with the same name but a different length. Highly dangerous, only use if you know what you are doing.", optional=true) - public Boolean ALLOW_CONTIG_LENGTH_DISCORDANCE = false; - - /** Required main method implementation. */ - public static void main(final String[] argv) { - System.exit(new ReorderSam().instanceMain(argv)); - } - - protected int doWork() { - SAMFileReader inReader = new SAMFileReader(IN); - - ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE); - SAMSequenceDictionary refDict = reference.getSequenceDictionary(); - - if ( refDict == null ) { - System.out.println("No reference sequence dictionary found. Aborting."); - inReader.close(); - System.exit(1); - } - - printDictionary("SAM/BAM file", inReader.getFileHeader().getSequenceDictionary()); - printDictionary("Reference", refDict); - Map newOrder = buildSequenceDictionaryMap(refDict, inReader.getFileHeader().getSequenceDictionary()); - - // has to be after we create the newOrder - SAMFileHeader outHeader = inReader.getFileHeader().clone(); - outHeader.setSequenceDictionary(refDict); - SAMFileWriter outWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(outHeader, true, OUT) ; - - // - // write the reads in contig order - // - System.out.println("Writing reads..."); - for ( SAMSequenceRecord contig : refDict.getSequences() ) { - SAMRecordIterator it = inReader.query(contig.getSequenceName(), 0, 0, false); - writeReads(outWriter, it, newOrder, contig.getSequenceName()); - } - // don't forget the unmapped reads - writeReads( outWriter, inReader.queryUnmapped(), newOrder, "unmapped" ); - - // cleanup - inReader.close(); - outWriter.close(); - return 0; - } - - /** - * Low-level helper function that returns the new reference index for oldIndex according to the - * ordering map newOrder. Read is provided in case an error occurs, so that an informative message - * can be made. - * - * @param read - * @param oldIndex - * @param newOrder - * @return - */ - private static int newOrderIndex(SAMRecord read, int oldIndex, Map newOrder) { - if ( oldIndex == -1 ) - return -1; // unmapped read - else if ( ! newOrder.containsKey(oldIndex) ) - throw new PicardException("BUG: no mapping found for read " + read.format()); - else - return newOrder.get(oldIndex); - } - - /** - * Helper function that writes reads from iterator it into writer out, updating each SAMRecord along the way - * according to the newOrder mapping from dictionary index -> index. Name is used for printing only. - * - * @param out - * @param it - * @param newOrder - * @param name - */ - private static void writeReads(SAMFileWriter out, SAMRecordIterator it, Map newOrder, String name) { - long counter = 0; - System.out.print(" Processing " + name); - System.out.flush(); - - while ( it.hasNext() ) { - counter++; - SAMRecord read = it.next(); - read.setHeader(out.getFileHeader()); - //System.out.println("Writing read " + read.format()); - - int oldRefIndex = read.getReferenceIndex(); - int newRefIndex = newOrderIndex(read, oldRefIndex, newOrder); - read.setReferenceIndex(newRefIndex); - - int oldMateIndex = read.getMateReferenceIndex(); - int newMateIndex = newOrderIndex(read, oldMateIndex, newOrder); - if ( oldMateIndex != -1 && newMateIndex == -1 ) { // becoming unmapped - read.setMateAlignmentStart(0); - read.setMateUnmappedFlag(true); - } - read.setMateReferenceIndex(newMateIndex); - - out.addAlignment(read); - } - - it.close(); - System.out.println(" => Wrote " + counter + " reads"); - } - - /** - * Constructs a mapping from read sequence records index -> new sequence dictionary index for use in - * reordering the reference index and mate reference index in each read. -1 means unmapped. - * @param refDict - * @param readsDict - * @return - */ - private Map buildSequenceDictionaryMap(SAMSequenceDictionary refDict, SAMSequenceDictionary readsDict) { - Map newOrder = new HashMap(); - - System.out.println("Reordering SAM/BAM file:"); - for ( SAMSequenceRecord refRec : refDict.getSequences() ) { - SAMSequenceRecord readsRec = readsDict.getSequence(refRec.getSequenceName()); - if ( readsRec != null ) { - if ( refRec.getSequenceLength() != readsRec.getSequenceLength() ) { - String msg = String.format("Discordant contig lengths: read %s LN=%d, ref %s LN=%d", - readsRec.getSequenceName(), readsRec.getSequenceLength(), - refRec.getSequenceName(), refRec.getSequenceLength()); - if ( ALLOW_CONTIG_LENGTH_DISCORDANCE ) { - System.err.println("WARN: " + msg); - } else { - throw new PicardException(msg); - } - } - - System.out.printf(" Reordering read contig %s [index=%d] to => ref contig %s [index=%d]%n", - readsRec.getSequenceName(), readsRec.getSequenceIndex(), - refRec.getSequenceName(), refRec.getSequenceIndex()); - newOrder.put(readsRec.getSequenceIndex(), refRec.getSequenceIndex()); - } - } - - for ( SAMSequenceRecord readsRec : readsDict.getSequences() ) { - if ( ! newOrder.containsKey(readsRec.getSequenceIndex()) ) { - if ( ALLOW_INCOMPLETE_DICT_CONCORDANCE ) - newOrder.put(readsRec.getSequenceIndex(), -1); - else - throw new PicardException("New reference sequence does not contain a matching contig for " + readsRec.getSequenceName()); - } - } - - return newOrder; - } - - /** - * Helper function to print out a sequence dictionary - * @param name - * @param dict - */ - private static void printDictionary(String name, SAMSequenceDictionary dict) { - System.out.println(name); - for ( SAMSequenceRecord contig : dict.getSequences() ) { - System.out.printf( " SN=%s LN=%d%n", contig.getSequenceName(), contig.getSequenceLength()); - } - } -} - - diff --git a/java/src/org/broadinstitute/sting/playground/tools/ReplaceReadGroups.java b/java/src/org/broadinstitute/sting/playground/tools/ReplaceReadGroups.java deleted file mode 100644 index efbd0cdb9..000000000 --- a/java/src/org/broadinstitute/sting/playground/tools/ReplaceReadGroups.java +++ /dev/null @@ -1,78 +0,0 @@ -package org.broadinstitute.sting.playground.tools; - -import net.sf.picard.cmdline.CommandLineProgram; -import net.sf.picard.cmdline.Option; -import net.sf.picard.cmdline.Usage; -import net.sf.samtools.*; - -import java.io.File; -import java.util.Arrays; - -/** - * User: mdepristo - * - * Replaces read groups in a BAM file - */ -public class ReplaceReadGroups extends CommandLineProgram { - @Usage(programVersion="1.0") public String USAGE = "Creates a new read group, and assigns all reads from the I BAM file to this read group in the O BAM"; - @Option(shortName="I", doc="Input file (bam or sam).", optional=false) - public File IN = null; - @Option(shortName="O",doc="Output file (bam or sam).", optional=false) - public File OUT = null; - - @Option(shortName="ID",doc="Read Group ID", optional=false) - public String RGID = null; - - @Option(shortName="LB",doc="Read Group Library", optional=false) - public String RGLB = null; - - @Option(shortName="PL",doc="Read Group platform", optional=false) - public String RGPL = null; - - @Option(shortName="SM",doc="Read Group sample", optional=false) - public String RGSM = null; - - private static final String RGFIELD = "RG"; // todo -- use binary tag that's private in picard - - // todo -- is it worth supporting these fields? - // CN Name of sequencing center producing the read. - // DS Description. - // DT Date the run was produced (ISO8601 date or date/time). - // PU Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD). Unique identi er. - - /** Required main method implementation. */ - public static void main(final String[] argv) { - System.exit(new ReplaceReadGroups().instanceMain(argv)); - } - - protected int doWork() { - SAMFileReader inReader = new SAMFileReader(IN); - - // create the read group we'll be using - SAMReadGroupRecord rg = new SAMReadGroupRecord(RGID); - rg.setLibrary(RGLB); - rg.setPlatform(RGPL); - rg.setSample(RGSM); - System.out.printf("Created read group ID=%s PL=%s LB=%s SM=%s%n", rg.getId(), rg.getPlatform(), rg.getLibrary(), rg.getSample()); - - // create the new header and output file - SAMFileHeader outHeader = inReader.getFileHeader().clone(); - outHeader.setReadGroups(Arrays.asList(rg)); - SAMFileWriter outWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(outHeader, true, OUT) ; - - // - // write the reads in contig order - // - for ( SAMRecord read : inReader ) { - read.setAttribute(RGFIELD, rg.getId()); - outWriter.addAlignment(read); - } - - // cleanup - inReader.close(); - outWriter.close(); - return 0; - } -} - -