From c776f9fb90ae0776baf3669a0032a9bdda722383 Mon Sep 17 00:00:00 2001 From: depristo Date: Wed, 2 Dec 2009 22:51:41 +0000 Subject: [PATCH] Simple utilities for dealing with Complete Genomics data git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2230 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/playground/tools/CGUtilities.java | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/playground/tools/CGUtilities.java diff --git a/java/src/org/broadinstitute/sting/playground/tools/CGUtilities.java b/java/src/org/broadinstitute/sting/playground/tools/CGUtilities.java new file mode 100755 index 000000000..dd1d69690 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/tools/CGUtilities.java @@ -0,0 +1,122 @@ +package org.broadinstitute.sting.playground.tools; + +import net.sf.picard.cmdline.CommandLineProgram; +import net.sf.picard.cmdline.Usage; +import net.sf.picard.cmdline.Option; +import net.sf.samtools.*; + +import java.io.*; +import java.util.List; +import java.util.ArrayList; + +public class CGUtilities extends CommandLineProgram { + @Usage(programVersion="1.0") public String USAGE = "Mark, document me"; + @Option(shortName="I", doc="Input file (bam or sam) to mark primary alignments in. Must be sorted by name", + optional=false) public File IN = null; + @Option(shortName="O",doc="Output file (BAM). If not specified, output is printed to stdout.", + optional=false) public File OUT = null; + @Option(shortName="M",doc="Maximum number of reasd to process.", + optional=true) public int MAX_READS = -1; + + public static void main(final String[] argv) { + System.exit(new CGUtilities().instanceMain(argv)); + } + + protected int doWork() { + InputStream ins; + try { + ins = new FileInputStream(IN); + } catch ( FileNotFoundException ie ) { + System.out.println("Failed to open input file "+IN+": "+ie.getCause()); + return 1; + } + + SAMFileReader inReader = new SAMFileReader(ins); + SAMFileHeader header = inReader.getFileHeader(); + header.setSortOrder(SAMFileHeader.SortOrder.unsorted); + SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, OUT); + + int i = 0; + String currentReadName = null; + List possibleAlignments = new ArrayList(); + + for ( SAMRecord read : inReader ) { + String readName = read.getReadName(); + //System.out.println("@" + readName); + + if ( currentReadName == null ) + currentReadName = readName; + + if ( ! currentReadName.equals(readName) ) { + List firsts = subsetByPair(possibleAlignments, true); + List seconds = subsetByPair(possibleAlignments, false); + + if ( firsts.size() + seconds.size() != possibleAlignments.size() ) { + throw new RuntimeException(String.format("Dropped reads %d + %d != %d at %s%n", firsts.size(), seconds.size(), possibleAlignments.size(), currentReadName)); + } + + if ( firsts.size() > 0 ) out.addAlignment(selectBestAlignment(firsts)); + if ( seconds.size() > 0 ) out.addAlignment(selectBestAlignment(seconds)); + possibleAlignments.clear(); + currentReadName = readName; + } + + possibleAlignments.add(read); + + //out.addAlignment(read); + if ( i++ > MAX_READS && MAX_READS != -1 ) break; + } + inReader.close(); + out.close(); + + return 0; + } + + protected SAMRecord selectBestAlignment(List possibleAlignments) { + SAMRecord best = null; + for ( SAMRecord possible : possibleAlignments ) { + if ( best == null || isBetterAlignment(best, possible) ) + best = possible; + } + + if ( best != null ) { + best.setNotPrimaryAlignmentFlag(false); // we're the primary alignment! + } + + return best; + } + + protected List subsetByPair(List reads, boolean getFirst) { + List sub = new ArrayList(); + + for ( SAMRecord read : reads ) { + boolean add = read.getFirstOfPairFlag() ? getFirst : ! getFirst; + if ( ! read.getReadPairedFlag() ) + throw new RuntimeException("Read is unpaired! " + read.format()); + if ( add ) sub.add(read); + } + + return sub; + } + + protected boolean isBetterAlignment(SAMRecord champ, SAMRecord contender) { + boolean replace = false; + if ( contender.getMappingQuality() >= champ.getMappingQuality() ) { + if ( contender.getMappingQuality() > champ.getMappingQuality() ) { + replace = true; + } else if ( contender.getReadLength() > champ.getReadLength() ) { + replace = true; + } else { + ; + //System.out.printf("Equivalent reads%n %s%n %s%n", champ.format(), contender.format()); + } + } + + //if ( replace ) + // System.out.printf("Better read %n %s%n %s%n", contender.format(), champ.format()); + + //System.out.printf("Comparing %s vs. %s => contender is better? %s%n", champ, contender, replace); + + return replace; + } +} \ No newline at end of file