Matthew Bainbridge's duplicate removal utility for 454 data. This code should eventually be moved into a read walker. For now, it's being introduced into the repository as-is (well, with one minor change to make the handling of command-line arguments a little more straightforward).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1794 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-10-08 18:32:37 +00:00
parent 8461cc3a22
commit 94d82d1915
1 changed files with 122 additions and 0 deletions

View File

@ -0,0 +1,122 @@
package org.broadinstitute.sting.playground.tools;
import net.sf.samtools.*;
import java.io.*;
import java.util.*;
/**
*
* Marks duplicate reads in a sam or bam file based on start point and strand of the alignment.
* Requires: java samtools "picard". java 1.5 or better.
*
* Note, will not unmark duplicates. The final number of duplicates marked at the end is not necess. the total number of marked duplicates in the final output file.
* bam or sam type is inferred by the number of arguments, read order, header, filetype is preserved in the output file.
*
* Usage: <sam or bam file> [index file if using bam] <output file>
* @author bainbrid
*
*/
public class BCMMarkDupes
{
final static String inHT = "ht";
/**
* Usage: <sam or bam file> [index file if using bam] <output file>
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception
{
int cnt = 0;
int dcnt = 0;
// begin hack by kiran
/*
if(args.length != 2 && args.length !=3 )
{
throw new Exception("Invalid number of arguments.\n"+usage());
}
boolean isBam = args.length == 3;
String inputfile = args[0];
String index = null;
if(isBam) index = args[1];
String outputfile = args[1];
if(isBam) index = args[2];
*/
String inputfile = args[0];
String index = inputfile + ".bai";
String outputfile = args[1];
boolean isBam = true;
// end hack by kiran
//set the reader and writer, and header
SAMFileReader sfr = null;
if(isBam)
sfr = new SAMFileReader(new File(inputfile), new File(index));
else
sfr = new SAMFileReader(new File(inputfile));
Iterator<SAMRecord> iter = sfr.iterator();
SAMFileWriter fw = null;
SAMFileHeader header = sfr.getFileHeader();
if(isBam)
fw = (new SAMFileWriterFactory()).makeBAMWriter(header, true, new File(outputfile));
else
fw = (new SAMFileWriterFactory()).makeSAMWriter(header, true, new File(outputfile));
Hashtable<String,String> ht = new Hashtable<String,String>(2000000);
//iterate through file and mark dupes
while(iter.hasNext())
{
SAMRecord rec = iter.next();
int start = 0;
String strand = "N";
if(rec.getReadNegativeStrandFlag())
{
start = rec.getAlignmentEnd();
//start = rec.getUnclippedEnd();
}
else
{
strand = "P";
start = rec.getAlignmentStart();
//start = rec.getUnclippedStart();
}
if(start == 0) continue;
cnt++;
String chromo = rec.getReferenceName();
String key = chromo+strand+start;
String s = ht.get(key);
if(s == null)
{
ht.put(key, inHT);
//rec.setDuplicateReadFlag(false);
}
else
{
rec.setDuplicateReadFlag(true);
dcnt++;
}
fw.addAlignment(rec);
}
fw.close();
System.err.println("Total records:"+cnt);
System.err.println("Records marked as duplicates:"+dcnt);
System.err.println("HT:"+ht.size());
}
public static String usage()
{
return "usage:\n\t<sam or bam file> [index file if using bam] <output file>";
}
}