Added an optional argument -rgbl --read_group_black_list to filter read groups.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3079 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kshakir 2010-03-26 19:38:57 +00:00
parent 73a14a985b
commit 20e3ba15ca
4 changed files with 363 additions and 0 deletions

View File

@ -44,6 +44,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.bed.BedParser;
@ -255,6 +256,8 @@ public class GenomeAnalysisEngine {
filters.addAll(WalkerManager.getReadFilters(walker,filterManager));
if (args.filterZeroMappingQualityReads != null && args.filterZeroMappingQualityReads)
filters.add(new ZeroMappingQualityReadFilter());
if (args.readGroupBlackList != null && args.readGroupBlackList.size() > 0)
filters.add(new ReadGroupBlackListFilter(args.readGroupBlackList));
for(String filterName: args.readFilters)
filters.add(filterManager.createByName(filterName));
return Collections.unmodifiableSet(filters);

View File

@ -158,6 +158,10 @@ public class GATKArgumentCollection {
@Argument(fullName = "disable_experimental_sharding",shortName="ds", doc="Disable the experimental sharding strategy.", required = false)
public boolean disableExperimentalSharding = false;
@Element(required = false)
@Argument(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read with read groups matching <TAG>:<SUBSTRING> or a .txt file containing the filter strings one per line.", required = false)
public List<String> readGroupBlackList = null;
/**
* marshal the data out to a object
*

View File

@ -0,0 +1,102 @@
package org.broadinstitute.sting.gatk.filters;
import java.util.*;
import java.util.Map.Entry;
import java.io.File;
import java.io.FileNotFoundException;
import net.sf.picard.filter.SamRecordFilter;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.xReadLines;
/**
* Removes records matching the read group tag and match string.
* For example each of the filter values:
* PU:1000G-mpimg-080821-1_1
* PU:1000G-mpimg-080821
* PU:mpimg-080821-1_1
* PU:mpimg-080821
*
* would filter out a read with the read group PU:1000G-mpimg-080821-1_1
*/
public class ReadGroupBlackListFilter implements SamRecordFilter {
private Set<Entry<String, Collection<String>>> filterEntries;
public ReadGroupBlackListFilter(List<String> blackLists) {
Map<String, Collection<String>> filters = new TreeMap<String, Collection<String>>();
for (String blackList : blackLists)
addFilter(filters, blackList, null, 0);
this.filterEntries = filters.entrySet();
}
public boolean filterOut(SAMRecord samRecord) {
for (Entry<String, Collection<String>> filterEntry : filterEntries) {
String attributeType = filterEntry.getKey();
Object attribute = samRecord.getAttribute(attributeType);
if (attribute == null) {
SAMReadGroupRecord samReadGroupRecord = samRecord.getReadGroup();
if (samReadGroupRecord != null) {
attribute = samReadGroupRecord.getAttribute(attributeType);
}
}
if (attribute != null)
for (String filterValue : filterEntry.getValue())
if (attribute.toString().contains(filterValue))
return true;
}
return false;
}
private void addFilter(Map<String, Collection<String>> filters, String filter, File parentFile, int parentLineNum) {
if (filter.toLowerCase().endsWith(".txt")) {
File file = new File(filter);
try {
int lineNum = 0;
xReadLines lines = new xReadLines(file);
for (String line : lines) {
lineNum++;
if (line.trim().length() == 0)
continue;
if (line.startsWith("#"))
continue;
addFilter(filters, line, file, lineNum);
}
} catch (FileNotFoundException e) {
String message = "Error loading black list: " + file.getAbsolutePath();
if (parentFile != null) {
message += ", " + parentFile.getAbsolutePath() + ":" + parentLineNum;
}
throw new StingException(message, e);
}
} else {
String[] filterEntry = filter.split(":", 2);
String message = null;
if (filterEntry.length != 2) {
message = "Invalid read group filter: " + filter;
} else if (filterEntry[0].length() != 2) {
message = "Tag is not two characters: " + filter;
}
if (message != null) {
if (parentFile != null) {
message += ", " + parentFile.getAbsolutePath() + ":" + parentLineNum;
}
message += ", format is <TAG>:<SUBSTRING>";
throw new StingException(message);
}
if (!filters.containsKey(filterEntry[0]))
filters.put(filterEntry[0], new TreeSet<String>());
filters.get(filterEntry[0]).add(filterEntry[1]);
}
}
}

View File

@ -0,0 +1,254 @@
package org.broadinstitute.sting.gatk.filters;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.Assert;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMReadGroupRecord;
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
/**
 * Tests ReadGroupBlackListFilter against an artificial header with
 * READ_GROUP_COUNT read groups named ReadGroup1..ReadGroupN, each carrying
 * a PL and a PU attribute assigned in beforeClass().
 */
public class ReadGroupBlackListFilterTest extends BaseTest {
    private static final int READ_GROUP_COUNT = 5;
    /** Number of reads created per read group by the multi-record tests. */
    private static final int RECORDS_PER_GROUP = 3;
    private static final String READ_GROUP_PREFIX = "ReadGroup";
    private static final String SAMPLE_NAME_PREFIX = "Sample";
    private static final String PLATFORM_PREFIX = "Platform";
    private static final String PLATFORM_UNIT_PREFIX = "Lane";
    private static SAMFileHeader header;

    @BeforeClass
    public static void beforeClass() {
        header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);

        List<String> readGroupIDs = new ArrayList<String>();
        List<String> sampleNames = new ArrayList<String>();
        for (int i = 1; i <= READ_GROUP_COUNT; i++) {
            readGroupIDs.add(READ_GROUP_PREFIX + i);
            sampleNames.add(SAMPLE_NAME_PREFIX + i);
        }
        ArtificialSAMUtils.createEnumeratedReadGroups(header, readGroupIDs, sampleNames);

        // Platforms cycle through 2 values and platform units through 3, so several read groups share each value.
        for (int i = 1; i <= READ_GROUP_COUNT; i++) {
            SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + i);
            groupRecord.setAttribute("PL", PLATFORM_PREFIX + (((i-1)%2)+1));
            groupRecord.setAttribute("PU", PLATFORM_UNIT_PREFIX + (((i-1)%3)+1));
        }

        GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary());
    }

    /** An entry without a ':' separator is rejected. */
    @Test(expected = StingException.class)
    public void testBadFilter() {
        List<String> badFilters = Collections.singletonList("bad");
        new ReadGroupBlackListFilter(badFilters);
    }

    /** An entry whose tag is not two characters long is rejected. */
    @Test(expected = StingException.class)
    public void testBadFilterTag() {
        List<String> badFilters = Collections.singletonList("bad:filter");
        new ReadGroupBlackListFilter(badFilters);
    }

    /** A black list file that does not exist is rejected. */
    @Test(expected = StingException.class)
    public void testBadFilterFile() {
        List<String> badFilters = Collections.singletonList("/foo/bar/rgbl.txt");
        new ReadGroupBlackListFilter(badFilters);
    }

    @Test
    public void testFilterReadGroup() {
        assertFiltersFirstReadGroupOnly("RG:" + READ_GROUP_PREFIX + "1");
    }

    @Test
    public void testFilterPlatformUnit() {
        assertFiltersFirstReadGroupOnly("PU:" + PLATFORM_UNIT_PREFIX + "1");
    }

    @Test
    public void testFilterOutByReadGroup() {
        List<String> filterList = new ArrayList<String>();
        filterList.add("RG:" + READ_GROUP_PREFIX + "1");
        filterList.add("RG:" + READ_GROUP_PREFIX + "3");

        List<String> blackListed = new ArrayList<String>();
        blackListed.add(READ_GROUP_PREFIX + "1");
        blackListed.add(READ_GROUP_PREFIX + "3");

        assertBlackListed(filterList, "RG", "Read group", blackListed, RECORDS_PER_GROUP * 2);
    }

    @Test
    public void testFilterOutByAttribute() {
        List<String> filterList = new ArrayList<String>();
        filterList.add("PU:" + PLATFORM_UNIT_PREFIX + "1");

        List<String> blackListed = Collections.singletonList(PLATFORM_UNIT_PREFIX + "1");

        // beforeClass() assigns Lane1 to read groups 1 and 4, hence two groups' worth of reads.
        assertBlackListed(filterList, "PU", "Platform unit", blackListed, RECORDS_PER_GROUP * 2);
    }

    @Test
    public void testFilterOutByFile() {
        assertFileFiltersReadGroups3And4(validationDataLocation + "readgroupblacklisttest.txt");
    }

    @Test
    public void testFilterOutByListFile() {
        assertFileFiltersReadGroups3And4(validationDataLocation + "readgroupblacklisttestlist.txt");
    }

    /** Checks that the given filter string removes a ReadGroup1 read and keeps a ReadGroup2 read. */
    private static void assertFiltersFirstReadGroupOnly(String filterString) {
        SAMRecord filteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, 1, 20);
        filteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "1");

        SAMRecord unfilteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readDos", 0, 2, 20);
        unfilteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "2");

        List<String> filterList = new ArrayList<String>();
        filterList.add(filterString);

        ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);
        Assert.assertTrue(filter.filterOut(filteredRecord));
        Assert.assertFalse(filter.filterOut(unfilteredRecord));
    }

    /** Checks that the black list file at the given path removes exactly the ReadGroup3 and ReadGroup4 reads. */
    private static void assertFileFiltersReadGroups3And4(String blackListPath) {
        List<String> filterList = new ArrayList<String>();
        filterList.add(blackListPath);

        List<String> blackListed = new ArrayList<String>();
        blackListed.add("ReadGroup3");
        blackListed.add("ReadGroup4");

        assertBlackListed(filterList, "RG", "Read group", blackListed, RECORDS_PER_GROUP * 2);
    }

    /**
     * Runs a filter built from filterList over RECORDS_PER_GROUP reads from every read group and
     * checks that a read is removed exactly when its value for the given tag is black-listed.
     *
     * @param filterList        filter strings handed to the ReadGroupBlackListFilter constructor
     * @param tag               "RG" to compare read group ids, otherwise a read group attribute name
     * @param tagDescription    label used in failure messages
     * @param blackListedValues tag values whose reads must be removed
     * @param filteredExpected  number of reads expected to be removed; all remaining reads must be kept
     */
    private static void assertBlackListed(List<String> filterList, String tag, String tagDescription,
                                          List<String> blackListedValues, int filteredExpected) {
        List<SAMRecord> records = createRecordsForAllReadGroups();
        ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);

        int filtered = 0;
        int unfiltered = 0;
        for (SAMRecord record : records) {
            String value = readGroupTagValue(record, tag);
            boolean shouldBeFiltered = blackListedValues.contains(value);
            if (filter.filterOut(record)) {
                if (!shouldBeFiltered)
                    Assert.fail(tagDescription + " " + value + " was filtered");
                filtered++;
            } else {
                if (shouldBeFiltered)
                    Assert.fail(tagDescription + " " + value + " was not filtered");
                unfiltered++;
            }
        }

        int unfilteredExpected = records.size() - filteredExpected;
        Assert.assertEquals("Filtered", filteredExpected, filtered);
        Assert.assertEquals("Unfiltered", unfilteredExpected, unfiltered);
    }

    /** Creates RECORDS_PER_GROUP reads for each read group in the header, at consecutive alignment starts. */
    private static List<SAMRecord> createRecordsForAllReadGroups() {
        List<SAMRecord> records = new ArrayList<SAMRecord>();
        int alignmentStart = 0;
        for (int x = 1; x <= READ_GROUP_COUNT; x++) {
            SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + x);
            for (int y = 1; y <= RECORDS_PER_GROUP; y++) {
                SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, ++alignmentStart, 20);
                record.setAttribute("RG", groupRecord.getReadGroupId());
                records.add(record);
            }
        }
        return records;
    }

    /** Returns the read group id for tag "RG", otherwise the named attribute of the record's read group. */
    private static String readGroupTagValue(SAMRecord record, String tag) {
        SAMReadGroupRecord readGroup = record.getReadGroup();
        if ("RG".equals(tag))
            return readGroup.getReadGroupId();
        return (String) readGroup.getAttribute(tag);
    }
}