From 20e3ba15cafdcaf39fad8cde180a5e8036a17a57 Mon Sep 17 00:00:00 2001 From: kshakir Date: Fri, 26 Mar 2010 19:38:57 +0000 Subject: [PATCH] Added an optional argument -rgbl --read_group_black_list to filter read groups. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3079 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisEngine.java | 3 + .../arguments/GATKArgumentCollection.java | 4 + .../filters/ReadGroupBlackListFilter.java | 102 +++++++ .../filters/ReadGroupBlackListFilterTest.java | 254 ++++++++++++++++++ 4 files changed, 363 insertions(+) create mode 100644 java/src/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilter.java create mode 100644 java/test/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilterTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 775f43ce6..7120886ef 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -44,6 +44,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.filters.FilterManager; +import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.bed.BedParser; @@ -255,6 +256,8 @@ public class GenomeAnalysisEngine { filters.addAll(WalkerManager.getReadFilters(walker,filterManager)); if (args.filterZeroMappingQualityReads != null && args.filterZeroMappingQualityReads) filters.add(new ZeroMappingQualityReadFilter()); + if (args.readGroupBlackList != null && args.readGroupBlackList.size() > 0) + filters.add(new ReadGroupBlackListFilter(args.readGroupBlackList)); for(String filterName: args.readFilters) filters.add(filterManager.createByName(filterName)); return Collections.unmodifiableSet(filters); diff --git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index f742901f1..80d3c6b3f 100755 --- a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -158,6 +158,10 @@ public class GATKArgumentCollection { @Argument(fullName = "disable_experimental_sharding",shortName="ds", doc="Disable the experimental sharding strategy.", required = false) public boolean disableExperimentalSharding = false; + @Element(required = false) + @Argument(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read with read groups matching : or a .txt file containing the filter strings one per line.", required = false) + public List readGroupBlackList = null; + /** * marshal the data out to a object * diff --git a/java/src/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilter.java b/java/src/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilter.java new file mode 100644 index 000000000..139df5c05 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilter.java @@ -0,0 +1,102 @@ +package org.broadinstitute.sting.gatk.filters; + +import java.util.*; +import java.util.Map.Entry; +import java.io.File; +import java.io.FileNotFoundException; + +import net.sf.picard.filter.SamRecordFilter; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.xReadLines; + +/** + * Removes records matching the read group tag and match string. + * For example each of the filter values: + * PU:1000G-mpimg-080821-1_1 + * PU:1000G-mpimg-080821 + * PU:mpimg-080821-1_1 + * PU:mpimg-080821 + * + * would filter out a read with the read group PU:1000G-mpimg-080821-1_1 + */ +public class ReadGroupBlackListFilter implements SamRecordFilter { + private Set>> filterEntries; + + public ReadGroupBlackListFilter(List blackLists) { + Map> filters = new TreeMap>(); + for (String blackList : blackLists) + addFilter(filters, blackList, null, 0); + this.filterEntries = filters.entrySet(); + } + + public boolean filterOut(SAMRecord samRecord) { + for (Entry> filterEntry : filterEntries) { + String attributeType = filterEntry.getKey(); + + Object attribute = samRecord.getAttribute(attributeType); + if (attribute == null) { + SAMReadGroupRecord samReadGroupRecord = samRecord.getReadGroup(); + if (samReadGroupRecord != null) { + attribute = samReadGroupRecord.getAttribute(attributeType); + } + } + + if (attribute != null) + for (String filterValue : filterEntry.getValue()) + if (attribute.toString().contains(filterValue)) + return true; + } + + return false; + } + + private void addFilter(Map> filters, String filter, File parentFile, int parentLineNum) { + if (filter.toLowerCase().endsWith(".txt")) { + File file = new File(filter); + try { + int lineNum = 0; + xReadLines lines = new xReadLines(file); + for (String line : lines) { + lineNum++; + + if (line.trim().length() == 0) + continue; + + if (line.startsWith("#")) + continue; + + addFilter(filters, line, file, lineNum); + } + } catch (FileNotFoundException e) { + String message = "Error loading black list: " + file.getAbsolutePath(); + if (parentFile != null) { + message += ", " + parentFile.getAbsolutePath() + ":" + parentLineNum; + } + throw new StingException(message, e); + } + } else { + String[] filterEntry = filter.split(":", 2); + + String message = null; + if (filterEntry.length != 2) { + message = "Invalid read group filter: " + filter; + } else if (filterEntry[0].length() != 2) { + message = "Tag is not two characters: " + filter; + } + + if (message != null) { + if (parentFile != null) { + message += ", " + parentFile.getAbsolutePath() + ":" + parentLineNum; + } + message += ", format is :"; + throw new StingException(message); + } + + if (!filters.containsKey(filterEntry[0])) + filters.put(filterEntry[0], new TreeSet()); + filters.get(filterEntry[0]).add(filterEntry[1]); + } + } +} diff --git a/java/test/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilterTest.java b/java/test/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilterTest.java new file mode 100644 index 000000000..431c9441b --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/filters/ReadGroupBlackListFilterTest.java @@ -0,0 +1,254 @@ +package org.broadinstitute.sting.gatk.filters; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.StingException; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.Assert; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMReadGroupRecord; + +import java.util.List; +import java.util.ArrayList; +import java.util.Collections; + +public class ReadGroupBlackListFilterTest extends BaseTest { + private static final int READ_GROUP_COUNT = 5; + private static final String READ_GROUP_PREFIX = "ReadGroup"; + private static final String SAMPLE_NAME_PREFIX = "Sample"; + private static final String PLATFORM_PREFIX = "Platform"; + private static final String PLATFORM_UNIT_PREFIX = "Lane"; + private static SAMFileHeader header; + + @BeforeClass + public static void beforeClass() { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + + List readGroupIDs = new ArrayList(); + List sampleNames = new ArrayList(); + + for (int i = 1; i <= READ_GROUP_COUNT; i++) { + readGroupIDs.add(READ_GROUP_PREFIX + i); + sampleNames.add(SAMPLE_NAME_PREFIX + i); + } + + ArtificialSAMUtils.createEnumeratedReadGroups(header, readGroupIDs, sampleNames); + + for (int i = 1; i <= READ_GROUP_COUNT; i++) { + SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + i); + groupRecord.setAttribute("PL", PLATFORM_PREFIX + (((i-1)%2)+1)); + groupRecord.setAttribute("PU", PLATFORM_UNIT_PREFIX + (((i-1)%3)+1)); + } + + GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary()); + } + + @Test(expected = StingException.class) + public void testBadFilter() { + List badFilters = Collections.singletonList("bad"); + new ReadGroupBlackListFilter(badFilters); + } + @Test(expected = StingException.class) + public void testBadFilterTag() { + List badFilters = Collections.singletonList("bad:filter"); + new ReadGroupBlackListFilter(badFilters); + } + + @Test(expected = StingException.class) + public void testBadFilterFile() { + List badFilters = Collections.singletonList("/foo/bar/rgbl.txt"); + new ReadGroupBlackListFilter(badFilters); + } + + @Test + public void testFilterReadGroup() { + SAMRecord filteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, 1, 20); + filteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "1"); + + SAMRecord unfilteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readDos", 0, 2, 20); + unfilteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "2"); + + List filterList = new ArrayList(); + filterList.add("RG:" + READ_GROUP_PREFIX + "1"); + + ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList); + Assert.assertTrue(filter.filterOut(filteredRecord)); + Assert.assertFalse(filter.filterOut(unfilteredRecord)); + } + + @Test + public void testFilterPlatformUnit() { + SAMRecord filteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, 1, 20); + filteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "1"); + + SAMRecord unfilteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readDos", 0, 2, 20); + unfilteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "2"); + + List filterList = new ArrayList(); + filterList.add("PU:" + PLATFORM_UNIT_PREFIX + "1"); + + ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList); + Assert.assertTrue(filter.filterOut(filteredRecord)); + Assert.assertFalse(filter.filterOut(unfilteredRecord)); + } + + @Test + public void testFilterOutByReadGroup() { + int recordsPerGroup = 3; + List records = new ArrayList(); + int alignmentStart = 0; + for (int x = 1; x <= READ_GROUP_COUNT; x++) { + SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + x); + for (int y = 1; y <= recordsPerGroup; y++) { + SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, ++alignmentStart, 20); + record.setAttribute("RG", groupRecord.getReadGroupId()); + records.add(record); + } + } + + List filterList = new ArrayList(); + filterList.add("RG:" + READ_GROUP_PREFIX + "1"); + filterList.add("RG:" + READ_GROUP_PREFIX + "3"); + + ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList); + int filtered = 0; + int unfiltered = 0; + for (SAMRecord record : records) { + String readGroupName = record.getReadGroup().getReadGroupId(); + if (filter.filterOut(record)) { + if (!filterList.contains("RG:" + readGroupName)) + Assert.fail("Read group " + readGroupName + " was filtered"); + filtered++; + } else { + if (filterList.contains("RG:" + readGroupName)) + Assert.fail("Read group " + readGroupName + " was not filtered"); + unfiltered++; + } + } + + int filteredExpected = recordsPerGroup * 2; + int unfilteredExpected = recordsPerGroup * (READ_GROUP_COUNT - 2); + Assert.assertEquals("Filtered", filteredExpected, filtered); + Assert.assertEquals("Uniltered", unfilteredExpected, unfiltered); + } + + @Test + public void testFilterOutByAttribute() { + int recordsPerGroup = 3; + List records = new ArrayList(); + int alignmentStart = 0; + for (int x = 1; x <= READ_GROUP_COUNT; x++) { + SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + x); + for (int y = 1; y <= recordsPerGroup; y++) { + SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, ++alignmentStart, 20); + record.setAttribute("RG", groupRecord.getReadGroupId()); + records.add(record); + } + } + + List filterList = new ArrayList(); + filterList.add("PU:" + PLATFORM_UNIT_PREFIX + "1"); + + ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList); + int filtered = 0; + int unfiltered = 0; + for (SAMRecord record : records) { + String platformUnit = (String) record.getReadGroup().getAttribute("PU"); + if (filter.filterOut(record)) { + if (!filterList.contains("PU:" + platformUnit)) + Assert.fail("Platform unit " + platformUnit + " was filtered"); + filtered++; + } else { + if (filterList.contains("PU:" + platformUnit)) + Assert.fail("Platform unit " + platformUnit + " was not filtered"); + unfiltered++; + } + } + + int filteredExpected = 6; + int unfilteredExpected = 9; + Assert.assertEquals("Filtered", filteredExpected, filtered); + Assert.assertEquals("Uniltered", unfilteredExpected, unfiltered); + } + + @Test + public void testFilterOutByFile() { + int recordsPerGroup = 3; + List records = new ArrayList(); + int alignmentStart = 0; + for (int x = 1; x <= READ_GROUP_COUNT; x++) { + SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + x); + for (int y = 1; y <= recordsPerGroup; y++) { + SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, ++alignmentStart, 20); + record.setAttribute("RG", groupRecord.getReadGroupId()); + records.add(record); + } + } + + List filterList = new ArrayList(); + filterList.add(validationDataLocation + "readgroupblacklisttest.txt"); + + ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList); + int filtered = 0; + int unfiltered = 0; + for (SAMRecord record : records) { + String readGroup = record.getReadGroup().getReadGroupId(); + if (filter.filterOut(record)) { + if (!("ReadGroup3".equals(readGroup) || "ReadGroup4".equals(readGroup))) + Assert.fail("Read group " + readGroup + " was filtered"); + filtered++; + } else { + if ("ReadGroup3".equals(readGroup) || "ReadGroup4".equals(readGroup)) + Assert.fail("Read group " + readGroup + " was not filtered"); + unfiltered++; + } + } + + int filteredExpected = recordsPerGroup * 2; + int unfilteredExpected = recordsPerGroup * (READ_GROUP_COUNT - 2); + Assert.assertEquals("Filtered", filteredExpected, filtered); + Assert.assertEquals("Uniltered", unfilteredExpected, unfiltered); + } + + @Test + public void testFilterOutByListFile() { + int recordsPerGroup = 3; + List records = new ArrayList(); + int alignmentStart = 0; + for (int x = 1; x <= READ_GROUP_COUNT; x++) { + SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + x); + for (int y = 1; y <= recordsPerGroup; y++) { + SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, ++alignmentStart, 20); + record.setAttribute("RG", groupRecord.getReadGroupId()); + records.add(record); + } + } + + List filterList = new ArrayList(); + filterList.add(validationDataLocation + "readgroupblacklisttestlist.txt"); + + ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList); + int filtered = 0; + int unfiltered = 0; + for (SAMRecord record : records) { + String readGroup = record.getReadGroup().getReadGroupId(); + if (filter.filterOut(record)) { + if (!("ReadGroup3".equals(readGroup) || "ReadGroup4".equals(readGroup))) + Assert.fail("Read group " + readGroup + " was filtered"); + filtered++; + } else { + if ("ReadGroup3".equals(readGroup) || "ReadGroup4".equals(readGroup)) + Assert.fail("Read group " + readGroup + " was not filtered"); + unfiltered++; + } + } + + int filteredExpected = recordsPerGroup * 2; + int unfilteredExpected = recordsPerGroup * (READ_GROUP_COUNT - 2); + Assert.assertEquals("Filtered", filteredExpected, filtered); + Assert.assertEquals("Uniltered", unfilteredExpected, unfiltered); + } +}