From c38390eabb0756278ec46dac25e3251b3a650da9 Mon Sep 17 00:00:00 2001 From: sjia Date: Mon, 14 Jun 2010 16:08:49 +0000 Subject: [PATCH] Added option for min number of matches between reads and alleles required to consider reads. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3548 348d0f76-0448-11de-a6fe-93d51630548a --- .../CalculateBaseLikelihoodsWalker.java | 9 ++-- .../walkers/HLAcaller/ClusterReadsWalker.java | 9 ++-- ...eWalker.java => FindClosestHLAWalker.java} | 4 +- .../walkers/HLAcaller/HLACallerWalker.java | 46 ++++++++++++++----- .../HLAcaller/SimilarityFileReader.java | 4 +- 5 files changed, 50 insertions(+), 22 deletions(-) rename java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/{FindClosestAlleleWalker.java => FindClosestHLAWalker.java} (97%) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java index 27651cd6f..84205de97 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java @@ -52,8 +52,11 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker { @Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false) public String filterFile = ""; - @Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false) - public int MINALLOWEDMISMATCHES = 5; + @Argument(fullName = "maxAllowedMismatches", shortName = "maxAllowedMismatches", doc = "Max number of mismatches tolerated per read (default 7)", required = false) + public int MAXALLOWEDMISMATCHES = 7; + + @Argument(fullName = "minRequiredMatches", shortName = "minRequiredMatches", doc = "Min number of matches required per read (default 7)", required = false) + public int MINREQUIREDMATCHES = 0; String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles"; @@ -70,7 +73,7 @@ public class ClusterReadsWalker extends ReadWalker { if (!filterFile.equals("")){ out.printf("INFO Reading properties file ... "); SimilarityFileReader similarityReader = new SimilarityFileReader(); - similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES); + similarityReader.ReadFile(filterFile,MAXALLOWEDMISMATCHES,MINREQUIREDMATCHES); ReadsToDiscard = similarityReader.GetReadsToDiscard(); MaxNumMatches = similarityReader.GetNumMatches(); MaxConcordance = similarityReader.GetConcordance(); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestAlleleWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestHLAWalker.java similarity index 97% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestAlleleWalker.java rename to java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestHLAWalker.java index 41e4ae029..58513e9ab 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestAlleleWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/FindClosestHLAWalker.java @@ -34,11 +34,11 @@ import org.broadinstitute.sting.commandline.Argument; import java.util.ArrayList; import java.util.Hashtable; /** - * Finds the most similar HLA allele for each read. Usage: java -jar GenomeAnalysisTK.jar -T FindClosestAllele -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L INPUT.interval -findFirst | grep -v INFO | sort -k1 > OUTPUT + * Finds the most similar HLA allele for each read (helps detect misalignments). Usage: java -jar GenomeAnalysisTK.jar -T FindClosestHLA -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L INPUT.interval | grep -v INFO | sort -k1 > OUTPUT * @author shermanjia */ @Requires({DataSource.READS, DataSource.REFERENCE}) -public class FindClosestAlleleWalker extends ReadWalker { +public class FindClosestHLAWalker extends ReadWalker { @Argument(fullName = "debugRead", shortName = "debugRead", doc = "Print match score for read", required = false) public String debugRead = ""; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLACallerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLACallerWalker.java index f4917afb3..8a98f6e17 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLACallerWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/HLACallerWalker.java @@ -36,7 +36,7 @@ import java.util.*; import java.util.Map.Entry; /** - * Calculates the likelihood of observing data given phase info from pairs of HLA alleles. Note: Run FindClosestAlleleWalker first! Usage: java -jar $GATK -T CalculatePhaseLikelihoods -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -phaseInterval /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl IMPUT.baselikelihoods [-filter $ID.filter -minAllowe\ + * Calculates the likelihood of observing data given phase info from pairs of HLA alleles. Note: Run FindClosestAlleleWalker first! Usage: java -jar $GATK -T HLACaller -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -phaseInterval /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl IMPUT.baselikelihoods [-filter $ID.filter -minAllowe\ dMismatches 7] -ethnicity Caucasian | grep -v "INFO" | grep -v "DEBUG" | grep -v "DONE!" > OUTPUT.phaselikelihoods * @author shermanjia */ @@ -63,8 +63,11 @@ public class HLACallerWalker extends ReadWalker { @Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false) public boolean ONLYFREQUENT = false; - @Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false) - public int MINALLOWEDMISMATCHES = 7; + @Argument(fullName = "maxAllowedMismatches", shortName = "maxAllowedMismatches", doc = "Max number of mismatches tolerated per read (default 7)", required = false) + public int MAXALLOWEDMISMATCHES = 6; + + @Argument(fullName = "minRequiredMatches", shortName = "minRequiredMatches", doc = "Min number of matches required per read (default 7)", required = false) + public int MINREQUIREDMATCHES = 5; GATKArgumentCollection args = this.getToolkit().getArguments(); @@ -113,15 +116,34 @@ public class HLACallerWalker extends ReadWalker { HLAstoppos = HLADictionaryReader.GetStopPositions(); //Load pre-processing file for misaligned reads and list of alleles to search - - SimilarityFileReader similarityReader = new SimilarityFileReader(); - similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES); - ReadsToDiscard = similarityReader.GetReadsToDiscard(); - AllelesToSearch = similarityReader.GetAllelesToSearch(); - AlleleCount = similarityReader.GetAlleleCount(); - LocusCount = similarityReader.GetLocusCount(); - for (int i = 0; i < AllelesToSearch.size(); i++){ - out.printf("INFO\tAllelesToSearch\t%s\t%s\n",AllelesToSearch.get(i),AlleleCount.get(AllelesToSearch.get(i))); + + if (!filterFile.equals("")){ + //If pre-processing file exists, load contents + SimilarityFileReader similarityReader = new SimilarityFileReader(); + similarityReader.ReadFile(filterFile,MAXALLOWEDMISMATCHES,MINREQUIREDMATCHES); + ReadsToDiscard = similarityReader.GetReadsToDiscard(); + AllelesToSearch = similarityReader.GetAllelesToSearch(); + AlleleCount = similarityReader.GetAlleleCount(); + LocusCount = similarityReader.GetLocusCount(); + for (int i = 0; i < AllelesToSearch.size(); i++){ + out.printf("INFO\tAllelesToSearch\t%s\t%s\n",AllelesToSearch.get(i),AlleleCount.get(AllelesToSearch.get(i))); + } + }else{ + ReadsToDiscard = new ArrayList(); + AlleleCount = new Hashtable(); + String name, d4_name; String [] n; + for (int i = 0; i < HLAnames.length; i++){ + name = HLAnames[i].substring(4); + n = name.split("\\*"); + d4_name = n[0] + "*" + n[1].substring(0, 4); + if (!AllelesToSearch.contains(d4_name)){ + AllelesToSearch.add(d4_name); + AlleleCount.put(d4_name, 0); + } + if (!LocusCount.containsKey(n[0])){ + LocusCount.put(n[0], 0); + } + } } //Load genotypes and find polymorphic sites (sites that differ from reference) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java index 7356968ce..9d2b509cf 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/SimilarityFileReader.java @@ -53,7 +53,7 @@ public class SimilarityFileReader { return NumMismatches; } - public void ReadFile(String filename, int minAllowedMismatches){ + public void ReadFile(String filename, int minAllowedMismatches, int minRequiredMatches){ try{ FileInputStream fstream = new FileInputStream(filename); DataInputStream in = new DataInputStream(fstream); @@ -70,7 +70,7 @@ public class SimilarityFileReader { Concordance.put(s[0],matchFraction); NumMatches.put(s[0], s[4]); NumMismatches.put(s[0], numMismatches); - if ((matchFraction < 0.8 && numMismatches > 3) || (numMismatches > minAllowedMismatches) || numMatches < 10){ + if ((matchFraction < 0.8 && numMismatches > 3) || (numMismatches > minAllowedMismatches) || numMatches < minRequiredMatches){ ReadsToDiscard.add(s[0]); }else{ Hashtable fourDigitAlleles = new Hashtable();