From 28d8b28bdf5f9d7930a4ba596abd3e653bb57fe6 Mon Sep 17 00:00:00 2001
From: droazen <droazen@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Wed, 22 Jun 2011 22:55:33 +0000
Subject: [PATCH] Density plots.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6061 348d0f76-0448-11de-a6fe-93d51630548a
---
 R/exomePreQC.R | 71 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 51 insertions(+), 20 deletions(-)

diff --git a/R/exomePreQC.R b/R/exomePreQC.R
index b6315c148..f4f41f1bf 100644
--- a/R/exomePreQC.R
+++ b/R/exomePreQC.R
@@ -1,5 +1,3 @@
-require('ggplot2')
-
 args = commandArgs(TRUE)
 onCMDLine = ! is.na(args[1])
 
@@ -8,6 +6,9 @@ if ( onCMDLine ) {
   outputPDF = args[2]
 }
 
+require('ggplot2')
+
+inputTSV = "GoT2D_exomes_batch_005.tsv"
 data <- read.table(inputTSV,header=T)
 
 fingerprint_lods = list()
@@ -19,23 +20,53 @@ fingerprint_lod_order = order(unlist(lapply(fingerprint_lods,median),use.names=F
 
 pdf(outputPDF)
 boxplot(fingerprint_lods[fingerprint_lod_order],las=3,main='Fingerprint LOD Scores By Sample',xlab='Sample',ylab='LOD Score Distribution',cex.axis=0.65)
-qplot(sample,GENOME_SIZE,data=data) + opts(title='Genome Size per Sample')
-qplot(sample,PCT_SELECTED_BASES,data=data) + opts(title='On+Near Bait Bases/PF Bases Aligned per Sample')
-qplot(sample,MEAN_TARGET_COVERAGE,data=data) + opts(title='Mean Target Coverage per Sample')
-qplot(sample,ZERO_CVG_TARGETS_PCT,data=data) + opts(title='% of Targets with <2x Coverage per Sample')
-qplot(sample,FOLD_80_BASE_PENALTY,data=data) + opts(title='Fold 80 Base Penalty per Sample')
-qplot(sample,HS_LIBRARY_SIZE,data=data) + opts(title='Hybrid Sequencing Library Size per Sample')
-qplot(sample,PCT_PF_READS_ALIGNED,data=data) + opts(title='% PF Reads Aligned per Sample')
-qplot(sample,PF_HQ_ERROR_RATE,data=data) + opts(title='% HQ Bases mismatching the Reference  per Sample')
-qplot(sample,MEAN_READ_LENGTH,data=data) + opts(title='Median Read Length per Sample')
-qplot(sample,MEDIAN_INSERT_SIZE,data=data) + opts(title='Median Insert Size per Sample')
-qplot(sample,BAD_CYCLES,data=data) + opts(title='# Bad Cycles per Sample')
-qplot(sample,STRAND_BALANCE,data=data) + opts(title='% PF Reads Aligned to the + Strand per Sample')
-qplot(sample,PCT_CHIMERAS,data=data) + opts(title='% Chimera Read Pairs per Sample')
-qplot(sample,PCT_ADAPTER,data=data) + opts(title='% Unaligned Reads Matching an Adapter Sequence per Sample')
-qplot(sample,TOTAL_SNPS,data=data) + opts(title='# SNPs called per Sample')
-qplot(sample,NOVEL_SNPS,data=data) + opts(title='# Novel SNPs called per Sample')
-qplot(sample,PCT_DBSNP,data=data) + opts(title='% SNPs in dbSNP per Sample')
-qplot(sample,DBSNP_TITV,data=data) + opts(title='TiTv of SNPs in dbSNP per Sample')
+
+complete <- read.table('/Users/mhanna/metrics.perSample.formatted.table',header=T)
+novel <- subset(complete,exon_intervals == "whole_exome_agilent_1.1_refseq_plus_3_boosters"&Novelty=="novel"&FunctionalClass=="all")
+selected_samples <- novel$Sample %in% data$sample
+novel_with_highlights <- cbind(novel,selected_samples)
+
+qplot(Sample,Selected_Bases_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='On+Near Bait Bases/PF Bases Aligned per Sample')
+qplot(PCT_SELECTED_BASES,data=data,geom="histogram") + opts(title='On+Near Bait Bases (Distribution)')
+qplot(Sample,Mean_Target_Coverage,data=novel_with_highlights,color=selected_samples) + opts(title='Mean Target Coverage per Sample')
+qplot(MEAN_TARGET_COVERAGE,data=data,geom="histogram") + opts(title='Mean Target Coverage (Distribution)')
+qplot(Sample,Zero_Coverage_Targets_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% of Targets with <2x Coverage per Sample')
+qplot(ZERO_CVG_TARGETS_PCT,data=data,geom="histogram") + opts(title='% of Targets with <2x Coverage (Distribution)')
+qplot(Sample,Fold_80_Base_Penalty,data=novel_with_highlights,color=selected_samples) + opts(title='Fold 80 Base Penalty per Sample')
+qplot(FOLD_80_BASE_PENALTY,data=data,geom="histogram") + opts(title='Fold 80 Base Penalty (Distribution)')
+qplot(Sample,Target_Bases_2x_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% Target Bases Achieving >2x Coverage per Sample')
+qplot(PCT_TARGET_BASES_2X,data=data,geom="histogram") + opts(title='% Target Bases Achieving >2x Coverage (Distribution)')
+qplot(Sample,Target_Bases_10x_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% Target Bases Achieving >10x Coverage per Sample')
+qplot(PCT_TARGET_BASES_10X,data=data,geom="histogram") + opts(title='% Target Bases Achieving >10x Coverage (Distribution)')
+qplot(Sample,Target_Bases_20x_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% Target Bases Achieving >20x Coverage per Sample')
+qplot(PCT_TARGET_BASES_20X,data=data,geom="histogram") + opts(title='% Target Bases Achieving >20x Coverage (Distribution)')
+qplot(Sample,Target_Bases_30x_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% Target Bases Achieving >30x Coverage per Sample')
+qplot(PCT_TARGET_BASES_30X,data=data,geom="histogram") + opts(title='% Target Bases Achieving >30x Coverage (Distribution)')
+qplot(Sample,PF_Reads_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% PF Reads Aligned per Sample')
+qplot(PCT_PF_READS_ALIGNED,data=data,geom="histogram") + opts(title='% PF Reads Aligned (Distribution)')
+qplot(Sample,PF_HQ_Error_Rate,data=novel_with_highlights,color=selected_samples) + opts(title='% HQ Bases mismatching the Reference per Sample')
+qplot(PF_HQ_ERROR_RATE,data=data,geom="histogram") + opts(title='% HQ Bases mismatching the Reference (Distribution)')
+qplot(Sample,Mean_Read_Length,data=novel_with_highlights,color=selected_samples) + opts(title='Median Read Length per Sample')
+qplot(MEAN_READ_LENGTH,data=data,geom="histogram") + opts(title='Median Read Length (Distribution')
+qplot(Sample,Bad_Cycles,data=novel_with_highlights,color=selected_samples) + opts(title='# Bad Cycles per Sample')
+qplot(BAD_CYCLES,data=data,geom="histogram") + opts(title='# Bad Cycles (Distribution)')
+qplot(Sample,Strand_Balance_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% PF Reads Aligned to the + Strand per Sample')
+qplot(STRAND_BALANCE,data=data,geom="histogram") + opts(title='% PF Reads Aligned to the + Strand (Distribution)')
+qplot(Sample,Total_SNPs,data=novel_with_highlights,color=selected_samples) + opts(title='# SNPs called per Sample')
+qplot(TOTAL_SNPS,data=data,geom="histogram") + opts(title='# SNPs called (Distribution)')
+qplot(Sample,dbSNP_Pct,data=novel_with_highlights,color=selected_samples) + opts(title='% SNPs in dbSNP per Sample')
+qplot(PCT_DBSNP,data=data,geom="histogram") + opts(title='% SNPs in dbSNP per Sample')
 dev.off()
 
+#qplot(Sample,Library_Size_HS,data=novel_with_highlights,color=selected_samples) + opts(title='Hybrid Sequencing Library Size per Sample')
+qplot(HS_LIBRARY_SIZE,data=data) + opts(title='Hybrid Sequencing Library Size (Distribution)')
+#qplot(Sample,MEDIAN_INSERT_SIZE,data=novel_with_highlights,color=selected_samples) + opts(title='Median Insert Size per Sample')
+qplot(MEDIAN_INSERT_SIZE,data=data) + opts(title='Median Insert Size (Distribution)')
+#qplot(Sample,PCT_CHIMERAS,data=novel_with_highlights,color=selected_samples) + opts(title='% Chimera Read Pairs per Sample')
+qplot(PCT_CHIMERAS,data=data) + opts(title='% Chimera Read Pairs (Distribution)')
+#qplot(Sample,PCT_ADAPTER,data=novel_with_highlights,color=selected_samples) + opts(title='% Unaligned Reads Matching an Adapter Sequence per Sample')
+qplot(PCT_ADAPTER,data=data) + opts(title='% Unaligned Reads Matching an Adapter Sequence (Distribution)')
+#qplot(Sample,NOVEL_SNPS,data=novel_with_highlights,color=selected_samples) + opts(title='# Novel SNPs called per Sample')
+qplot(NOVEL_SNPS,data=data) + opts(title='# Novel SNPs called (Distribution)')
+#qplot(Sample,DBSNP_TITV,data=novel_with_highlights,color=selected_samples) + opts(title='TiTv of SNPs in dbSNP per Sample')
+qplot(DBSNP_TITV,data=data) + opts(title='TiTv of SNPs in dbSNP (Distribution)')