################################################################################################## ## This code counts duplications from gnomAD genomes and exomes and from clinvar, including overlaps. ## To reduce memory usage it reads in one chromosome at and a time from the vcfs, and just selected ## fields that with be needed here. It would likely have been quicker and easier to use bcftools to ## annotate the exome vcf with a field indicating whether it occurs in the genome vcf, or vice versa, ## and then use those annotations to compute sizes of intersections and unions. ################################################################################################## library(VariantAnnotation) vcf.dir = "~/vt_out" ## The preprocessing steps for the input vcfs were the following, applied sequentially ## with the output from the one step becoming the input for the next. The output isn't ## explicitly specified here, STDOUT can be redirected to a file, or piped to bgzip ## to compress, or in some cases can be piped directly to the next step as input. ## Some of the steps may accept vcfs compressed with bgzip and indexed by tabix. ## Decompose multallelic sites: ## vt decompose -s ## Normalize variants: ## vt normalize -n -r Homo_sapiens_assembly19.fasta ## Restrict to insertions: ## vt view -h -f "VTYPE==INDEL&&DLEN>0" ## Add a few fields using modified version (-h) of vt annotate_indels: ## vt annotate_indels -h -r Homo_sapiens_assembly19.fasta ## Annotate preprocessed gnomAD vcf with preprocessed ClinVar vcfs: ## bcftools annotate -a -c INFO ## The input files are the preprocessed version of the files from gnomAD version 2.0.2 ## (http://gnomad.broadinstitute.org/downloads), whose names are the same as below ## but with ".vtdn.insertions.hai.cv" removed. fl.gnA = file.path(vcf.dir, "gnomad.genomes.r2.0.2.sites.coding_only.chr1-22.vtdn.insertions.hai.cv.vcf.gz") fl.gnX = file.path(vcf.dir, "gnomad.genomes.r2.0.2.sites.coding_only.chrX.vtdn.insertions.hai.cv.vcf.gz") fl.ex = file.path(vcf.dir, "gnomad.exomes.r2.0.2.sites.vtdn.insertions.hai.cv.vcf.gz") flx.gnA = TabixFile(fl.gnA, paste0(fl.gnA, ".tbi")) flx.gnX = TabixFile(fl.gnX, paste0(fl.gnX, ".tbi")) flx.ex = TabixFile(fl.ex, paste0(fl.ex, ".tbi")) cap.ins = 40; ## upper limit for table ## read in fasta index (samtools faidx) from reference genome to get the chromosome lengths ## http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta contigs = read.table("Homo_sapiens_assembly19.fasta.fai", sep="\t", stringsAsFactors=F)[,1:2] colnames(contigs) = c("chr", "length") rownames(contigs) = contigs[,1] open(flx.gnA) open(flx.gnX) open(flx.ex) chrom.list = c(1:22, "X") count.list = list() param = ScanVcfParam(which=GRanges("1", IRanges(1, 1000000))) peek.gn = readVcf(flx.gnA, "GRCh37", param=param, verbose=T, row.names=F) peek.ex = readVcf(flx.ex, "GRCh37", param=param, verbose=T, row.names=F) info.gn = data.frame(info(header(peek.gn))) info.ex = data.frame(info(header(peek.ex))) info.gn[grepl("GC_[A-Z]{3}$", rownames(info.gn)),3, drop=F] # Description # GC_AFR Count of African/African American individuals for each genotype # GC_AMR Count of Admixed American individuals for each genotype # GC_ASJ Count of Ashkenazi Jewish individuals for each genotype # GC_EAS Count of East Asian individuals for each genotype # GC_FIN Count of Finnish individuals for each genotype # GC_NFE Count of Non-Finnish European individuals for each genotype # GC_OTH Count of Other (population not assigned) individuals for each genotype info.ex[grepl("GC_[A-Z]{3}$", rownames(info.ex)),3, drop=F] # Description # GC_AFR Count of African/African American individuals for each genotype # GC_AMR Count of Admixed American individuals for each genotype # GC_ASJ Count of Ashkenazi Jewish individuals for each genotype # GC_EAS Count of East Asian individuals for each genotype # GC_FIN Count of Finnish individuals for each genotype # GC_NFE Count of Non-Finnish European individuals for each genotype # GC_OTH Count of Other (population not assigned) individuals for each genotype # GC_SAS Count of South Asian individuals for each genotype for (chrom in chrom.list) { cat("################## chrom", chrom, "#########################\n") param = ScanVcfParam(fixed="ALT", info=c("SHIFT_SYMM", "TDC", "CLNSIG"), geno=NA, which=GRanges(chrom, IRanges(1, contigs$length[which(contigs$chr==chrom)]))) if (chrom=="X") { vcf.gn = readVcf(flx.gnX, "GRCh37", param=param, verbose=T, row.names=F) } else { vcf.gn = readVcf(flx.gnA, "GRCh37", param=param, verbose=T, row.names=F) } vcf.ex = readVcf(flx.ex, "GRCh37", param=param, verbose=T, row.names=F) ALT.gn = sapply(CharacterList(fixed(vcf.gn)$ALT), "[[", 1); ALT.ex = sapply(CharacterList(fixed(vcf.ex)$ALT), "[[", 1); ID.gn = paste(seqnames(vcf.gn), start(vcf.gn), ref(vcf.gn), ALT.gn, sep="_") ID.ex = paste(seqnames(vcf.ex), start(vcf.ex), ref(vcf.ex), ALT.ex, sep="_") if(any(duplicated(ID.gn))) warn("duplicate ID in genomes?") if(any(duplicated(ID.ex))) warn("duplicate ID in exons?") just.ex = which(!is.element(ID.ex, ID.gn)) just.gn = which(!is.element(ID.gn, ID.ex)) length(just.ex) + length(just.gn) + length(intersect(ID.gn, ID.ex)) length(ID.gn) + length(ID.ex) - length(intersect(ID.gn, ID.ex)) # should be same (provided IDs are unique within file) ################################### summarizeCounts = function(vcf, maxn=1000) { ins.size = nchar(sapply(CharacterList(fixed(vcf)$ALT), "[[", 1)) - width(ref(vcf)) cat("max insert", max(ins.size), "\n") if (is.na(maxn)) maxn = max(ins.size); ins.size = pmin(ins.size, maxn); ## cap for tabulate clinsig = sapply(info(vcf)$CLNSIG, paste, collapse=";") # note that these are nested, so each sum can only be arrived at one way clin.group = (clinsig!="") + grepl("Path|_path", clinsig) + grepl("Path", clinsig) counts = data.frame( insertions = tabulate(ins.size, nbins=maxn), dup = tabulate(ins.size[info(vcf)$TDC>0], nbins=maxn), dup2 = tabulate(ins.size[info(vcf)$TDC==1], nbins=maxn), dup2i = tabulate(ins.size[info(vcf)$TDC==1 & info(vcf)$SHIFT_SYMM==0], nbins=maxn), dup2iC = tabulate(ins.size[info(vcf)$TDC==1 & info(vcf)$SHIFT_SYMM==0 & clin.group>0], nbins=maxn), dup2iL = tabulate(ins.size[info(vcf)$TDC==1 & info(vcf)$SHIFT_SYMM==0 & clin.group>1], nbins=maxn), dup2iP = tabulate(ins.size[info(vcf)$TDC==1 & info(vcf)$SHIFT_SYMM==0 & clin.group>2], nbins=maxn) ) counts; } ct1 = summarizeCounts(vcf.gn) ## includes intersection # ct2 = summarizeCounts(vcf.ex) ct2x = summarizeCounts(vcf.ex[just.ex,]) ## exlusive to exomes ##### intersection # ct1i = summarizeCounts(vcf.gn[-just.gn,]) # ct2i = summarizeCounts(vcf.ex[-just.ex,]) # all(ct1i==ct2i) ## need same nrows; could pad if not... count.list[[chrom]] = ct1 + ct2x; } save(count.list, file="counts_by_chrom_GE.RData") # counts = count.list[[1]]; # for (k in 2:length(count.list)) counts = counts + count.list[[k]]