##################################################################################################
## This code counts duplications from gnomAD genomes and exomes and from clinvar, including overlaps.
## To reduce memory usage it reads in one chromosome at and a time from the vcfs, and just selected 
## fields that with be needed here. It would likely have been quicker and easier to use bcftools to 
## annotate the exome vcf with a field indicating whether it occurs in the genome vcf, or vice versa, 
## and then use those annotations to compute sizes of intersections and unions.
##################################################################################################

library(VariantAnnotation)

vcf.dir = "~/vt_out"

## The preprocessing steps for the input vcfs were the following, applied sequentially 
## with the output from the one step becoming the input for the next. The output isn't
## explicitly specified here, STDOUT can be redirected to a file, or piped to bgzip
## to compress, or in some cases can be piped directly to the next step as input.
## Some of the steps may accept vcfs compressed with bgzip and indexed by tabix.

## Decompose multallelic sites:  
##    vt decompose -s <input.vcf>
## Normalize variants:
##    vt normalize -n -r Homo_sapiens_assembly19.fasta <input.vcf>
## Restrict to insertions:
##    vt view -h -f "VTYPE==INDEL&&DLEN>0" <input.vcf>
## Add a few fields using modified version (-h) of vt annotate_indels:
##    vt annotate_indels -h -r Homo_sapiens_assembly19.fasta <input.vcf> 

## Annotate preprocessed gnomAD vcf with preprocessed ClinVar vcfs:
##    bcftools annotate -a <clinvar.vcf.gz> -c INFO <gnomad.vcf.gz>

## The input files are the preprocessed version of the files from gnomAD version 2.0.2 
## (http://gnomad.broadinstitute.org/downloads), whose names are the same as below
## but with ".vtdn.insertions.hai.cv" removed.

fl.gnA = file.path(vcf.dir, "gnomad.genomes.r2.0.2.sites.coding_only.chr1-22.vtdn.insertions.hai.cv.vcf.gz")
fl.gnX = file.path(vcf.dir, "gnomad.genomes.r2.0.2.sites.coding_only.chrX.vtdn.insertions.hai.cv.vcf.gz")
fl.ex = file.path(vcf.dir, "gnomad.exomes.r2.0.2.sites.vtdn.insertions.hai.cv.vcf.gz")

flx.gnA = TabixFile(fl.gnA, paste0(fl.gnA, ".tbi"))
flx.gnX = TabixFile(fl.gnX, paste0(fl.gnX, ".tbi"))
flx.ex = TabixFile(fl.ex, paste0(fl.ex, ".tbi"))

cap.ins = 40; ## upper limit for table

## read in fasta index (samtools faidx) from reference genome to get the chromosome lengths
## http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta
contigs = read.table("Homo_sapiens_assembly19.fasta.fai", sep="\t", stringsAsFactors=F)[,1:2]
colnames(contigs) = c("chr", "length")
rownames(contigs) = contigs[,1]

open(flx.gnA)
open(flx.gnX)
open(flx.ex)

chrom.list = c(1:22, "X")
count.list = list()

param = ScanVcfParam(which=GRanges("1", IRanges(1, 1000000)))
peek.gn = readVcf(flx.gnA, "GRCh37", param=param, verbose=T, row.names=F)
peek.ex = readVcf(flx.ex, "GRCh37", param=param, verbose=T, row.names=F)

info.gn = data.frame(info(header(peek.gn)))
info.ex = data.frame(info(header(peek.ex)))

info.gn[grepl("GC_[A-Z]{3}$", rownames(info.gn)),3, drop=F]
# Description
# GC_AFR        Count of African/African American individuals for each genotype
# GC_AMR                Count of Admixed American individuals for each genotype
# GC_ASJ                Count of Ashkenazi Jewish individuals for each genotype
# GC_EAS                      Count of East Asian individuals for each genotype
# GC_FIN                         Count of Finnish individuals for each genotype
# GC_NFE            Count of Non-Finnish European individuals for each genotype
# GC_OTH Count of Other (population not assigned) individuals for each genotype                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

info.ex[grepl("GC_[A-Z]{3}$", rownames(info.ex)),3, drop=F]
# Description
# GC_AFR        Count of African/African American individuals for each genotype
# GC_AMR                Count of Admixed American individuals for each genotype
# GC_ASJ                Count of Ashkenazi Jewish individuals for each genotype
# GC_EAS                      Count of East Asian individuals for each genotype
# GC_FIN                         Count of Finnish individuals for each genotype
# GC_NFE            Count of Non-Finnish European individuals for each genotype
# GC_OTH Count of Other (population not assigned) individuals for each genotype
# GC_SAS                     Count of South Asian individuals for each genotype

for (chrom in chrom.list) {
  
  cat("################## chrom", chrom, "#########################\n")
  
  param = ScanVcfParam(fixed="ALT", info=c("SHIFT_SYMM", "TDC", "CLNSIG"), geno=NA, 
                       which=GRanges(chrom, 
                                     IRanges(1, contigs$length[which(contigs$chr==chrom)])))
  
  if (chrom=="X") {
    vcf.gn = readVcf(flx.gnX, "GRCh37", param=param, verbose=T, row.names=F)
  } else {
    vcf.gn = readVcf(flx.gnA, "GRCh37", param=param, verbose=T, row.names=F)
  }
  vcf.ex = readVcf(flx.ex, "GRCh37", param=param, verbose=T, row.names=F)
  
  ALT.gn = sapply(CharacterList(fixed(vcf.gn)$ALT), "[[", 1);
  ALT.ex = sapply(CharacterList(fixed(vcf.ex)$ALT), "[[", 1);
  
  ID.gn = paste(seqnames(vcf.gn), start(vcf.gn), ref(vcf.gn), ALT.gn, sep="_")
  ID.ex = paste(seqnames(vcf.ex), start(vcf.ex), ref(vcf.ex), ALT.ex, sep="_")

  if(any(duplicated(ID.gn))) warn("duplicate ID in genomes?")
  if(any(duplicated(ID.ex))) warn("duplicate ID in exons?")
  
  just.ex = which(!is.element(ID.ex, ID.gn))
  just.gn = which(!is.element(ID.gn, ID.ex))
  
  length(just.ex) + length(just.gn) + length(intersect(ID.gn, ID.ex))
  length(ID.gn) + length(ID.ex) - length(intersect(ID.gn, ID.ex))
  # should be same (provided IDs are unique within file)
 
  ###################################

  summarizeCounts = function(vcf, maxn=1000) {
    ins.size = nchar(sapply(CharacterList(fixed(vcf)$ALT), "[[", 1)) - width(ref(vcf))
    cat("max insert", max(ins.size), "\n")
    if (is.na(maxn)) maxn = max(ins.size);
    ins.size = pmin(ins.size, maxn); ## cap for tabulate
    clinsig = sapply(info(vcf)$CLNSIG, paste, collapse=";")
    # note that these are nested, so each sum can only be arrived at one way
    clin.group = (clinsig!="") + grepl("Path|_path", clinsig) + grepl("Path", clinsig)
    counts = data.frame(
      insertions = tabulate(ins.size, nbins=maxn),
      dup =    tabulate(ins.size[info(vcf)$TDC>0], nbins=maxn),
      dup2 =   tabulate(ins.size[info(vcf)$TDC==1], nbins=maxn),
      dup2i =  tabulate(ins.size[info(vcf)$TDC==1 & info(vcf)$SHIFT_SYMM==0], nbins=maxn),
      dup2iC = tabulate(ins.size[info(vcf)$TDC==1 & info(vcf)$SHIFT_SYMM==0 & clin.group>0], nbins=maxn),
      dup2iL = tabulate(ins.size[info(vcf)$TDC==1 & info(vcf)$SHIFT_SYMM==0 & clin.group>1], nbins=maxn),
      dup2iP = tabulate(ins.size[info(vcf)$TDC==1 & info(vcf)$SHIFT_SYMM==0 & clin.group>2], nbins=maxn)
    )
    counts;
  }
  
  ct1 = summarizeCounts(vcf.gn)            ## includes intersection
  # ct2 = summarizeCounts(vcf.ex)
  ct2x = summarizeCounts(vcf.ex[just.ex,]) ## exlusive to exomes
  
  ##### intersection
  # ct1i = summarizeCounts(vcf.gn[-just.gn,])
  # ct2i = summarizeCounts(vcf.ex[-just.ex,])
  # all(ct1i==ct2i)
  
   ## need same nrows; could pad if not...
  count.list[[chrom]] = ct1 + ct2x;
  
}

save(count.list, file="counts_by_chrom_GE.RData")

# counts = count.list[[1]];
# for (k in 2:length(count.list)) counts = counts + count.list[[k]]