Demo entry 3544047

Preprocessing

   

Submitted by anonymous on Jan 12, 2016 at 15:48
Language: Python. Code size: 5.9 kB.

"""
Shira Verduin
Part 1/2 of CADDtoVCF, the pre-processing step.
In this script the header of the CADD file is transformed to a VCF header.
Shell scripts are being made to run the second (processing) step.
"""

import sys,getopt,os,gzip,glob
import threading

# Define options for -h, -i & -v
def main(argv):
  try:
    opts, args = getopt.getopt(sys.argv[1:], "h:i:v:t:", ["help", "input_cadd=","vcf_file=","threads="])
  except getopt.GetoptError:
    print '\nUsage: \n$ python CADDtoVCF_preprocessing -i <input_cadd> -v <vcf_file> -t <threads>\n'
    sys.exit(2)
    
  for opt, arg in opts:
    if opt == '-h':
      print '\nUsage: \n$ python CADDtoVCF_preprocessing -i <input_cadd> -v <vcf_file> -t <threads>\n'
      sys.exit()
    elif opt == "-i":
      inputcadd = arg
    elif opt == "-v":
      vcffile = arg    
    elif opt == "-t":
      threads = arg
      
  return inputcadd,vcffile,threads

# Use a original CADD file and create a header for the VCF.
def CADDtoHeader(inputcadd,vcffile,threads):
  try:
    headervals = gzip.open(inputcadd, "rb")
    headervals.readline()
  except IOError:
    headervals = open(inputcadd, "r")
    headervals.readline()

# Creates a folder with chunks of the <CHROM POS REF ALT> of the variant file.
  os.system("cat " + vcffile + " | zgrep -v '#' | awk '{print $1, $2, $4, $5}' > /hpc/cog_bioinf/data/sverduin/vcfpos.txt")
  os.system("rm -r /hpc/cog_bioinf/data/sverduin/chunks; mkdir /hpc/cog_bioinf/data/sverduin/chunks")
  os.system("split -l25000 /hpc/cog_bioinf/data/sverduin/vcfpos.txt /hpc/cog_bioinf/data/sverduin/chunks/chunk_")
  file_out = open("/hpc/cog_bioinf/data/sverduin/header.txt", "w")
  
# Define standard information required for VCF.  
  version = ("##fileformat=VCFv4.1" + "\n")
  contigs = ("##contig=<ID=1,length=249250621>\n" +
            "##contig=<ID=2,length=243199373>\n" +
            "##contig=<ID=3,length=198022430>\n" +
            "##contig=<ID=4,length=191154276>\n" +
            "##contig=<ID=5,length=180915260>\n" +
            "##contig=<ID=6,length=171115067>\n" +
            "##contig=<ID=7,length=159138663>\n" +
            "##contig=<ID=8,length=146364022>\n" +
            "##contig=<ID=9,length=141213431>\n" +
            "##contig=<ID=10,length=135534747>\n" +
            "##contig=<ID=11,length=135006516>\n" +
            "##contig=<ID=12,length=133851895>\n" +
            "##contig=<ID=13,length=115169878>\n" +
            "##contig=<ID=14,length=107349540>\n" +
            "##contig=<ID=15,length=102531392>\n" +
            "##contig=<ID=16,length=90354753>\n" +
            "##contig=<ID=17,length=81195210>\n" +
            "##contig=<ID=18,length=78077248>\n" +
            "##contig=<ID=19,length=59128983>\n" +
            "##contig=<ID=20,length=63025520>\n" +
            "##contig=<ID=21,length=48129895>\n" +
            "##contig=<ID=22,length=51304566>\n" +
            "##contig=<ID=X,length=155270560>\n" +
            "##contig=<ID=Y,length=59373566>\n" +
            "##contig=<ID=MT,length=16569>\n" +
            "##reference=file:///hpc/cog_bioinf/GENOMES/Homo_sapiens.GRCh37.GATK.illumina/Homo_sapiens.GRCh37.GATK.illumina.fasta\n")
  table_header = "#CHROM        POS     ID      REF     ALT     QUAL    FILTER  INFO\n"
  datatype = ["Integer", "Integer","String","String","String","String","Integer","String","String","String",
              "String","Integer","String","Float","Float","Float","Float","Float","Float","Float",
              "Float","Float","Float","Float","Float","Float","Float","Float","Integer","Integer",
              "Float","Float","Float","Float","Float","Float","Float","Float","Float","Float",
              "Float","Float","Float","Float","Float","Float","Float","Float","Float","Float",
              "Float","Float","Float","Float","Float","Float","Float","Float","Float","Float",
              "Float","Float","Float","Float","Float","Float","Float","Float","Float","Float",
              "String","Float","Float","Integer","String","Integer","Float","Integer","Integer","Float",
              "String","Float","Float","Float","Float","Float","Float","Float","Float","Integer",
              "Integer","String","String","String","String","Integer","Float","Integer","Float","Integer",
              "Float","String","Integer","String",  "String","String","String","String","Integer","String",
              "Float","String","Float","Float","Float", "Float"]

# Transform the header of CADD to a header in VCF format.
  header = headervals.readline().split()
  file_out.write(version)
  for y in range (len(datatype)):
    file_out.write("##INFO=<ID=" + header[y] + ",Number=A,Type=" + datatype[y] + ",Description=\"" + header[y] + "\">\n")
  file_out.write(contigs)
  file_out.write(table_header)
  file_out.close()
  
# Creates shell scripts that can run the second (processing) part of CADDtoVCF for every chunk.
  for files in os.listdir("/hpc/cog_bioinf/data/sverduin/chunks"):
    os.system("cp /hpc/cog_bioinf/data/sverduin/header.txt /hpc/cog_bioinf/data/sverduin/chunks/%s" % (files+".vcf"))
    sh_script = open(files +".sh", "w")
    sh_script.write("#!/bin/bash\n. /hpc/cog_bioinf/data/sverduin/env/bin/activate\n")
    sh_script.write("time python /hpc/cog_bioinf/data/sverduin/CADDtoVCF_processing.py -i /hpc/cog_bioinf/data/sverduin/chunks/%s -c %s -t %s >> /hpc/cog_bioinf/data/sverduin/chunks/%s" % (files, inputcadd, threads, files+".vcf"))

# Submit to server. 
  for files in glob.glob("/hpc/cog_bioinf/data/sverduin/chunk_*.sh"):
   os.system("qsub -cwd -q short -M sverduin@umcutrecht.nl -m a -pe threaded %s %s" % (threads, files))
    
  #os.system(<hold until every chunk (named X) has completed, then run script X)
if __name__ == "__main__":
  inputcadd,vcffile,threads = main(sys.argv[1:])
  CADDtoHeader(inputcadd,vcffile,threads)

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).