Demo entry 4437928

en

   

Submitted by anonymous on Apr 11, 2016 at 09:39
Language: Python. Code size: 14.5 kB.

import xlrd
from math import *
from fisher import pvalue
from numpy import array, empty

class TFs_DataBase(object): 

    """ Holds information from the main database. 
    
    Members: 
    TF_Names            Name of TF entry, e.g. AATF.
    Approved_symbol     Approved name of TF entry, e.g. AATF.
    UniProt_IDs         Uniprot ID of TF, e.g AATF_HUMAN.

    All_Domains         List of all Pfam_A domains in database.

    DB_TFs              List DNA Binding TFs in database.
    PPI_TFs             List Protein-Protein Interaction TFs in database.
    Experimented_PPI_TFs List Experimented PPI by Ravasi et al.

    Phosphorylation_TFs List PHOSPHORYLATION TFs in database.
    Ubiquitination_TFs  List UBIQUITINATION TFs in database.
    Methylation_TFs     List METHYLATION TFs in database.
    Acetylation_TFs     List ACETYLATION TFs in database.
    Sumoylation_TFs     List SUMOYLATION TFs in database.
    O_GlcNAc_TFs        List O-GlcNAc TFs in database.

    """ 
    def __init__(self): 

        self.TF_Names = []
        self.Approved_symbol = []
        self.UniProt_IDs = []
            
        self.All_Domains = []

        self.DB_TFs = []
        self.PPI_TFs = []
        self.Experimented_PPI_TFs = []
            
        self.Phosphorylation_TFs = []
        self.Ubiquitination_TFs = []
        self.Methylation_TFs = []
        self.Acetylation_TFs = []
        self.Sumoylation_TFs = []
        self.O_GlcNAc_TFs = []

            
DataBase = TFs_DataBase()
def TF_Names(handle_Table):
    
    for i in range(1, sh.nrows):
        DataBase.TF_Names.append(sh.cell_value(rowx = i, colx = 0))
    return DataBase.TF_Names

def Approved_symbol(handle_Table):
    
    for i in range(1, sh.nrows):
        DataBase.Approved_symbol.append(sh.cell_value(rowx = i, colx = 1))
    return DataBase.TF_Names


def UniProt_IDs(handle_Table):
    
    for i in range(1, sh.nrows):
        DataBase.UniProt_IDs.append(sh.cell_value(rowx = i, colx = 2))
    return DataBase.UniProt_IDs


def All_Domains(handle_Table):
    
    for i in range(1, sh.nrows):
        if sh.cell_value(rowx = i, colx = 7) != '':
            DBD = sh.cell_value(rowx = i, colx = 7).split('; ')
            for name in DBD:
                DataBase.All_Domains.append(name.split('   ')[0])
        if sh.cell_value(rowx = i, colx = 8) != '':
            N_DBD = sh.cell_value(rowx = i, colx = 8).split('; ')
            for name in N_DBD:
                if 'Pfam-B' not in name:
                    DataBase.All_Domains.append(name.split('   ')[0])
    return list(set(DataBase.All_Domains))


def DB_TFs(handle_Table):
    
    for i in range(1, sh.nrows):
        if sh.cell_value(rowx = i, colx = 6) == '+':
            DataBase.DB_TFs.append(sh.cell_value(rowx = i, colx = 0))
    return DataBase.DB_TFs


def PPI_TFs(handle_Table):
    
    for i in range(1, sh.nrows):
        if sh.cell_value(rowx = i, colx = 9) == '+':
            DataBase.PPI_TFs.append(sh.cell_value(rowx = i, colx = 0))
    return DataBase.PPI_TFs


def Experimented_PPI_TFs(handle_Table):
    
    for i in range(1, sh.nrows):
        if sh.cell_value(rowx = i, colx = 9) == '+'\
           or sh.cell_value(rowx = i, colx = 9) == '-':
              DataBase.Experimented_PPI_TFs.append(sh.cell_value(rowx = i, colx = 0))
    return DataBase.Experimented_PPI_TFs


def Phosphorylation_TFs(handle_Table):
    
    for i in range(1, sh.nrows):
        if 'PHOSPHORYLATION' in sh.cell_value(rowx = i, colx = 11):
            DataBase.Phosphorylation_TFs.append(sh.cell_value(rowx = i, colx = 0))
    return DataBase.Phosphorylation_TFs


def Ubiquitination_TFs(handle_Table):
    
    for i in range(1, sh.nrows):
        if 'UBIQUITINATION' in sh.cell_value(rowx = i, colx = 11):
            DataBase.Ubiquitination_TFs.append(sh.cell_value(rowx = i, colx = 0))
    return DataBase.Ubiquitination_TFs


def Methylation_TFs(handle_Table):
    
    for i in range(1, sh.nrows):
        if 'METHYLATION' in sh.cell_value(rowx = i, colx = 11):
            DataBase.Methylation_TFs.append(sh.cell_value(rowx = i, colx = 0))
    return DataBase.Methylation_TFs


def Acetylation_TFs(handle_Table):
    
    for i in range(1, sh.nrows):
        if 'ACETYLATION' in sh.cell_value(rowx = i, colx = 11):
            DataBase.Acetylation_TFs.append(sh.cell_value(rowx = i, colx = 0))
    return DataBase.Acetylation_TFs


def Sumoylation_TFs(handle_Table):
    
    for i in range(1, sh.nrows):
        if 'SUMOYLATION' in sh.cell_value(rowx = i, colx = 11):
            DataBase.Sumoylation_TFs.append(sh.cell_value(rowx = i, colx = 0))
    return DataBase.Sumoylation_TFs


def O_GlcNAc_TFs(handle_Table):
    
    for i in range(1, sh.nrows):
        if 'O-GlcNAc' in sh.cell_value(rowx = i, colx = 11):
            DataBase.O_GlcNAc_TFs.append(sh.cell_value(rowx = i, colx = 0))
    return DataBase.O_GlcNAc_TFs


def Properties():
    
    """ Return a list of all TFs with specific properties from the database
    (8 properties including: DNA-Binding, PPI, Phosphorylation, Ubiquitination,
    Methylation, Acetylation, Sumoylation, O_GlcNAc respectively).
    """
    List_Names = []
    List_Names.append(DB_TFs(handle_Table))
    List_Names.append(PPI_TFs(handle_Table))
    List_Names.append(Phosphorylation_TFs(handle_Table))
    List_Names.append(Ubiquitination_TFs(handle_Table))
    List_Names.append(Methylation_TFs(handle_Table))
    List_Names.append(Acetylation_TFs(handle_Table))
    List_Names.append(Sumoylation_TFs(handle_Table))
    List_Names.append(O_GlcNAc_TFs(handle_Table))
    return List_Names


def Check_By_Genenames(Gene_List):

    """ Checking for a list of input genes or proteins with different existing
    names in the database (e.g. Uniprot ID, Gene names, ...). Return a modified
    list of input based on existing names.   
    """
    Modify_Gene_List = []    
    for i in range(len(Gene_List)):
        i_ = 0
        for j in range(1, sh.nrows):            
            if Gene_List[i] == sh.cell_value(rowx = j, colx = 0)\
               or Gene_List[i] == sh.cell_value(rowx = j, colx = 1)\
               or Gene_List[i] == sh.cell_value(rowx = j, colx = 2)\
               or Gene_List[i] == sh.cell_value(rowx = j, colx = 2)[:-6]\
               or Gene_List[i] in sh.cell_value(rowx = j, colx = 3).split('; '):
                    Modify_Gene_List.append(sh.cell_value(rowx = j, colx = 0))
                    i_ += 1
                    break     
	if i_ == 0:
             Modify_Gene_List.append(Gene_List[i])
    return Modify_Gene_List


def Info_from_confusion_matrix(cm):

    """ Return Observed, Expected, Pvalue, and Mcc from confusion_matrix in
    overrepresentation analysis.
    """   
    _Info = []
    _Info.append(cm[0])               
    _Info.append((cm[0]+cm[1]) * (cm[0]+cm[2])/( cm[0]+cm[1]+cm[2]+cm[3] )) 
    p = pvalue(cm[0], cm[1], cm[2], cm[3])
    _Info.append(p.two_tail)            
    M = (cm[0]+cm[1]) * (cm[2]+cm[3]) * (cm[0]+cm[2]) * (cm[1]+cm[3])
    if M != 0:
        _Info.append(float( (cm[0]*cm[3]) - (cm[1]*cm[2]) )/sqrt(M))                 
    else:
        _Info.append('-')
    return _Info


def Benjamini_Correction(Pvalues):

    """ Return corrected Pvalues (Benjamini) for a list of Pvalues. """
    Pvalues = array(Pvalues) 
    n = float(Pvalues.shape[0])                                                                           
    New_Pvalues = empty(n)                                                    
    values = [ (pvalue, i) for i, pvalue in enumerate(Pvalues) ]                                      
    values.sort()
    values.reverse()                                                                                  
    new_values = []
    for i, vals in enumerate(values):                                                                 
        rank = n - i
        pvalue, index = vals                                                                          
        new_values.append((n/rank) * pvalue)
    for i, vals in enumerate(values):
        pvalue, index = vals
        New_Pvalues[index] = new_values[i]
    return New_Pvalues

def Final_List(List_Info_Overrepresentations):
    
    """ Return a list of all significant overrepresentation properties
    and Pfam domains with Benjamini correction.

    """
    Pvalues = [ List_Info_Overrepresentations[i][3]\
                for i in range(len(List_Info_Overrepresentations)) ]
    
    Benjamini = Benjamini_Correction(Pvalues)
    Index_Sorted_Pvalues = sorted( range(len(Pvalues)), key = lambda k: Pvalues[k] )
    Sorted_Benjamini = [ Benjamini[i] for i in  Index_Sorted_Pvalues ]
    Info_with_Benjamini = [ List_Info_Overrepresentations[Index_Sorted_Pvalues[i]]\
                            for i in range(len(Index_Sorted_Pvalues)) ]
    
    # Adding  corrected Benjamini 
    for i in range(len(Info_with_Benjamini)):
        Info_with_Benjamini[i].insert(4, Sorted_Benjamini[i])
        
    # Sorting and filtering based on Pvalues
    Benjamini_Final_List = [  Info_with_Benjamini[i] for i in range(len(Info_with_Benjamini))\
                              if Info_with_Benjamini[i][4] < 0.05  ]
    return Benjamini_Final_List


def _Print(Sorted_List):

    """ To pretty print the significant overrepresentation items. """
    
    for i in range(len(Sorted_List)):
        print "{0:<20} {1:<24} {2:<8} {3:<8} {4:<11} {5:<11} {6:<10}"\
              .format(
                      ''*20, Sorted_List[i][0],\
                      Sorted_List[i][1],\
                      Sorted_List[i][2],\
                      '%.2E' %Sorted_List[i][3],\
                      '%.2E' %Sorted_List[i][4],\
                      '%.3f' %Sorted_List[i][5]\
                     )
        
# Initial lists of database for analysis
print 
Path_Reference = raw_input('Please enter the path of the reference database (xls, with title row): ')
handle_Table = xlrd.open_workbook(Path_Reference)
sh = handle_Table.sheet_by_index(0)
List_Name_Properties = [
                          'DNA_Binding',\
                          'PPI',\
                          'Phosphorylation',\
                          'Ubiquitination',\
                          'Methylation',\
                          'Acetylation',\
                          'Sumoylation',\
                          'O_GlcNAc'
                        ]  # List of the property names

List_Properties = Properties()                          # List of the properties
Background_TFs = TF_Names(handle_Table)                 # List of all TFs in database
all_domain = All_Domains(handle_Table)                  # List all Pfam_A domains
Background_PPI = Experimented_PPI_TFs(handle_Table)     # List of TFs for experimental PPIs

#List Genes or Proteins from the Test set for analysis
Path_Test_Set = raw_input('Please enter the path of the test set (xls, with title row): ')
Number_Col = raw_input('Please enter the column number for TFs in your table (starting at 0): ')
External_Data = xlrd.open_workbook(Path_Test_Set)
Sheet_Names = External_Data.sheet_names()               # List of the gene/protein list names
List_External_Data = []                                 # List of the gene/protein lists
for i_sheet in range(External_Data.nsheets):
    Sheet_book_i = []
    sh_ = External_Data.sheet_by_index(i_sheet)
    for j in range(1, sh_.nrows):
        Sheet_book_i.append(sh_.cell_value(rowx = j, colx = int(Number_Col)))
    Sheet_book_i = Check_By_Genenames(Sheet_book_i)
    List_External_Data.append(Sheet_book_i)

print '\n\n'

# Print headline
print "{0:<20} {1:<24} {2:<8} {3:<8} {4:<11} {5:<11} {6:<10}"\
      .format( 'Category', 'Term', 'Observed', 'Expected', 'Pvalue', 'Benjamini', 'MCC' ) 
print "-"*93

# Overrepresentation analysis
for i in range(len(List_External_Data)):
    
    Per_info_Properties = []
    Per_info_Domains = []

    # Overrepresentation of properties (DB/PPI/Individual PTMs)
    for j in range(len(List_Properties)):

        cm = confusion_matrix = [0, 0, 0, 0]
        # Needs to setup for PPI property separately
        if List_Name_Properties[j] == 'PPI':
                Background = Background_PPI
        else:
                Background = Background_TFs
        cm[0] += len( set(Background) & set(List_External_Data[i]) & set(List_Properties[j]) )
        cm[1] += len( (set(Background)&set(List_External_Data[i])) - set(List_Properties[j]) )
        cm[2] += len( (set(Background)&set(List_Properties[j])) - set(List_External_Data[i]) )
        cm[3] += len( set(Background)-(set(List_Properties[j]) | set(List_External_Data[i])) )
        Per_Property = Info_from_confusion_matrix(cm)
        Per_Property.insert(0, List_Name_Properties[j])
        Per_info_Properties.append(Per_Property)
        
    # Overrepresentation of Pfam domains            
    for domain in all_domain:
        Per_Domain = []	 
        cm = confusion_matrix = [0, 0, 0, 0]
        for i_sh in range(1, sh.nrows):
            if sh.cell_value(rowx = i_sh, colx = 0) in List_External_Data[i]:
                if domain in sh.cell_value(rowx = i_sh, colx = 7)\
                   or domain in sh.cell_value(rowx = i_sh, colx = 8):
			cm[0] += 1
                else:
			cm[2] += 1
            else:
                if domain in sh.cell_value(rowx = i_sh, colx = 7)\
                   or domain in sh.cell_value(rowx = i_sh, colx = 8):
			cm[1] += 1
                else:
			cm[3] += 1
	Per_Domain = Info_from_confusion_matrix(cm)
        Per_Domain.insert(0, domain)
        Per_info_Domains.append(Per_Domain)
        
    F_Properties = Final_List(Per_info_Properties)
    F_Domains = Final_List(Per_info_Domains)
    
    # Print final result
    print Sheet_Names[i]
    if F_Properties == [] and F_Domains == []:
            print ' '*21 + 'There is no overrepresented item for this Gene/Protein list'
    if F_Properties != []:
        if F_Domains != []:
            _Print(F_Properties) 
            print ' '*21 +  '-'*10
            _Print(F_Domains)
        else:
            _Print(F_Properties)
    else:
        _Print(F_Domains) 
    print

This snippet took 0.02 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).