Demo entry 6760316

Data extraction

   

Submitted by Rebeca on Sep 12, 2018 at 16:26
Language: Python 3. Code size: 15.6 kB.

import numpy as np
import pandas as pd
from glob import glob
from PIL import Image
from scipy import signal as si
from random import shuffle
import itertools
from sklearn import metrics
import matplotlib.pyplot as plt

def hist_match(source, template):
    """
    Adjust the pixel values of a grayscale image such that its histogram
    matches that of a target image

    Arguments:
    -----------
        source: np.ndarray
            Image to transform; the histogram is computed over the flattened
            array
        template: np.ndarray
            Template image; can have different dimensions from the source
    Returns:
    -----------
        matched: np.ndarray
            The transformed output image
    """

    oldshape = source.shape
    source = source.ravel()
    template = template.ravel()

    # get the set of unique pixel values and their corresponding indices and
    # counts
    s_values, bin_idx, s_counts = np.unique(source, return_inverse=True,
                                            return_counts=True)
    t_values, t_counts = np.unique(template, return_counts=True)

    # take the cumsum of the counts and normalize by the number of pixels to
    # get the empirical cumulative distribution functions for the source and
    # template images (maps pixel value --> quantile)
    s_quantiles = np.cumsum(s_counts).astype(np.float64)
    s_quantiles /= s_quantiles[-1]
    t_quantiles = np.cumsum(t_counts).astype(np.float64)
    t_quantiles /= t_quantiles[-1]

    # interpolate linearly to find the pixel values in the template image
    # that correspond most closely to the quantiles in the source image
    interp_t_values = np.interp(s_quantiles, t_quantiles, t_values)

    return interp_t_values[bin_idx].reshape(oldshape)
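

# A minimal usage sketch for hist_match, assuming two synthetic uint8 arrays
# in place of real ISIC images (illustrative only, not part of the pipeline):
def _demo_hist_match():
    rng = np.random.RandomState(0)
    source = rng.randint(0, 128, size=(64, 64)).astype(np.uint8)
    template = rng.randint(64, 256, size=(48, 48)).astype(np.uint8)
    matched = hist_match(source, template)
    # matched keeps source's shape but follows template's intensity distribution
    print(matched.shape, matched.min(), matched.max())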


def get(nb_images=None, train_percent=80, train_data_path='./data/train/'):
    """Format and return the ISIC data as np arrays."""
    # Loading the data
    # Inputs
    print('Loading inputs...')
    files = glob(train_data_path + '*.jpg')[:nb_images]
    nb_images = len(files)  # also handles the default nb_images=None
    threshold = int(nb_images * train_percent / 100)
    print(threshold, 'images will be used for the train set,',
          nb_images - threshold, 'images will be used for the validation set')
    data = {}
    for f in files:
        image = Image.open(f)
        idx = f.split('/')[-1].split('.jpg')[0]
        data[idx] = np.array(image)

    # Labels
    print('Loading labels...')
    Dp = pd.read_csv(train_data_path + 'labels.csv').set_index('image')

    # Aligning inputs and labels
    print('Aligning inputs and labels...')
    X_train = []
    y_train = []
    X_val = []
    y_val = []
    for counter, (key, val) in enumerate(data.items(), 1):
        try:
            if counter <= threshold:
                y_train.append(np.array(Dp.loc[key]))
                X_train.append(val)
            else:
                y_val.append(np.array(Dp.loc[key]))
                X_val.append(val)
        except (KeyError, ValueError):
            # skip images with no matching label row
            print(key)
    return np.array(X_train), np.array(y_train), np.array(X_val), np.array(y_val)

if __name__ == '__main__':
    X_train, y_train, X_val, y_val = get()
    print('Size of train inputs: ', X_train.shape)
    print('Size of train labels: ', y_train.shape)
    print('Size of validation inputs: ', X_val.shape)
    print('Size of validation labels: ', y_val.shape)
    
    
def get_bis(train_data_dir, train_data_list, train_percent=80):
    """Format and return the ISIC data as np arrays.

    train_data_list holds the image identifiers (file names without the
    .jpg extension) found under train_data_dir.
    """
    # Shuffle the train_data_list so the train/validation split is random
    shuffle(train_data_list)
    # Inputs
    nb_images = len(train_data_list)
    threshold = int(nb_images * train_percent / 100)
    print(threshold, 'images will be used for the train set,',
          nb_images - threshold, 'images will be used for the validation set')
    print('Loading inputs...')
    data = {}
    for f in train_data_list:
        image = Image.open(train_data_dir + str(f) + '.jpg')
        data[str(f)] = np.array(image)
    print(len(data), 'images loaded')
    # Labels
    print('Loading labels...')
    Dp = pd.read_csv(train_data_dir + 'labels.csv').set_index('image')
    # Aligning inputs and labels
    print('Aligning inputs and labels...')
    X_train = []
    y_train = []
    X_val = []
    y_val = []
    for counter, (key, val) in enumerate(data.items(), 1):
        try:
            if counter <= threshold:
                y_train.append(np.array(Dp.loc[key]))
                X_train.append(val)
            else:
                y_val.append(np.array(Dp.loc[key]))
                X_val.append(val)
        except (KeyError, ValueError):
            # skip images with no matching label row
            print(key)
    return np.array(X_train), np.array(y_train), np.array(X_val), np.array(y_val)
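

# A minimal usage sketch for get_bis, assuming the same ./data/train/ layout
# used by get(); deriving the identifier list from the files on disk is an
# assumption made here for illustration:
def _demo_get_bis(train_data_dir='./data/train/'):
    ids = [f.split('/')[-1].split('.jpg')[0] for f in glob(train_data_dir + '*.jpg')]
    return get_bis(train_data_dir, ids, train_percent=80)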


def get_gray(train_type='gray', train_percent=80, nb_images=None, train_data_path='./data/train/'):
    """Format and return the ISIC data as np arrays for the grey images."""
    # Inputs
    print('Loading inputs...')
    files = glob(train_data_path + '*.jpg')[:nb_images]
    nb_images = len(files)  # also handles the default nb_images=None
    threshold = int(nb_images * train_percent / 100)
    print(threshold, 'images will be used for the train set,',
          nb_images - threshold, 'images will be used for the validation set')
    data = {}
    for f in files:
        image = Image.open(f)
        image_gray = image.convert('L')  # 8-bit grayscale
        image_featured = np.array(image_gray)
        idx = f.split('/')[-1].split('.jpg')[0]
        data[idx] = image_featured
    # Labels
    print('Loading labels...')
    Dp = pd.read_csv(train_data_path + 'labels.csv').set_index('image')
    # Aligning inputs and labels
    print('Aligning inputs and labels...')
    X_train = []
    y_train = []
    X_val = []
    y_val = []
    for counter, (key, val) in enumerate(data.items(), 1):
        try:
            if counter <= threshold:
                y_train.append(np.array(Dp.loc[key]))
                X_train.append(val)
            else:
                y_val.append(np.array(Dp.loc[key]))
                X_val.append(val)
        except (KeyError, ValueError):
            # skip images with no matching label row
            print(key)
    return np.array(X_train), np.array(y_train), np.array(X_val), np.array(y_val)


def get_convolution(train_type='all', nb_images=None, train_data_path='./data/train/'):
    """Format and return the ISIC data as np arrays for convolution."""
    # Create the Sobel kernels for edge detection in four directions
    ker0 = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]])
    ker45 = np.array([[-1, -2, 0], [-2, 0, 2], [0, 2, 1]])
    ker90 = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]])
    ker135 = np.array([[0, -2, 1], [-2, 0, 2], [-1, 2, 0]])
    # Create a 10x10 Gaussian smoothing kernel (with sigma this large it is
    # nearly uniform, i.e. close to a box blur)
    x, y = np.meshgrid(np.linspace(-1, 1, 10), np.linspace(-1, 1, 10))
    d = np.sqrt(x * x + y * y)
    sigma, mu = 1000.0, 0.0
    gaussian_kernel = np.exp(-((d - mu) ** 2 / (2.0 * sigma ** 2)))
    # Loading the data
    # Inputs
    print('Loading inputs...')
    data = {}
    for f in glob(train_data_path + '*.jpg')[:nb_images]:
        image = Image.open(f)
        image_gray = np.array(image.convert('L'))  # 8-bit grayscale array
        # If a double horizontal convolution is requested
        if train_type == 'DoubleHorizontal':
            # First convolution: horizontal Sobel edge detection
            conv0_same = si.convolve2d(image_gray, ker0, mode='same')
            # Second convolution: Gaussian smoothing
            conv0_same_bis = si.convolve2d(conv0_same, gaussian_kernel, mode='same')
            image_featured = conv0_same_bis
        # If an all-directions double convolution is requested
        elif train_type == 'all':
            # First convolution: Sobel edge detection in four directions
            conv0_same = si.convolve2d(image_gray, ker0, mode='same')
            conv45_same = si.convolve2d(image_gray, ker45, mode='same')
            conv90_same = si.convolve2d(image_gray, ker90, mode='same')
            conv135_same = si.convolve2d(image_gray, ker135, mode='same')
            # Second convolution: Gaussian smoothing
            conv0_same_bis = si.convolve2d(conv0_same, gaussian_kernel, mode='same')
            conv45_same_bis = si.convolve2d(conv45_same, gaussian_kernel, mode='same')
            conv90_same_bis = si.convolve2d(conv90_same, gaussian_kernel, mode='same')
            conv135_same_bis = si.convolve2d(conv135_same, gaussian_kernel, mode='same')
            # Stack the four directional responses into one (4, H, W) array
            image_featured = np.stack([conv0_same_bis, conv45_same_bis,
                                       conv90_same_bis, conv135_same_bis])
        else:
            image_featured = image_gray
        print(f.split('/')[-1])  # progress indicator
        idx = f.split('/')[-1].split('.jpg')[0]
        data[idx] = image_featured

    # Labels
    print('Loading labels...')
    Dp = pd.read_csv(train_data_path + 'labels.csv').set_index('image')

    # Aligning inputs and labels
    print('Aligning inputs and labels...')
    X = []
    y = []
    for key, val in data.items():
        try:
            y.append(np.array(Dp.loc[key]))
            X.append(val)
        except (KeyError, ValueError):
            # skip images with no matching label row
            print(key)
    return np.array(X), np.array(y)
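

# A small illustration of the Sobel step above, assuming a tiny synthetic
# image (dark upper half, bright lower half) instead of a real lesion photo:
def _demo_sobel():
    ker0 = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]])
    img = np.zeros((6, 6))
    img[3:, :] = 255  # one horizontal edge between rows 2 and 3
    edges = si.convolve2d(img, ker0, mode='same')
    # strong response along the edge rows, (near-)zero in the flat interior
    print(edges)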


def get_normalized(nb_images=None, train_percent=80, train_data_path='./data/train/'):
    """Format and return the ISIC data as np arrays, with the red channel
    histogram-matched to the green channel."""
    # Inputs
    print('Loading inputs...')
    files = glob(train_data_path + '*.jpg')[:nb_images]
    nb_images = len(files)  # also handles the default nb_images=None
    threshold = int(nb_images * train_percent / 100)
    print(threshold, 'images will be used for the train set,',
          nb_images - threshold, 'images will be used for the validation set')
    data = {}
    for f in files:
        image = Image.open(f)
        idx = f.split('/')[-1].split('.jpg')[0]
        data[idx] = np.array(image)

    # Labels
    print('Loading labels...')
    Dp = pd.read_csv(train_data_path + 'labels.csv').set_index('image')

    # Aligning inputs and labels
    print('Aligning inputs and labels...')
    X_train = []
    y_train = []
    X_val = []
    y_val = []
    for counter, (key, val) in enumerate(data.items(), 1):
        # Match the red channel's histogram to the green channel's
        val[:, :, 0] = hist_match(val[:, :, 0], val[:, :, 1])
        try:
            if counter <= threshold:
                y_train.append(np.array(Dp.loc[key]))
                X_train.append(val)
            else:
                y_val.append(np.array(Dp.loc[key]))
                X_val.append(val)
        except (KeyError, ValueError):
            # skip images with no matching label row
            print(key)
    return np.array(X_train), np.array(y_train), np.array(X_val), np.array(y_val)


def get_bis_normalized(train_data_dir, train_data_list, train_percent=80):
    """Format and return the ISIC data as np arrays, with the red channel
    histogram-matched to the green channel."""
    # Shuffle the train_data_list so the train/validation split is random
    shuffle(train_data_list)
    # Inputs
    nb_images = len(train_data_list)
    threshold = int(nb_images * train_percent / 100)
    print(threshold, 'images will be used for the train set,',
          nb_images - threshold, 'images will be used for the validation set')
    print('Loading inputs...')
    data = {}
    for f in train_data_list:
        image = Image.open(train_data_dir + str(f) + '.jpg')
        data[str(f)] = np.array(image)
    print(len(data), 'images loaded')
    # Labels
    print('Loading labels...')
    Dp = pd.read_csv(train_data_dir + 'labels.csv').set_index('image')
    # Aligning inputs and labels
    print('Aligning inputs and labels...')
    X_train = []
    y_train = []
    X_val = []
    y_val = []
    for counter, (key, val) in enumerate(data.items(), 1):
        # Match the red channel's histogram to the green channel's
        val[:, :, 0] = hist_match(val[:, :, 0], val[:, :, 1])
        try:
            if counter <= threshold:
                y_train.append(np.array(Dp.loc[key]))
                X_train.append(val)
            else:
                y_val.append(np.array(Dp.loc[key]))
                X_val.append(val)
        except (KeyError, ValueError):
            # skip images with no matching label row
            print(key)
    return np.array(X_train), np.array(y_train), np.array(X_val), np.array(y_val)

def pred_class(y, trunked=False):
    """When y is an array of predictions, find the class predicted.

    Returns two DataFrames:
        class_index: the predicted class indices (1-based)
        class_name: the predicted disease names
    and a vector class_max = [name of the most common class, percentage of
    this class], plus the one-hot prediction matrix y_binary.
    """
    # With trunked=True only three classes are predicted (MEL, NV, BKL)
    if trunked:
        names = ['MEL', 'NV', 'BKL']
    else:
        names = ['MEL', 'NV', 'BCC', 'AKIEC', 'BKL', 'DF', 'VASC']
    class_name = [None] * len(y)
    class_index = [None] * len(y)
    y_binary = np.zeros(y.shape)
    for i in range(len(y)):
        val = np.argmax(y[i])
        class_index[i] = val + 1
        class_name[i] = names[val]
        y_binary[i, val] = 1
    class_index = pd.DataFrame(class_index, columns=['Class index'])
    class_name = pd.DataFrame(class_name, columns=['Class name'])
    overview = class_name.apply(pd.value_counts)
    class_max = [overview.idxmax()[0], 100 * overview.max()[0] / overview.sum()[0]]
    return class_index, class_name, class_max, y_binary
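

# A minimal usage sketch for pred_class on made-up score vectors
# (trunked=True, so the three classes are MEL, NV and BKL):
def _demo_pred_class():
    y = np.array([[0.7, 0.2, 0.1],   # -> MEL
                  [0.1, 0.8, 0.1],   # -> NV
                  [0.2, 0.2, 0.6]])  # -> BKL
    class_index, class_name, class_max, y_binary = pred_class(y, trunked=True)
    print(class_max)  # most common predicted class and its share in percent
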

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
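

# A minimal usage sketch tying plot_confusion_matrix to sklearn's
# metrics.confusion_matrix; the labels and predictions below are made up:
def _demo_confusion_matrix():
    y_true = ['MEL', 'NV', 'NV', 'BKL', 'MEL', 'BKL']
    y_pred = ['MEL', 'NV', 'BKL', 'BKL', 'NV', 'BKL']
    labels = ['MEL', 'NV', 'BKL']
    cm = metrics.confusion_matrix(y_true, y_pred, labels=labels)
    plot_confusion_matrix(cm, classes=labels, normalize=False)
    plt.show()
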

def wrong_prediction(y_true, y_predicted):
    """Return the indices that correspond to a wrong prediction."""
    # A prediction is correct only when every one-hot component matches
    pred = np.sum(y_true == y_predicted, axis=1)
    # indices where the prediction is wrong (not all components match)
    indices = [x for x, y in enumerate(pred) if y != y_true.shape[1]]
    true_pred = y_true[indices]
    false_pred = y_predicted[indices]
    return indices, true_pred, false_pred
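

# A minimal usage sketch for wrong_prediction on made-up one-hot labels:
def _demo_wrong_prediction():
    y_true = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    y_pred = np.array([[1, 0, 0], [0, 0, 1], [0, 0, 1]])
    indices, true_pred, false_pred = wrong_prediction(y_true, y_pred)
    print(indices)  # [1] -- only the second sample is misclassified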
