Submitted by anonymous on Dec 08, 2017 at 01:23
Language: Python. Code size: 1.7 kB.

def get_duplicate_files(root_dir):
    """Return the groups of duplicate files found under ``root_dir``.

    The directory walk is delegated to ``get_duplicate_files_helper``, which
    receives a hash -> list-of-paths mapping to fill in. Every entry of that
    mapping holding more than one path is reported as one group.

    Args:
        root_dir: Directory at which the scan starts.

    Returns:
        A list of lists; each inner list contains two or more paths that
        were recorded under the same hash.
    """
    hash_to_paths = defaultdict(list)
    # Hand the helper fresh bookkeeping containers on every scan so that no
    # state can carry over from one call of this function to the next.
    get_duplicate_files_helper(
        root_dir, hash_to_paths, computed_paths=set(), size_to_paths={}
    )
    # Only the path lists are needed; ``values()`` is available on both
    # Python 2 and Python 3 dicts, whereas ``iteritems()`` is Python 2 only.
    return [paths for paths in hash_to_paths.values() if len(paths) > 1]

def get_duplicate_files_helper(root_dir, hash_to_paths, computed_paths=None, size_to_paths=None):
    """Walk ``root_dir`` recursively and group same-sized files by hash.

    Hashing is deferred: a file is only hashed once a second file of the
    same size shows up, since files of different sizes cannot be identical.

    Args:
        root_dir: Directory whose entries are examined.
        hash_to_paths: Mapping from hash to a list of paths; mutated in
            place. Both a plain dict and a ``defaultdict(list)`` work.
        computed_paths: Set of paths already recorded in ``hash_to_paths``;
            mutated in place. A fresh set is created when omitted.
        size_to_paths: Mapping from file size to the list of paths seen with
            that size; mutated in place. A fresh dict is created when
            omitted.

    Returns:
        None. Results are delivered through ``hash_to_paths``.

    NOTE(review): ``list_dir``, ``join_path``, ``is_dir``, ``get_file_size``
    and ``compute_hash`` are defined outside this block; they are presumed
    to behave like their ``os`` / ``os.path`` / hashing namesakes -- confirm.
    """
    # Defaults are created per call: a mutable default object would be shared
    # by every top-level invocation and leak state between unrelated scans.
    if computed_paths is None:
        computed_paths = set()
    if size_to_paths is None:
        size_to_paths = {}

    for entry in list_dir(root_dir):
        entry_path = join_path(root_dir, entry)

        if is_dir(entry_path):
            # Descend with the shared bookkeeping so that duplicates located
            # in different sub-directories are still matched up.
            get_duplicate_files_helper(entry_path, hash_to_paths, computed_paths, size_to_paths)
            continue

        file_path = entry_path
        size = get_file_size(file_path)

        # If we have previously encountered any files of the same size, hash
        # the current file and every such earlier file that has not been
        # hashed yet, recording each under its hash. Paths with equal hashes
        # thereby end up grouped together in @hash_to_paths.
        if size in size_to_paths:
            for candidate in [file_path] + list(size_to_paths[size]):
                if candidate not in computed_paths:
                    candidate_hash = compute_hash(candidate)
                    hash_to_paths.setdefault(candidate_hash, []).append(candidate)
                    computed_paths.add(candidate)

        # In any case, we need to remember the size of the current file.
        size_to_paths.setdefault(size, []).append(file_path)

