Demo entry 6682267

Solution

Submitted by anonymous on Dec 08, 2017 at 01:23
Language: Python. Code size: 1.7 kB.

import hashlib
import os
from collections import defaultdict


def get_duplicate_files(root_dir):
    hash_to_paths = defaultdict(list)
    get_duplicate_files_helper(root_dir, hash_to_paths, set(), {})
    return [paths for paths in hash_to_paths.values() if len(paths) > 1]


def compute_hash(file_path):
    # Hash the file's contents in chunks so large files are not read into memory at once.
    hasher = hashlib.sha256()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            hasher.update(chunk)
    return hasher.hexdigest()


def get_duplicate_files_helper(root_dir, hash_to_paths, computed_paths, size_to_paths):

    for entry in os.listdir(root_dir):
        entry_path = os.path.join(root_dir, entry)

        if os.path.isdir(entry_path):
            get_duplicate_files_helper(entry_path, hash_to_paths, computed_paths, size_to_paths)
        else:
            file_path = entry_path
            size = os.path.getsize(file_path)

            # If we have previously encountered any files of the same size,
            # we go through them one at a time, compute their hashes if necessary
            # and append them to @hash_to_paths. This in and of itself causes
            # file paths with equal hashes to be grouped together by hash
            # in @hash_to_paths.
            if size in size_to_paths:
                hash_ = compute_hash(file_path)
                hash_to_paths[hash_].append(file_path)
                computed_paths.add(file_path)

                for same_size_path in size_to_paths[size]:
                    if same_size_path not in computed_paths:
                        same_path_hash = compute_hash(same_size_path)
                        hash_to_paths[same_path_hash].append(same_size_path)
                        computed_paths.add(same_size_path)

            # In any case, we need to remember the size of the current file.
            size_to_paths.setdefault(size, []).append(file_path)
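
A minimal usage sketch, assuming the functions above are in scope; the directory path below is a placeholder and not part of the original snippet:

if __name__ == '__main__':
    # Placeholder path; substitute any directory tree you want to scan.
    duplicate_groups = get_duplicate_files('/tmp/example_dir')
    for group in duplicate_groups:
        print('Files with identical contents:')
        for path in group:
            print('  ' + path)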

This snippet took 0.01 seconds to highlight.
