Demo entry 5406225

Output

   

Submitted by anonymous on Jun 23, 2016 at 12:45
Language: Python. Code size: 7.9 kB.

def link(df, compare, bd_flag, t, name):
	"""apply linking algorithm to sorted dataframe"""
	l = len(df)
	#counter for amount of links found
	m = 0
	#list containing all found links
	links = []
	#generate iterable containing all indices of dataframe
	toCheck = iter(range(0, l-1))
	#check record at index i
	for i in toCheck:
		#j is the index following i
		j = i+1
		#number of links found for record at index i
		n = 0
		if j < l:
			#check the next record for a link 
			while check_match_threshold(df.ix[i], df.ix[j], compare, bd_flag, t):
					#if a match is found:
					n += 1
					#record the found links as (id1, id2, block-number)
					#the block-number is the id of the record at index i
					links.append((df.ix[i]['id'], df.ix[j]['id'], df.ix[i]['id']))
					m += 1
					#increase j to inspect the next record
					j += 1
			#any records that were linked do not need to be inspected
			#they are therefore removed from toCheck
			next(islice(toCheck, n, n), None)
	#return the list containing all links
	return links

def check_match_threshold(r1, r2, f, bd_flag, t):
	"""checks if a candidate link should be accepted or rejected"""
	#a candidate link is assumed to be accepted, unless proven otherwise
	valid = True
	#if both records are of birth, birth date must match exactly
	if (r1['role'] == 1) and (r2['role'] == 1) and (r1['bdate'] != r2['bdate']):
		valid = False
	#if both records are of death, death date must match exactly
	elif (r1['role'] == 10) and (r2['role'] == 10) and (r1['ddate'] != r2['ddate']):
		valid = False
	#first name of ego must match exactly 
	elif (r1['af'] != r2['af']):
		valid = False
	elif (r1['af'] == r2['af']):
		#the amount of non-matching fields encountered
		conflict = 0 
		#for each field that is to be compared, check if fields of both records match exactly
		for field in f:
			if field != 'af':
				#if this is not the case, a conflict is recorded
				if (r1[field] != r2[field]):
					conflict += 1
		#if the amount of conflict is larger then the set treshhold t, the link is rejected
		if conflict > t:
			valid = False
	#if a procedure considers birth date
	if bd_flag:
		#if one birth date is missing, the link is rejected
		if math.isnan(r1['bd'].item()) or math.isnan(r2['bd'].item()):
			valid = False
		#a link is rejected if estimated birth dates differ more then 400 days
		elif (abs(r1['bd'].item()-r2['bd'].item()) > 400):
			valid = False
	#return wether the link is rejected (False) or accepted (True)
	return valid

def proc(name, query, compare, bd_flag, t):
	"""" executes an entire linking-procedure
	name is the user-defined code for the procedure
	query is the SQL statement defining which fields (and which manipulations) are used
	compare is a list containing all fields that should be considered when establishing links
	bd_flag is set to True if birth dates should be considered, False otherwise
	t is the threshhold for non-matching fields"""

	#construct a dataframe and sort on selected fields
	df = readsort(query, familyfront)
	
	#generate a list of links
	matches = linkAlt(df, compare, bd_flag, t, name)
	
	#store the links as a python-object for further use and as a csv
	picklewrite(matches, name)

def readsort(query,sorting):	
	"""construct and sort a dataframe"""
	#construct a dataframe based on SQL-statement
	engine = create_engine('mysql://jelte:wiskunde@localhost/zl_work')
	df = sql.read_sql(query, engine)
	#sort the dataframe using defined sorting-order
	sorted_df = df.sort_values(sorting)
	#re-index the sorted dataframe
	sorted_df = sorted_df.reset_index(drop=True)
	#append columns for fields which are not used in sorting
	sorted_df_n = sorted_df.reindex_axis(sorting+['role','id','bdate','ddate'], axis=1)
	#return the sorted dataframe
	return sorted_df_n

def procAnalyse(proc, base, fieldlist):
	"""executes analysis of links
	   proc is the name of the procedure to be analysed
	   base is the name of the procedure of which links should be subtracted from proc
	   
	   procedure 1-1 is often chosen as base, since this procedures only generates
	   links where all original fields match exactly. This leaves only links where some fields do not match
	   in the non-manipulated forms."""

	#a HDF table where different dataframes are stored
	store = HDFStore('/home/jelte/Scriptie/python/dataframes.h5')
	
	#reads links from both procedures and returns only links that are found in 'proc' and not in 'base'
	if base == '':
		#if not base procedure is defined, analyse all links of 'proc'
		matches = loadMatchesSingle(proc)	
	else:
		matches = loadMatches(proc,base)

	#reads a previously constructed dataframe containing relevant fields for all records in the datasets
	df = store['qbasemar']

	#analyse the links
	matchdf = analyze(df, matches, proc, base, fieldlist)

	#generate some statistics of the analysis
	difStats(matchdf, proc, base, fieldlist)

def analyze(df, matches, proc, base, fieldlist):
	"""analyzes selected links and saves information about them
	fieldlist contains all fields that are to be inspected"""
	
	#construct a new dataframe containing information about the links
	#in this dataframe is link is represented as a row and the columns
	#contain information about selected fields of both records of the link
	matchdf = createDf()
	matchdfc = matchdf
	#contruct a dictionary of fields in the analysis-dataframe. Each of these fields will record values for each link in a list
	#the dictionary is initialzed with some standard values (0 for numbers, None for text fields)
	d = {'proc':[proc]*l, 'm1':[0]*l, 'm2':[0]*l, 'al1':[None]*l,'al2':[None]*l, 'al_d':[0]*l, 'al_lv':[0]*l, 'af1':[None]*l,'af2':[None]*l, 'af_d':[0]*l, 'af_lv':[0]*l, 'ml1':[None]*l,'ml2':[None]*l, 'ml_d':[0]*l, 'ml_lv':[0]*l, 'mf1':[None]*l,'mf2':[None]*l, 'mf_d':[0]*l, 'mf_lv':[0]*l, 'fl1':[None]*l,'fl2':[None]*l, 'fl_d':[0]*l, 'fl_lv':[0]*l, 'ff1':[None]*l,'ff2':[None]*l, 'ff_d':[0]*l, 'ff_lv':[0]*l, 'pl1':[None]*l,'pl2':[None]*l, 'pl_d':[0]*l, 'pl_lv':[0]*l, 'pf1':[None]*l,'pf2':[None]*l, 'pf_d':[0]*l, 'pf_lv':[0]*l, 'bd_d':[0]*l, 'role1':[None]*l, 'role2':[None]*l}		
	#loop over all links
	for i, (a, b) in enumerate(matches):
		#find the records based on the IDs found in the link
		m1 = df[df.id==a]
		m2 = df[df.id==b]
		#for each field, set values for both records:
		for f in fieldlist:			
			#if the non-manipulated field does not match exactly:
			if m1[f].item() != m2[f].item():
				#save the value of the field for both records in the dictionary
				d[f+'1'][i] = m1[f].item()
				d[f+'2'][i] = m2[f].item()
				try:
					#save levensthein-distance between the fields of both records
					#if one name is not present, set levenshtein to None, otherwise it will count the distance to string 'null'
					if m1[f].item() == 'null' or m2[f].item() == 'null':
						d[f+'_lv'][i] = None
					else:
						d[f+'_lv'][i] = distance(m1[f].item(), m2[f].item())
				#if the name is not present it might be set as None or Nan when converting to dataframe
				#in this case the levenshtein-distance is also set to None
				except TypeError:
					d[f+'_lv'][i] = None
				#a value of 1 for field_d denotes the non-manipulated fields do not match
				d[f+'_d'][i] = 	1
		#set IDs of both records
		d['m1'][i] = a
		d['m2'][i] = b
		#set the difference between estimated birth dates of both records
		d['bd_d'][i] = np.abs(m1['bd'].item() - m2['bd'].item())
		#set the role of both records
		d['role1'][i] = m1['role'].item()
		d['role2'][i] = m2['role'].item()
	#convert the dictionary to a dataframe and append it to the one constructed
	adddf = pd.DataFrame.from_dict(d)	
	matchdf = matchdf.append(adddf,ignore_index=True)
	#return the dataframe containing information about each link	
	return matchdf.reindex_axis(matchdfc.columns, axis=1)

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).