Secure Data Retrieval

This is Lab 4 of the Big Data Security and Privacy course; the requirements are shown in the figure below:

(figure: lab requirements)

Implementation:
• Preprocess the raw data set, then compute the average age
• Apply k-anonymity with the mondrian library and compute the average age of the anonymized release
• Publish a differentially private average by adding Laplace noise
• Randomly delete one record and recompute the averages, to compare how much each release leaks about the deleted user (a minimal demonstration of this differencing attack follows the list)
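
Why step 4 matters: if exact averages are published both before and after a single record is removed, a differencing attack reconstructs the removed value exactly. A short self-contained sketch in Python (the ages are made up for illustration):

ages = [23, 31, 45, 52, 60]
n = len(ages)
avg_all = sum(ages) / n              # published average over all n records
avg_rest = sum(ages[1:]) / (n - 1)   # published average after deleting ages[0]
recovered = avg_all * n - avg_rest * (n - 1)
print(recovered)                     # 23.0 -- the deleted value leaks exactly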

Results

(figure: program output)

Code

Project structure:
(figure: directory layout)

main.py

import copy
import random
import sys

sys.path.append('./')
import utils                              # local helper module, listed below
from utils import preprocess, txt_Reader
from library.mondrian import *            # Mondrian k-anonymity implementation
filename = "adult.data.txt"
title_column = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
'marital-status', 'occupation', 'relationship', 'race', 'sex',
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'class']
QI_list = ['age', 'sex', 'race']
data = txt_Reader(filename).read_txt('./Adult Data Set/', title_column)
data.sort_values(by='age', ascending=True, inplace=True)
rawlen = len(data)
preprocess(data)  # drop rows with missing values
print("Raw data: {} rows; {} rows remain after cleaning".format(rawlen, len(data)))
raw_data_list = utils.df2list(data)  # DataFrame -> list of rows
pre_ages = [int(item[0]) for item in raw_data_list]

avg_pre_ages = sum(pre_ages) / len(raw_data_list)  # average age of the raw data
k = int(input("Enter k for k-anonymity: "))

DATA, order = utils.read_data()
res, b = mondrian(DATA, k, False)  # k-anonymize with Mondrian
res = utils.covert_to_raw(res, order)
post_ages = [item[0] for item in res]
avg_post_ages = utils.cal_post_ages(post_ages)
utils.write_result(res, k)
print("原数据的平均年龄为: %f" %avg_pre_ages)
print("%d-匿名后的平均年龄为:"%k, avg_post_ages)

dp_ages = utils.diff_privacy_add_laplace_noise(pre_ages, 0, 1)
avg_dp_ages = utils.avg_ages(dp_ages)
print("差分隐私后的平均年龄为: %f" %avg_dp_ages)


# After randomly deleting one record, compare how much the exact release, the
# k-anonymized release, and the DP release each leak about that user's age.
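# Differencing: if avg_n is the exact mean over n records and avg_{n-1} the mean
# after one deletion, then avg_n * n - avg_{n-1} * (n - 1) recovers the deleted
# value exactly; the anonymized and noised averages should blur this (see val_pre
# and friends below).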

idx = random.randint(0, len(pre_ages) - 1)  # randint is inclusive at both ends
co_pre_ages = copy.deepcopy(pre_ages)
co_post_ages = copy.deepcopy(post_ages)

co_pre_ages.pop(idx)
co_post_ages.pop(idx)
co_dp_ages = utils.diff_privacy_add_laplace_noise(co_pre_ages, 0, 1)  # fresh noise on the remaining records

print("=====================================================================================")
print("随机删除的数据为", pre_ages[idx])
avg_pre = utils.avg_ages(co_pre_ages)
avg_post = utils.cal_post_ages(co_post_ages)
avg_dp = utils.avg_ages(co_dp_ages)
print("平均年龄分别为原数据{}、k-匿名后数据{}、差分隐私后数据{}".format(avg_pre, avg_post, avg_dp))

val_pre = avg_pre_ages * (len(co_pre_ages) + 1) - avg_pre * len(co_pre_ages)
val_post = avg_post_ages * (len(co_post_ages) + 1) - avg_post * len(co_post_ages)
val_dp = avg_dp_ages * (len(co_dp_ages) + 1) - avg_dp * len(co_dp_ages)
print("随机删除的用户年龄数据分别为:原数据推断年龄{}、k-匿名后数据推断年龄{}、差分隐私后数据推断年龄{}".format(val_pre, val_post, val_dp))

utils.py

import time
from datetime import datetime
import numpy as np
import pandas as pd

AGE_CONF = './hierarchy/age_hierarchy.txt'
WORKCLASS_CONF = './hierarchy/workclass_hierarchy.txt'
EDU_CONF = './hierarchy/education_hierarchy.txt'
EDUNUM_CONF = './hierarchy/edunum_hierarchy.txt'
MARITAL_CONF = './hierarchy/martial_hierarchy.txt'
RELATIONSHIP_CONF = './hierarchy/relationship_hierarchy.txt'
RACE_CONF = './hierarchy/race_hierarchy.txt'
SEX_CONF = './hierarchy/sex_hierarchy.txt'
HPW_CONF = './hierarchy/hours_per_week_hierarchy.txt'
COUNTRY_CONF = './hierarchy/country_hierarchy.txt'

title_column = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
'marital-status', 'occupation', 'relationship', 'race', 'sex',
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'class']

QI_INDEX = [0, 1, 4, 5, 6, 8, 9, 13]  # age, workclass, education-num, marital-status, occupation, race, sex, native-country
IS_CAT = [False, True, False, True, True, True, True, True]  # whether each QI is categorical
SA_INDEX = -1  # sensitive attribute: the last column ('class')
__DEBUG = False
INTUITIVE_ORDER = None


def preprocess(load_data: pd.DataFrame):
    """Drop rows that contain the ' ?' missing-value marker."""
load_data.replace(' ?', np.nan, inplace=True)
load_data.dropna(axis=0, how='any', inplace=True)

def read_data():
    """Read the Adult data set, keeping only the QI columns plus the sensitive
    attribute. Categorical values are integer-encoded in first-seen order; the
    per-attribute orders are returned so encoded results can be decoded later."""
QI_num = len(QI_INDEX)
data = []
intuitive_dict = []
intuitive_order = []
intuitive_number = []
for i in range(QI_num):
intuitive_dict.append(dict())
intuitive_number.append(0)
intuitive_order.append(list())
    data_file = open('Adult Data Set/adult.data.txt', 'r')  # 'rU' mode was removed in Python 3.11
for line in data_file:
line = line.strip()
# remove empty and incomplete lines
# only 30162 records will be kept
if len(line) == 0 or '?' in line:
continue
        # strip all spaces (the file uses ', ' as its separator)
        line = line.replace(' ', '')
temp = line.split(',')
ltemp = []
for i in range(QI_num):
index = QI_INDEX[i]
if IS_CAT[i]:
try:
ltemp.append(intuitive_dict[i][temp[index]])
except KeyError:
intuitive_dict[i][temp[index]] = intuitive_number[i]
ltemp.append(intuitive_number[i])
intuitive_number[i] += 1
intuitive_order[i].append(temp[index])
else:
ltemp.append(int(temp[index]))
ltemp.append(temp[SA_INDEX])
data.append(ltemp)
    data_file.close()
    return data, intuitive_order
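
# Example of the encoding (hypothetical values): if the first two 'sex' values
# encountered are 'Male' then 'Female', they are stored as 0 and 1, and the
# corresponding intuitive_order sub-list ['Male', 'Female'] decodes them later.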



class txt_Reader():
def __init__(self, filename):
self.filename = filename

def read_txt(self, path: str, title_column: list) -> pd.DataFrame:
filepath = path + self.filename
txtlist = []
with open(filepath, encoding='gbk') as f:
for line in f:
txtlist.append(line.strip().split(","))
return pd.DataFrame(txtlist, columns=title_column)

class xlsx_Reader():
def __init__(self, filename):
self.filename = filename

def read_xlsx(self, path: str) -> pd.DataFrame:
filepath = path + self.filename
return pd.read_excel(filepath)


def write_result(result, k):
with open("res/adult_%d_kanonymity.data" %k, "w") as f:
for line in result:
f.write(','.join(line) + '\n')


def df2list(df: pd.DataFrame) -> list:
data_array = np.array(df)
new_data_array = []
for item in data_array:
line = []
for i in item:
line.append(i.strip())
new_data_array.append(line)
return new_data_array


def generate_categorical_loss_metric_map(leaves_num, hierarchies):
    """For each value v in a generalization hierarchy, the loss metric is
    LM(v) = (leaves(v) - 1) / (total_leaves - 1): 0 for a leaf value, 1 for '*'."""
    loss_metric_map = {attr: {} for attr in hierarchies.keys()}
print('\nleaves_num:\n', leaves_num)
for attr, vals in hierarchies.items():
loss_metric_map[attr]['*'] = 1
for v in vals:
if v in leaves_num[attr].keys():
loss_metric_map[attr][v] = (leaves_num[attr][v] - 1) / (leaves_num[attr]['*'] - 1)
else:
loss_metric_map[attr][v] = 0
return loss_metric_map
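
# Example (hypothetical two-leaf 'sex' hierarchy): the suppressed value '*'
# covers both leaves and gets loss 1, while a leaf value such as 'Male'
# covers a single leaf and gets loss (1 - 1) / (2 - 1) = 0.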


def categorical_loss_metric(qi_columns, leaves_num, hierarchies, sup):
loss_metric_map = generate_categorical_loss_metric_map(leaves_num, hierarchies)
print('\nloss_metric_map:\n', loss_metric_map)
loss_metric = 0

for attr in qi_columns:
col = qi_columns[attr].tolist()
# the loss for an attribute is the AVERAGE of the loss for all tuples
# the loss for the entire data set is the SUM of the losses for each attribute
sum_attr_lm = sum([loss_metric_map[attr][str(v)] for v in col])
loss_metric += (sum_attr_lm + sup) / (len(col) + sup)
return loss_metric


def compute_numerical_loss_metric(column):
loss = 0
# initialize lowest and highest values
if not isinstance(column[0], int): # string value, e.g., '35-40'
current_range = [int(i) for i in list(column[0].replace(' ', '').split('-'))]
lowest, highest = current_range[0], current_range[1]
else: # integer value, e.g., 37
lowest, highest = column[0], column[0]

# iterate through column
for v in column:
if not isinstance(v, int): # extract range from table content (string, e.g., '35-40')
current_range = [int(i) for i in list(v.replace(' ', '').split('-'))]
loss += current_range[1] - current_range[0]
# update lowest & highest
lowest = min(lowest, current_range[0])
highest = max(highest, current_range[1])
else: # integer value, loss is 0 here
lowest = min(lowest, v)
highest = max(highest, v)

    max_range = highest - lowest
    if max_range == 0:  # all cells identical: nothing was generalized, no loss
        return 0.0
    return loss / (max_range * len(column))  # average normalized loss per cell
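
# Example (hypothetical): with ages spanning 17..90 in a column, a cell
# generalized to '35~40' has normalized loss (40 - 35) / (90 - 17); the metric
# returned is the average of this quantity over all cells in the column.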


def numerical_loss_metric(qi_columns):
loss_metric = 0
for attr in qi_columns:
col = qi_columns[attr].tolist()
# the loss for the entire data set is the SUM of the losses for each attribute
loss_metric += compute_numerical_loss_metric(col)
return loss_metric

def cmp(x, y):
if x > y:
return 1
elif x==y:
return 0
else:
return -1


def cmp_str(element1, element2):
"""
    Compare numbers in string format correctly.
"""
try:
return cmp(int(element1), int(element2))
except ValueError:
return cmp(element1, element2)

def cmp_value(element1, element2):
if isinstance(element1, str):
return cmp_str(element1, element2)
else:
return cmp(element1, element2)


def value(x):
'''Return the numeric type that supports addition and subtraction'''
if isinstance(x, (int, float)):
return float(x)
elif isinstance(x, datetime):
return time.mktime(x.timetuple())
# return x.timestamp() # not supported by python 2.7
else:
try:
return float(x)
        except (TypeError, ValueError):
            return x


def merge_qi_value(x_left, x_right, connect_str='~'):
'''Connect the interval boundary value as a generalized interval and return the result as a string
return:
result:string
'''
if isinstance(x_left, (int, float)):
if x_left == x_right:
result = '%d' % (x_left)
else:
result = '%d%s%d' % (x_left, connect_str, x_right)
elif isinstance(x_left, str):
if x_left == x_right:
result = x_left
else:
result = x_left + connect_str + x_right
    elif isinstance(x_left, datetime):
        # Generalize datetime values into a 'begin~end' string
        begin_date = x_left.strftime("%Y-%m-%d %H:%M:%S")
        end_date = x_right.strftime("%Y-%m-%d %H:%M:%S")
        result = begin_date + connect_str + end_date
    else:
        result = str(x_left) + connect_str + str(x_right)  # fallback: avoid UnboundLocalError
    return result




def write_to_file(result, k):

with open("res/adult_%d_kanonymity.data" %k, "w") as output:
for r in result:
output.write(';'.join(r) + '\n')

def covert_to_raw(result, order, connect_str='~'):
    """Decode Mondrian's integer-encoded records back to the original values,
    expanding a generalized range such as '0~3' into the matching category
    names joined by connect_str; numeric attributes pass through unchanged."""
covert_result = []
qi_len = len(order)
for record in result:
covert_record = []
for i in range(qi_len):
if len(order[i]) > 0:
vtemp = ''
if connect_str in record[i]:
temp = record[i].split(connect_str)
raw_list = []
for j in range(int(temp[0]), int(temp[1]) + 1):
raw_list.append(order[i][j])
vtemp = connect_str.join(raw_list)
else:
vtemp = order[i][int(record[i])]
covert_record.append(vtemp)
else:
covert_record.append(record[i])
if isinstance(record[-1], str):
covert_result.append(covert_record + [record[-1]])
else:
covert_result.append(covert_record + [connect_str.join(record[-1])])
return covert_result
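
# Example (hypothetical): if order[i] == ['White', 'Black', 'Asian'], the
# generalized cell '0~2' decodes to 'White~Black~Asian'.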

def split_scale(age: str) -> float:
pos = age.find("~")
low = age[0:pos]
high = age[pos + 1:len(age)]
return (int(low) + int(high)) / 2.0


def cal_post_ages(post_ages: list) -> float:
post_sum = 0
for item in post_ages:
if "~" in item:
post_sum += split_scale(item)
        else:
            post_sum += int(item)
    return post_sum / len(post_ages)
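
# Example (hypothetical values): cal_post_ages(['35~40', '52']) averages the
# interval midpoint 37.5 with 52, giving 44.75.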

def avg_ages(ages: list) -> float:
return sum(ages) / len(ages)


def diff_privacy_add_laplace_noise(ages: list, loc, scale):
    """Add i.i.d. Laplace(loc, scale) noise to every age. Note the scale is a
    fixed parameter here rather than being derived from a privacy budget."""
    laplace_noise = np.random.laplace(loc, scale, len(ages))
    return [age + noise for age, noise in zip(ages, laplace_noise)]
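
A note on the noise parameters: main.py calls diff_privacy_add_laplace_noise(pre_ages, 0, 1) with a hard-coded scale of 1 instead of deriving the scale from a privacy budget epsilon. Below is a minimal sketch of an epsilon-parameterized Laplace release of the mean, assuming ages are clipped to [0, 100] so that changing one record shifts the mean by at most 100/n; the function name and bounds are my own, not part of the lab code:

import numpy as np

def dp_mean_laplace(ages, epsilon, lower=0, upper=100):
    # Bound each record's influence, then add Laplace noise calibrated to the
    # sensitivity of the mean query, (upper - lower) / n.
    clipped = np.clip(ages, lower, upper)
    sensitivity = (upper - lower) / len(clipped)
    return clipped.mean() + np.random.laplace(0, sensitivity / epsilon)

With roughly 30,000 cleaned records and epsilon = 1, the noise on the mean is tiny, which matches the intuition that an average over many records depends very little on any single one.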

As for the Mondrian library itself, any open-source implementation from GitHub will do.