from paddleocr import PaddleOCR
import json
from PIL import Image, ImageDraw, ImageFont
import os
import glob
# Build the PaddleOCR instance used for every image in this script.
# Note that all three optional flags below are passed as False, i.e. the
# corresponding stages (including text-line orientation) are NOT enabled.
ocr = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False,
)
# Labels of the licence fields that this script treats as sensitive:
# plate number, VIN, engine number, address and owner.
sensitive_keywords = [
    "号牌号码",
    "车辆识别代号",
    "发动机号码",
    "住址",
    "所有人",
]
# ---------------------------------------------------------------------------
# Lookup tables (loop-invariant, so they are built once at import time).
# ---------------------------------------------------------------------------

# Characters accepted as the first character of a licence plate.  "贵" and
# "云" are the characters actually printed on Guizhou / Yunnan plates; the
# formal abbreviations "黔" / "滇" are kept for backward compatibility.
PROVINCE_CODES = frozenset(
    "京津冀晋蒙辽吉黑沪苏浙皖闽赣鲁豫鄂湘粤桂琼渝川黔滇藏陕甘青宁新港澳台"
    "贵云"
)

# Misrecognised leading character -> intended province character.
# Only consulted when the leading character is not already a valid code.
# "干" could stand for either "赣" or "甘"; "甘" is kept because that is the
# mapping that was effectively in force before (later duplicate key).
PROVINCE_FAULT_TOLERANCE = {
    "相": "湘", "箱": "湘", "香": "湘",
    "州": "川", "卅": "川",
    "去": "云", "厶": "云",
    "峡": "陕", "侠": "陕",
    "鱼": "鲁", "卤": "鲁", "售": "鲁",
    "予": "豫", "像": "豫",
    "曰": "粤", "奥": "粤",
    "挂": "桂",
    "办": "苏", "束": "苏",
    "折": "浙", "哲": "浙",
    "完": "皖", "碗": "皖",
    "门": "闽", "闵": "闽",
    "贡": "赣",
    "e": "鄂", "哦": "鄂",
    "俞": "渝", "于": "渝",
    "普": "晋",
    "翼": "冀", "记": "冀",
    "了": "辽", "疗": "辽",
    "古": "吉", "告": "吉",
    "里": "黑", "墨": "黑",
    "户": "沪", "护": "沪",
    "孟": "蒙", "猛": "蒙",
    "金": "津", "今": "津",
    "惊": "京", "景": "京",
    "穷": "琼", "穹": "琼",
    "柠": "宁",
    "清": "青", "轻": "青",
    "干": "甘", "敢": "甘",
    "辛": "新", "心": "新",
    "葬": "藏",
}

# Chinese characters that may legitimately end a plate (police, learner,
# trailer, Hong Kong / Macau cross-border, consular, embassy).
PLATE_SUFFIXES = frozenset("警学挂港澳领使")

# VINs never contain the letters I, O or Q.
VIN_ALPHABET = frozenset("ABCDEFGHJKLMNPRSTUVWXYZ0123456789")

# English captions that may sit between the engine-number label and its value.
ENGINE_LABELS_EN = ("Engine No", "Engine No.", "ENGINE NO")

# Substrings that mark a text as an address.
ADDRESS_MARKERS = ("省", "市", "县", "镇", "乡", "村", "路", "街", "巷", "号", "栋", "单元")

# Substrings used to locate the owner label.
OWNER_KEYWORDS = ("所有人", "所有")

# Substrings that mark a text as an organisation name.
COMPANY_MARKERS = ("公司", "有限公司", "企业", "集团", "工厂", "厂", "单位")

# Field labels that must never be reported as the owner.
NON_OWNER_LABELS = (
    "住址", "地址", "Addr", "使用性质", "品牌型号", "车辆类型",
    "号牌号码", "发动机号码", "车辆识别代号", "注册日期", "发证日期",
)

# Surname list used by the name heuristics.
COMMON_SURNAMES = (
    "王", "李", "张", "刘", "陈", "杨", "赵", "黄", "周", "吴", "徐", "孙", "胡", "朱", "高", "林", "何", "郭", "马", "罗",
    "梁", "宋", "郑", "谢", "韩", "唐", "冯", "于", "董", "萧", "程", "曹", "袁", "邓", "许", "傅", "沈", "曾", "彭", "吕",
    "苏", "卢", "蒋", "蔡", "贾", "丁", "魏", "薛", "叶", "阎", "余", "潘", "杜", "戴", "夏", "钟", "汪", "田", "任", "姜",
    "范", "方", "石", "姚", "谭", "廖", "邹", "熊", "金", "陆", "郝", "孔", "白", "崔", "康", "毛", "邱", "秦", "江", "史",
    "顾", "侯", "邵", "孟", "龙", "万", "段", "漕", "钱", "汤", "尹", "黎", "易", "常", "武", "乔", "贺", "赖", "龚", "文",
)


# ---------------------------------------------------------------------------
# Small helpers
# ---------------------------------------------------------------------------

def _make_info(keyword, value, box):
    """Return the record stored for one detected sensitive field."""
    return {"keyword": keyword, "value": value, "box": box}


def _is_ascii(text):
    """Return True when every character of *text* is ASCII (also for "")."""
    return all(ord(ch) < 128 for ch in text)


def _normalise_plate(text):
    """Return *text* with a valid leading province character, or None.

    A leading character that is already valid is kept; a known
    misrecognition is replaced; anything else is rejected.
    """
    head = text[0]
    if head in PROVINCE_CODES:
        return text
    corrected = PROVINCE_FAULT_TOLERANCE.get(head)
    if corrected is None:
        return None
    return corrected + text[1:]


def _has_plate_tail(text):
    """Return True when everything after the first character looks like a plate.

    The tail may not contain Chinese characters, except for one optional
    trailing category suffix.  This keeps ordinary Chinese phrases that
    merely start with a province character from being taken for a plate.
    """
    tail = text[1:]
    if tail and tail[-1] in PLATE_SUFFIXES:
        tail = tail[:-1]
    return bool(tail) and not any("\u4e00" <= ch <= "\u9fff" for ch in tail)


def _is_engine_format(text):
    """Return True when *text* has the shape of an engine number.

    Accepted shapes: 6-10 ASCII digits, or 8-12 ASCII letters/digits.
    The ASCII requirement matters because ``str.isalnum()`` is also true
    for Chinese characters.
    """
    if not (_is_ascii(text) and text.isalnum()):
        return False
    length = len(text)
    return (text.isdigit() and 6 <= length <= 10) or 8 <= length <= 12


# ---------------------------------------------------------------------------
# Field detectors.  Each takes the parallel OCR lists ``rec_texts`` and
# ``rec_boxes`` and returns an info dict, or None when nothing was found.
# ---------------------------------------------------------------------------

def find_plate_number(rec_texts, rec_boxes):
    """Locate the licence plate number.

    Rule 1 takes the text that follows the "号牌号码" label.  Rule 2 accepts
    any 6-8 character text that starts with a province character (or a known
    misrecognition of one) and has a plate-like tail.

    The returned dict carries the first candidate as ``value``/``box`` and,
    under ``all_boxes``, the distinct boxes of every candidate so that each
    printed copy of the plate can be masked.
    """
    candidates = []

    # Rule 1: value right after the label.
    for i, text in enumerate(rec_texts[:-1]):
        if text != "号牌号码":
            continue
        following = rec_texts[i + 1]
        if len(following) < 6:
            continue
        plate = _normalise_plate(following)
        if plate is not None:
            candidates.append((plate, rec_boxes[i + 1]))

    # Rule 2: free-standing text with plate shape.
    for text, box in zip(rec_texts, rec_boxes):
        if 6 <= len(text) <= 8 and _has_plate_tail(text):
            plate = _normalise_plate(text)
            if plate is not None:
                candidates.append((plate, box))

    if not candidates:
        return None

    # Both rules usually hit the same box; keep each box only once.
    all_boxes = []
    seen = set()
    for _, box in candidates:
        key = tuple(box)
        if key not in seen:
            seen.add(key)
            all_boxes.append(box)

    value, box = candidates[0]
    info = _make_info("号牌号码", value, box)
    info["all_boxes"] = all_boxes
    return info


def find_vin(rec_texts, rec_boxes):
    """Locate the vehicle identification number (17 characters).

    The first 17-character text starting with "L" wins.  If there is none,
    the first 17-character text made only of valid VIN characters and
    containing at least one digit is used instead.
    """
    fallback = None
    for i, text in enumerate(rec_texts):
        if len(text) != 17:
            continue
        if text.startswith("L"):
            return _make_info("车辆识别代号", text, rec_boxes[i])
        if (
            fallback is None
            and all(ch in VIN_ALPHABET for ch in text)
            and any(ch.isdigit() for ch in text)
        ):
            fallback = i
    if fallback is None:
        return None
    return _make_info("车辆识别代号", rec_texts[fallback], rec_boxes[fallback])


def find_engine_number(rec_texts, rec_boxes, plate_value=None):
    """Locate the engine number.

    Rule 1 takes the text after the "发动机号码" label, skipping one English
    caption if present.  Rule 2 falls back to the first text with an
    engine-number shape.  Texts equal to *plate_value* are never returned.
    """
    total = len(rec_texts)

    # Rule 1: value after the label.
    for i, text in enumerate(rec_texts):
        if text != "发动机号码" or i + 1 >= total:
            continue
        candidate = rec_texts[i + 1]
        if plate_value is not None and candidate == plate_value:
            continue
        if candidate in ENGINE_LABELS_EN:
            # The real value sits behind the English caption, if any.
            if i + 2 < total:
                return _make_info("发动机号码", rec_texts[i + 2], rec_boxes[i + 2])
            continue
        return _make_info("发动机号码", candidate, rec_boxes[i + 1])

    # Rule 2: shape-based fallback.
    for i, text in enumerate(rec_texts):
        if not _is_engine_format(text):
            continue
        if plate_value is not None and text == plate_value:
            continue
        return _make_info("发动机号码", text, rec_boxes[i])

    return None


def find_address(rec_texts, rec_boxes):
    """Return the first text longer than 8 characters with an address marker."""
    for i, text in enumerate(rec_texts):
        if len(text) > 8 and any(marker in text for marker in ADDRESS_MARKERS):
            return _make_info("住址", text, rec_boxes[i])
    return None


def find_owner(rec_texts, rec_boxes):
    """Locate the owner, trying four strategies in order.

    1. A plausible text within two positions of the first owner label
       (positions after the label are tried before those in front of it).
    2. The first text that contains an organisation marker.
    3. The first 2-4 character text starting with a listed surname.
    4. A single listed surname character close to an owner label.
    """
    total = len(rec_texts)
    label_positions = [
        i for i, text in enumerate(rec_texts)
        if any(keyword in text for keyword in OWNER_KEYWORDS)
    ]

    # Strategy 1: neighbours of the first owner label.
    if label_positions:
        idx = label_positions[0]
        after = range(idx + 1, min(total, idx + 3))
        # The stop is exclusive, so -1 is needed for index 0 to be visited.
        before = range(idx - 1, max(-1, idx - 3), -1)
        for i in (*after, *before):
            text = rec_texts[i]
            if text in NON_OWNER_LABELS:
                continue
            # Owners are expected to contain Chinese; this also skips "".
            if _is_ascii(text):
                continue
            symbol_count = sum(
                1 for ch in text if not ch.isalnum() and not ch.isspace()
            )
            if text.isdigit() or symbol_count > 2:
                continue
            return _make_info("所有人", text, rec_boxes[i])

    # Strategy 2: organisation names.
    for i, text in enumerate(rec_texts):
        if any(marker in text for marker in COMPANY_MARKERS):
            return _make_info("所有人", text, rec_boxes[i])

    # Strategy 3: short text starting with a listed surname.
    for i, text in enumerate(rec_texts):
        if (
            text.startswith(COMMON_SURNAMES)
            and text not in NON_OWNER_LABELS
            and 2 <= len(text) <= 4
        ):
            return _make_info("所有人", text, rec_boxes[i])

    # Strategy 4: a lone surname character near an owner label.
    # The window covers positions i-3 .. i+2, as in the original script.
    for i, text in enumerate(rec_texts):
        if text not in COMMON_SURNAMES:
            continue
        window = rec_texts[max(0, i - 3):min(total, i + 3)]
        if any(keyword in near for near in window for keyword in OWNER_KEYWORDS):
            return _make_info("所有人", text, rec_boxes[i])

    return None


def extract_sensitive_info(rec_texts, rec_boxes):
    """Run all detectors and return the found records.

    Order of the result: plate, VIN, engine number, address, owner.
    """
    plate = find_plate_number(rec_texts, rec_boxes)
    plate_value = plate["value"] if plate else None
    found = [
        plate,
        find_vin(rec_texts, rec_boxes),
        find_engine_number(rec_texts, rec_boxes, plate_value),
        find_address(rec_texts, rec_boxes),
        find_owner(rec_texts, rec_boxes),
    ]
    return [info for info in found if info is not None]


def mask_sensitive_regions(draw, sensitive_info, rec_texts, polys):
    """Black out every region belonging to *sensitive_info* on *draw*.

    *draw* is an ``ImageDraw`` object; *polys* holds one polygon per entry
    of *rec_texts*.
    """
    for info in sensitive_info:
        value = info["value"]
        masked = False
        # No early exit: when both licence pages are in one image the same
        # value is printed more than once and every copy must be covered.
        for i, text in enumerate(rec_texts):
            if text == value and i < len(polys):
                draw.polygon([tuple(point) for point in polys[i]], fill="black")
                masked = True

        extra_boxes = info.get("all_boxes")
        if extra_boxes:
            # Plates: also covers copies whose text needed correction and
            # therefore no longer equals any recognised text.
            for x1, y1, x2, y2 in extra_boxes:
                draw.rectangle([(x1, y1), (x2, y2)], fill="black")
        elif not masked:
            # No polygon available: fall back to the record's own box.
            x1, y1, x2, y2 = info["box"]
            draw.rectangle([(x1, y1), (x2, y2)], fill="black")


def process_image(image_file):
    """OCR one image, report its sensitive fields and write a masked copy.

    Files written under ``output/``: the OCR visualisation and JSON saved by
    the OCR result object, ``<name>_blurred.jpg`` and
    ``<name>_sensitive_info.json``.
    """
    print(f"\n正在处理图片:{image_file}")
    image_name = os.path.splitext(os.path.basename(image_file))[0]

    # Run OCR; the result object writes its own image / JSON files.
    for res in ocr.predict(input=image_file):
        res.print()
        res.save_to_img("output")
        res.save_to_json("output")

    json_file = f"output/{image_name}_res.json"
    with open(json_file, "r", encoding="utf-8") as f:
        ocr_result = json.load(f)

    rec_texts = ocr_result["rec_texts"]
    rec_boxes = ocr_result["rec_boxes"]
    sensitive_info = extract_sensitive_info(rec_texts, rec_boxes)

    print("\n提取的敏感信息:")
    for info in sensitive_info:
        print(f"{info['keyword']}: {info['value']}")

    # Prefer the polygons that are index-aligned with rec_texts; fall back
    # to the raw detection polygons when they are not present.
    polys = ocr_result.get("rec_polys") or ocr_result.get("dt_polys", [])

    # Convert to RGB so that RGBA / palette inputs can be saved as JPEG.
    with Image.open(image_file) as source:
        image = source.convert("RGB")
    mask_sensitive_regions(ImageDraw.Draw(image), sensitive_info, rec_texts, polys)

    output_path = f"output/{image_name}_blurred.jpg"
    image.save(output_path)
    print(f"\n打码后的图片已保存至:{output_path}")

    sensitive_file = f"output/{image_name}_sensitive_info.json"
    with open(sensitive_file, "w", encoding="utf-8") as f:
        json.dump(sensitive_info, f, ensure_ascii=False, indent=2)
    print(f"敏感信息已保存至:{sensitive_file}")


# Process every image found directly under images/.
image_files = glob.glob("images/*.jpg") + glob.glob("images/*.png") + glob.glob("images/*.jpeg")
for image_file in image_files:
    process_image(image_file)
print("\n所有图片处理完成!")