limengwei/ocr

代码拉取完成,页面将自动刷新

扫描微信二维码支付

取消

支付完成

richgiteeai

Watch

不关注 / 关注所有动态 / 仅关注版本发行动态 / 关注但不提醒动态

1 Star 0 Fork 0

limengwei/ocr

代码 Issues 0 Pull Requests 0 Wiki 统计流水线

服务

加入 Gitee

与超过 1400万开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)

免费加入

已有帐号? 立即登录

克隆/下载

HTTPS SSH SVN SVN+SSH 下载ZIP

提示

下载代码请复制以下命令到终端执行

为确保你提交的代码身份被 Gitee 正确识别,请执行以下命令完成配置

git config --global user.name userName 
git config --global user.email userEmail

初次使用 SSH 协议进行代码克隆、推送等操作时,需按下述提示完成 SSH 配置

1 生成 RSA 密钥

2 获取 RSA 公钥内容,并配置到 SSH公钥中

在 Gitee 上使用 SVN,请访问使用指南

使用 HTTPS 协议时,命令行会出现如下账号密码验证步骤。基于安全考虑,Gitee 建议配置并使用私人令牌替代登录密码进行克隆、推送等操作

Username for 'https://gitee.com': userName

Password for 'https://userName@gitee.com': # 私人令牌

新建文件新建 Diagram 文件

新建子模块

上传文件

分支 1

标签 0

贡献代码

同步代码

创建 Pull Request

了解更多

对比差异通过 Pull Request 同步

同步更新到分支

通过 Pull Request 同步

将会向当前分支创建一个 Pull Request,合入后将完成同步

limengwei 1.0 e867e86

1 次提交

images

output

demo.py

license_plate_ocr.py

行驶证OCR敏感信息提取与容错机制说明.md

未知许可证

"""Extract and redact sensitive fields on vehicle-licence photos.

For every image under ``images/`` the script runs PaddleOCR, looks for the
plate number, VIN, engine number, address and owner in the recognised text,
blacks the matching regions out and writes the redacted picture plus a JSON
summary into ``output/``.

The field-extraction logic is pure (it only needs the recognised texts and
their boxes) and lives in :func:`extract_sensitive_info`; everything that
touches the OCR engine, Pillow or the file system is in :func:`main` and its
helpers, which only run when the file is executed as a script.
"""
import glob
import json
import os
import re

# Keys of the sensitive fields, in the order they are reported.
sensitive_keywords = [
    "号牌号码",
    "车辆识别代号",
    "发动机号码",
    "住址",
    "所有人",
]

OUTPUT_DIR = "output"

# Province/region characters that may start a plate number.  Plates use
# "贵" (Guizhou) and "云" (Yunnan); the alternative abbreviations "黔"/"滇"
# do not appear on plates but are kept so previously accepted input still
# passes.
PROVINCE_CODES = frozenset(
    "京津冀晋蒙辽吉黑沪苏浙皖闽赣鲁豫鄂湘粤桂琼渝川贵云黔滇藏陕甘青宁新港澳台"
)

# OCR look-alikes -> intended province character.  Every key is a single
# character because only the first character of a candidate is looked up.
# "贵" is deliberately absent: it is a valid code, not a misread "桂".
# "干" was listed for both 赣 and 甘 in the original table; the later entry
# (甘) is the one that was effective, so that mapping is kept.
PROVINCE_FAULT_TOLERANCE = {
    "相": "湘", "箱": "湘", "香": "湘",
    "州": "川", "卅": "川",
    "去": "云", "厶": "云",
    "峡": "陕", "侠": "陕",
    "鱼": "鲁", "卤": "鲁", "售": "鲁",
    "予": "豫", "像": "豫",
    "曰": "粤", "奥": "粤",
    "挂": "桂",
    "办": "苏", "束": "苏",
    "折": "浙", "哲": "浙",
    "完": "皖", "碗": "皖",
    "门": "闽", "闵": "闽",
    "贡": "赣",
    "e": "鄂", "哦": "鄂",
    "俞": "渝", "于": "渝",
    "普": "晋",
    "翼": "冀", "记": "冀",
    "了": "辽", "疗": "辽",
    "古": "吉", "告": "吉",
    "里": "黑", "墨": "黑",
    "户": "沪", "护": "沪",
    "孟": "蒙", "猛": "蒙",
    "金": "津", "今": "津",
    "惊": "京", "景": "京",
    "穷": "琼", "穹": "琼",
    "柠": "宁",
    "清": "青", "轻": "青",
    "干": "甘", "敢": "甘",
    "辛": "新", "心": "新",
    "葬": "藏",
}

# What follows the province character on a plate: an issuing-office letter,
# an optional separator dot/space, 4-6 letters or digits and an optional
# category suffix.  Used only for candidates found without keyword context.
_PLATE_BODY_RE = re.compile(
    r"[A-Z][\s.\-\u00b7\u2022\u2027\u30fb]?[A-Z0-9]{4,6}[挂学警港澳领使]?"
)

# English label that OCR may emit between the "发动机号码" label and its value.
_ENGINE_LABELS = ("Engine No", "Engine No.", "ENGINE NO")

# Characters whose presence marks a long text as an address.
_ADDRESS_MARKERS = (
    "省", "市", "县", "镇", "乡", "村", "路", "街", "巷", "号", "栋", "单元",
)

# Field labels that must never be reported as the owner.
_OWNER_EXCLUDED_LABELS = (
    "住址", "地址", "Addr", "使用性质", "品牌型号", "车辆类型", "号牌号码",
    "发动机号码", "车辆识别代号", "注册日期", "发证日期",
)

OWNER_KEYWORDS = ("所有人", "所有")

# Substrings that mark a text as an organisation name.
_ORGANISATION_MARKERS = ("公司", "有限公司", "企业", "集团", "工厂", "厂", "单位")

# Common Chinese surnames (all single characters).
COMMON_SURNAMES = (
    "王", "李", "张", "刘", "陈", "杨", "赵", "黄", "周", "吴",
    "徐", "孙", "胡", "朱", "高", "林", "何", "郭", "马", "罗",
    "梁", "宋", "郑", "谢", "韩", "唐", "冯", "于", "董", "萧",
    "程", "曹", "袁", "邓", "许", "傅", "沈", "曾", "彭", "吕",
    "苏", "卢", "蒋", "蔡", "贾", "丁", "魏", "薛", "叶", "阎",
    "余", "潘", "杜", "戴", "夏", "钟", "汪", "田", "任", "姜",
    "范", "方", "石", "姚", "谭", "廖", "邹", "熊", "金", "陆",
    "郝", "孔", "白", "崔", "康", "毛", "邱", "秦", "江", "史",
    "顾", "侯", "邵", "孟", "龙", "万", "段", "漕", "钱", "汤",
    "尹", "黎", "易", "常", "武", "乔", "贺", "赖", "龚", "文",
)

# Pillow can write these modes as JPEG directly; anything else is converted.
_JPEG_MODES = ("RGB", "L", "CMYK")


def _entry(keyword, value, box):
    """Build the record stored for one detected field."""
    return {"keyword": keyword, "value": value, "box": box}


def _normalise_plate(text):
    """Return *text* with a valid province prefix, or ``None``.

    A text whose first character is a known OCR look-alike gets that
    character replaced by the province character it stands for.  *text*
    must be non-empty.
    """
    code = text[0]
    if code in PROVINCE_CODES:
        return text
    corrected = PROVINCE_FAULT_TOLERANCE.get(code)
    if corrected is None:
        return None
    return corrected + text[1:]


def _find_plate(rec_texts, rec_boxes):
    """Locate the plate number and every box in which it appears.

    Rule 1 trusts the text right after the "号牌号码" label; rule 2 scans all
    6-8 character texts that look like a plate.  The first candidate becomes
    the reported value, while ``all_boxes`` lists the boxes of every
    candidate (duplicates included) so that both pages of a licence get
    masked.
    """
    candidates = []

    # Rule 1: the label gives context, so only the prefix is validated.
    for i, text in enumerate(rec_texts[:-1]):
        if text != "号牌号码":
            continue
        following = rec_texts[i + 1]
        if len(following) < 6:
            continue
        plate = _normalise_plate(following)
        if plate is not None:
            candidates.append((plate, rec_boxes[i + 1]))

    # Rule 2: no context, so the remainder must also look like a plate;
    # otherwise place names such as "吉林省长春市" would be reported.
    for i, text in enumerate(rec_texts):
        if not 6 <= len(text) <= 8:
            continue
        plate = _normalise_plate(text)
        if plate is None or not _PLATE_BODY_RE.fullmatch(text[1:].upper()):
            continue
        candidates.append((plate, rec_boxes[i]))

    if not candidates:
        return None
    value, box = candidates[0]
    entry = _entry("号牌号码", value, box)
    entry["all_boxes"] = [candidate_box for _, candidate_box in candidates]
    return entry


def _find_vin(rec_texts, rec_boxes):
    """Return the first 17-character text starting with "L"."""
    for i, text in enumerate(rec_texts):
        if len(text) == 17 and text.startswith("L"):
            return _entry("车辆识别代号", text, rec_boxes[i])
    return None


def _looks_like_engine_number(text):
    """Tell whether *text* has the shape of an engine number.

    Accepted: 6-12 ASCII digits, or 8-12 ASCII letters/digits.  The ASCII
    test matters because ``str.isalnum`` is also true for Chinese text.
    """
    if not (text.isascii() and text.isalnum()):
        return False
    if text.isdigit():
        return 6 <= len(text) <= 12
    return 8 <= len(text) <= 12


def _find_engine(rec_texts, rec_boxes, plate_value):
    """Locate the engine number.

    First the text following the "发动机号码" label is tried (skipping an
    English "Engine No." label if OCR put one in between); failing that, the
    first text shaped like an engine number is used.  A text equal to the
    plate value is never reported.
    """
    for i, text in enumerate(rec_texts[:-1]):
        if text != "发动机号码":
            continue
        candidate = rec_texts[i + 1]
        if plate_value is not None and candidate == plate_value:
            continue
        if candidate in _ENGINE_LABELS:
            # The real value comes after the English label, if there is one.
            if i + 2 < len(rec_texts):
                return _entry("发动机号码", rec_texts[i + 2], rec_boxes[i + 2])
            continue
        return _entry("发动机号码", candidate, rec_boxes[i + 1])

    for i, text in enumerate(rec_texts):
        if _looks_like_engine_number(text) and text != plate_value:
            return _entry("发动机号码", text, rec_boxes[i])
    return None


def _find_address(rec_texts, rec_boxes):
    """Return the first text longer than 8 characters with an address marker."""
    for i, text in enumerate(rec_texts):
        if len(text) > 8 and any(marker in text for marker in _ADDRESS_MARKERS):
            return _entry("住址", text, rec_boxes[i])
    return None


def _is_owner_candidate(text):
    """Tell whether a text next to the owner label can be the owner's name."""
    if text in _OWNER_EXCLUDED_LABELS:
        return False
    # Owners are written in Chinese; this also rejects the empty string.
    if text.isascii() or text.isdigit():
        return False
    symbols = sum(1 for c in text if not c.isalnum() and not c.isspace())
    return symbols <= 2


def _find_owner(rec_texts, rec_boxes):
    """Locate the owner, trying four strategies in order.

    1. A plausible text within two positions after, then before, the first
       text containing "所有人"/"所有".
    2. The first text that contains an organisation marker.
    3. The first 2-4 character text starting with a common surname.
    4. A single common-surname character close to an owner keyword.
    """
    total = len(rec_texts)
    keyword_positions = [
        i for i, text in enumerate(rec_texts)
        if any(keyword in text for keyword in OWNER_KEYWORDS)
    ]

    if keyword_positions:
        anchor = keyword_positions[0]
        after = range(anchor + 1, min(total, anchor + 3))
        # Stop value -1 (not 0) so that index 0 is inspected as well.
        before = range(anchor - 1, max(-1, anchor - 3), -1)
        for i in (*after, *before):
            if _is_owner_candidate(rec_texts[i]):
                return _entry("所有人", rec_texts[i], rec_boxes[i])

    for i, text in enumerate(rec_texts):
        if any(marker in text for marker in _ORGANISATION_MARKERS):
            return _entry("所有人", text, rec_boxes[i])

    for i, text in enumerate(rec_texts):
        if (
            2 <= len(text) <= 4
            and text.startswith(COMMON_SURNAMES)
            and text not in _OWNER_EXCLUDED_LABELS
        ):
            return _entry("所有人", text, rec_boxes[i])

    for i, text in enumerate(rec_texts):
        # Every listed surname is one CJK character, so membership is enough.
        if len(text) != 1 or text not in COMMON_SURNAMES:
            continue
        nearby = rec_texts[max(0, i - 3):i + 3]
        if any(keyword in other for other in nearby for keyword in OWNER_KEYWORDS):
            return _entry("所有人", text, rec_boxes[i])
    return None


def extract_sensitive_info(rec_texts, rec_boxes):
    """Find the sensitive fields in one OCR result.

    Args:
        rec_texts: Recognised text lines, in OCR order.
        rec_boxes: ``[x1, y1, x2, y2]`` box of each text, same order.

    Returns:
        A list with one dict per field that was found, ordered as in
        ``sensitive_keywords``.  Each dict has ``keyword``, ``value`` and
        ``box``; the plate entry additionally has ``all_boxes``.
    """
    plate = _find_plate(rec_texts, rec_boxes)
    plate_value = plate["value"] if plate else None

    found = dict.fromkeys(sensitive_keywords)
    found["号牌号码"] = plate
    found["车辆识别代号"] = _find_vin(rec_texts, rec_boxes)
    found["发动机号码"] = _find_engine(rec_texts, rec_boxes, plate_value)
    found["住址"] = _find_address(rec_texts, rec_boxes)
    found["所有人"] = _find_owner(rec_texts, rec_boxes)
    return [info for info in found.values() if info is not None]


def _draw_box(draw, box):
    """Black out an axis-aligned ``[x1, y1, x2, y2]`` box."""
    x1, y1, x2, y2 = box
    draw.rectangle([(x1, y1), (x2, y2)], fill="black")


def _redact(draw, sensitive_info, rec_texts, dt_polys):
    """Black out every region that shows one of the sensitive values.

    All texts equal to a value are masked through their detection polygon
    (the same value can appear on both pages of a licence).  If no polygon
    matches, the box recorded for the field is masked instead, so a found
    field is never left visible.
    """
    for info in sensitive_info:
        value = info["value"]
        masked = False
        # zip() stops at the shorter list, i.e. texts without a polygon
        # are skipped.
        for text, polygon in zip(rec_texts, dt_polys):
            if text == value:
                draw.polygon([tuple(point) for point in polygon], fill="black")
                masked = True
        if not masked:
            _draw_box(draw, info["box"])
        # Plate numbers may have been corrected, so they are masked by box.
        for box in info.get("all_boxes", ()):
            _draw_box(draw, box)


def _process_image(ocr, image_file):
    """Run OCR on one image, then write its redacted copy and JSON summary."""
    # Imported lazily so the extraction helpers do not require Pillow.
    # ImageFont is unused; it is retained from the original import list.
    from PIL import Image, ImageDraw, ImageFont  # noqa: F401

    print(f"\n正在处理图片:{image_file}")
    image_name = os.path.splitext(os.path.basename(image_file))[0]

    result = ocr.predict(input=image_file)
    for res in result:
        res.print()
        res.save_to_img(OUTPUT_DIR)
        res.save_to_json(OUTPUT_DIR)

    # The recognition result is read back from the JSON file saved above.
    json_file = f"{OUTPUT_DIR}/{image_name}_res.json"
    with open(json_file, "r", encoding="utf-8") as f:
        ocr_result = json.load(f)

    sensitive_info = extract_sensitive_info(
        ocr_result["rec_texts"], ocr_result["rec_boxes"]
    )

    print("\n提取的敏感信息:")
    for info in sensitive_info:
        print(f"{info['keyword']}: {info['value']}")

    output_path = f"{OUTPUT_DIR}/{image_name}_blurred.jpg"
    with Image.open(image_file) as source:
        # PNG input is often RGBA or palette based, which JPEG cannot store.
        image = source if source.mode in _JPEG_MODES else source.convert("RGB")
        _redact(
            ImageDraw.Draw(image),
            sensitive_info,
            ocr_result.get("rec_texts", []),
            ocr_result.get("dt_polys", []),
        )
        image.save(output_path)
    print(f"\n打码后的图片已保存至:{output_path}")

    sensitive_file = f"{OUTPUT_DIR}/{image_name}_sensitive_info.json"
    with open(sensitive_file, "w", encoding="utf-8") as f:
        json.dump(sensitive_info, f, ensure_ascii=False, indent=2)
    print(f"敏感信息已保存至:{sensitive_file}")


def main():
    """Process every ``.jpg``, ``.png`` and ``.jpeg`` file under ``images/``."""
    # Imported lazily so the extraction helpers can be used and tested
    # without the OCR stack installed.
    from paddleocr import PaddleOCR

    # Document-orientation, unwarping and text-line-orientation are all
    # switched off.
    ocr = PaddleOCR(
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
        use_textline_orientation=False,
    )

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    image_files = (
        glob.glob("images/*.jpg")
        + glob.glob("images/*.png")
        + glob.glob("images/*.jpeg")
    )
    for image_file in image_files:
        _process_image(ocr, image_file)

    print("\n所有图片处理完成!")


if __name__ == "__main__":
    main()