前言
前段时间同事说他有个需求是比较两个JSON文件之间的差异点,身为DB大神的同事用SQL实现了这个需求,让只会CRUD的我直呼神乎其技。当时用一个一千多万字符、四十多万行的JSON文件来测试,SQL查出来要9秒。周六有时间,拜读了下同事的SQL,打算用Python和Go实现下试试。
测试的json文件如下,其中dst.json从src.json文件复制而来,随便找了个地方改了下。单文件为409510行,字符数为11473154左右。
- $ wc -ml ./src.json dst.json
- 409510 11473154 ./src.json
- 409510 11473155 dst.json
- 819020 22946309 总计
第三方库jsondiff
先在网上搜了下有没有现成的第三方库,找到一个叫jsondiff的第三方python库。使用pip安装后,用法如下:
- import json
- import jsondiff
- import os
- from typing import Any
def read_json(filepath: str) -> Any:
    """Load and return the JSON document stored at ``filepath``.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON value (dict, list, str, number, bool or None).

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        ValueError: If the file content is not valid JSON; the original
            ``json.JSONDecodeError`` is chained as the cause.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(filepath)
    # Decode explicitly as UTF-8 instead of relying on the locale default,
    # so non-ASCII documents load the same way on every platform.
    with open(filepath, "r", encoding="utf-8") as f:
        try:
            return json.load(f)
        except json.JSONDecodeError as e:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError(f"{filepath} is not a valid json file") from e
-
if __name__ == "__main__":
    # Load both documents, then let jsondiff compute the delta and print it.
    source, target = read_json("src.json"), read_json("dst.json")
    print(jsondiff.diff(source, target))
运行测试:
- $ /usr/bin/time -f 'Elapsed Time: %e s Max RSS: %M kbytes' python third.py
- {'timepicker': {'time_options': {insert: [(7, '7dd')], delete: [7]}}}
- Elapsed Time: 1576.30 s Max RSS: 87732 kbytes
运行时间太长了,接近半小时,肯定不能拿给别人用。
Python-仅用标准库
只试了jsondiff这一个第三方库,接下来打算直接参考同事那个SQL的思路,自己只用标准库实现一个。
- from typing import Any, List
- import json
- import os
- from dataclasses import dataclass
- from collections.abc import MutableSequence, MutableMapping
@dataclass
class DiffResult:
    """One difference found between the left and right JSON documents.

    ``left`` and ``right`` default to ``None`` so that an "added" or
    "removed" entry can be created by supplying only the side that exists.
    """

    # Location of the difference, e.g. "a.b[3]".
    path: str
    # One of "added", "removed" or "modified".
    kind: str
    # Value on the left side; None when the entry exists only on the right.
    left: Any = None
    # Value on the right side; None when the entry exists only on the left.
    right: Any = None
def add_path(parent: str, key: str) -> str:
    """Join a parent path and a key name into one dotted path string.

    An empty ``parent`` denotes the document root, so ``key`` is returned
    unchanged in that case.
    """
    if parent != "":
        return ".".join((parent, key))
    return key
-
def read_json(filepath: str) -> Any:
    """Load and return the JSON document stored at ``filepath``.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON value (dict, list, str, number, bool or None).

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        ValueError: If the file content is not valid JSON; the original
            ``json.JSONDecodeError`` is chained as the cause.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(filepath)
    # Decode explicitly as UTF-8 instead of relying on the locale default,
    # so non-ASCII documents load the same way on every platform.
    with open(filepath, "r", encoding="utf-8") as f:
        try:
            return json.load(f)
        except json.JSONDecodeError as e:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError(f"{filepath} is not a valid json file") from e
-
def collect_diff(path: str, left: Any, right: Any) -> List[DiffResult]:
    """Recursively collect the differences between two decoded JSON values.

    Mappings are compared key by key and sequences index by index; any
    other pair of values (scalars, or values of mismatching container
    types) is compared as a whole.

    Args:
        path: Path of the values being compared ("" for the document root).
        left: Value from the left (source) document.
        right: Value from the right (destination) document.

    Returns:
        The differences found, each tagged "removed" (only on the left),
        "added" (only on the right) or "modified" (present on both sides
        with different values).
    """
    diffs: List[DiffResult] = []
    if isinstance(left, MutableMapping) and isinstance(right, MutableMapping):
        # Walk left's keys in document order, then the keys that exist only
        # on the right. Unlike iterating a set union, this keeps the output
        # order stable from run to run.
        ordered_keys = list(left) + [k for k in right if k not in left]
        for k in ordered_keys:
            key_path = add_path(path, k)
            # Both sides are always passed explicitly so every DiffResult
            # field is supplied.
            if k not in right:
                diffs.append(DiffResult(key_path, "removed", left[k], None))
            elif k not in left:
                diffs.append(DiffResult(key_path, "added", None, right[k]))
            else:
                diffs.extend(collect_diff(key_path, left[k], right[k]))
    elif isinstance(left, MutableSequence) and isinstance(right, MutableSequence):
        # Positional comparison: shared indexes are compared recursively,
        # the surplus tail of the longer list is reported entry by entry.
        shared = min(len(left), len(right))
        for i, (lv, rv) in enumerate(zip(left, right)):
            diffs.extend(collect_diff(f"{path}[{i}]", lv, rv))
        for i in range(shared, len(left)):
            diffs.append(DiffResult(f"{path}[{i}]", "removed", left[i], None))
        for i in range(shared, len(right)):
            diffs.append(DiffResult(f"{path}[{i}]", "added", None, right[i]))
    elif isinstance(left, bool) != isinstance(right, bool) or left != right:
        # Scalars or mismatching types. Python treats True == 1 and
        # False == 0, but JSON true/false are not numbers, so a bool paired
        # with a non-bool is always a modification.
        diffs.append(DiffResult(path, "modified", left, right))
    return diffs
if __name__ == "__main__":
    # Compare the two fixed input files and print one line per difference.
    found = collect_diff("", read_json("src.json"), read_json("dst.json"))
    if found:
        print(f"Found {len(found)} differences:")
        for entry in found:
            if entry.kind == "added":
                print(f"Added: {entry.path}, {entry.right}")
            elif entry.kind == "removed":
                print(f"Removed: {entry.path}, {entry.left}")
            elif entry.kind == "modified":
                print(f"Modified: {entry.path}, {entry.left} -> {entry.right}")
    else:
        print("No differences found.")
运行测试:
- $ /usr/bin/time -f 'Elapsed Time: %e s Max RSS: %M kbytes' python main.py
- Found 1 differences:
- Modified: timepicker.time_options[7], 7d -> 7dd
- Elapsed Time: 0.46 s Max RSS: 87976 kbytes
只要 0.46 秒就能比较出来差异点,单论比较性能来说,比jsondiff要好很多。
Go实现
再换go来实现个命令行工具,同样只需要用标准库即可。
- package main
- import (
- "encoding/json"
- "flag"
- "fmt"
- "io"
- "os"
- )
// src_file and dst_file hold the paths of the two JSON files to compare.
// main binds them to the -src and -dst command-line flags.
var src_file, dst_file string
// DiffResult describes one difference between the left and right documents.
type DiffResult struct {
	// Path is the location of the difference, e.g. "a.b[3]".
	Path string
	// Kind is "added", "removed" or "modified".
	Kind string
	// Left and Right are the values on each side; collectDiff leaves the
	// side on which the entry does not exist as nil.
	Left, Right any
}
// addPath joins a parent path and a key into one dotted path.
// An empty parent denotes the document root, so key is returned unchanged.
func addPath(parent, key string) string {
	if len(parent) > 0 {
		return parent + "." + key
	}
	return key
}
- func collectDiff(path string, left, right any) []DiffResult {
- var diffs []DiffResult
- switch l := left.(type) {
- case map[string]any:
- if r, ok := right.(map[string]any); ok {
- for k, lv := range l {
- rk, exists := r[k]
- if !exists {
- diffs = append(diffs, DiffResult{
- Path: addPath(path, k),
- Kind: "removed",
- Left: lv,
- Right: nil,
- })
- } else {
- diffs = append(diffs, collectDiff(addPath(path, k), lv, rk)...)
- }
- }
- for k, rv := range r {
- if _, exists := l[k]; !exists {
- diffs = append(diffs, DiffResult{
- Path: addPath(path, k),
- Kind: "added",
- Left: nil,
- Right: rv,
- })
- }
- }
- } else {
- diffs = append(diffs, DiffResult{
- Path: path,
- Kind: "modified",
- Left: left,
- Right: right,
- })
- }
- case []any:
- if r, ok := right.([]any); ok {
- // 比较 slice(这里简化:按索引比较)
- maxLen := len(l)
- if len(r) > maxLen {
- maxLen = len(r)
- }
- for i := 0; i < maxLen; i++ {
- var lv, rv any
- var lExists, rExists bool
- if i < len(l) {
- lv = l[i]
- lExists = true
- }
- if i < len(r) {
- rv = r[i]
- rExists = true
- }
- switch {
- case lExists && !rExists:
- diffs = append(diffs, DiffResult{
- Path: fmt.Sprintf("%s[%d]", path, i),
- Kind: "removed",
- Left: lv,
- Right: nil,
- })
- case !lExists && rExists:
- diffs = append(diffs, DiffResult{
- Path: fmt.Sprintf("%s[%d]", path, i),
- Kind: "added",
- Left: nil,
- Right: rv,
- })
- case lExists && rExists:
- diffs = append(diffs, collectDiff(fmt.Sprintf("%s[%d]", path, i), lv, rv)...)
- }
- }
- } else {
- diffs = append(diffs, DiffResult{
- Path: path,
- Kind: "modified",
- Left: left,
- Right: right,
- })
- }
- default:
- if fmt.Sprintf("%v", left) != fmt.Sprintf("%v", right) {
- diffs = append(diffs, DiffResult{
- Path: path,
- Kind: "modified",
- Left: left,
- Right: right,
- })
- }
- }
- return diffs
- }
- func readJSON(r io.Reader) (map[string]any, error) {
- var data map[string]any
- decoder := json.NewDecoder(r)
- if err := decoder.Decode(&data); err != nil {
- return nil, err
- }
- return data, nil
- }
- func main() {
- flag.StringVar(&src_file, "src", "src.json", "source file")
- flag.StringVar(&dst_file, "dst", "dst.json", "destination file")
- flag.Parse()
- srcFile, err := os.Open(src_file)
- if err != nil {
- fmt.Fprintf(os.Stderr, "Error opening src.json: %v\n", err)
- return
- }
- defer srcFile.Close()
- dstFile, err := os.Open(dst_file)
- if err != nil {
- fmt.Fprintf(os.Stderr, "Error opening dst.json: %v\n", err)
- return
- }
- defer dstFile.Close()
- srcJson, err := readJSON(srcFile)
- if err != nil {
- fmt.Fprintf(os.Stderr, "Error reading src.json: %v\n", err)
- return
- }
- dstJson, err := readJSON(dstFile)
- if err != nil {
- fmt.Fprintf(os.Stderr, "Error reading dst.json: %v\n", err)
- return
- }
- diffs := collectDiff("", srcJson, dstJson)
- if len(diffs) == 0 {
- fmt.Println("No differences found.")
- } else {
- fmt.Printf("%d differences found:\n", len(diffs))
- for _, diff := range diffs {
- switch diff.Kind {
- case "added":
- fmt.Printf("Added: %s: %v\n", diff.Path, diff.Right)
- case "removed":
- fmt.Printf("Removed: %s: %v\n", diff.Path, diff.Left)
- case "modified":
- fmt.Printf("Modified: %s: %v -> %v\n", diff.Path, diff.Left, diff.Right)
- }
- }
- }
- }
运行测试,速度同样很快。
- $ /usr/bin/time -f 'Elapsed Time: %e s Max RSS: %Mkbytes' ./diffjson -src ./src.json -dst ./dst.json
- 1 differences found:
- Modified: timepicker.time_options[7]: 7d -> 7dd
- Elapsed Time: 0.29 s Max RSS: 117468 kbytes
复制代码 来源:程序园用户自行投稿发布,如果侵权,请联系站长删除
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作! |