大规模视频去重：概率检测的权衡艺术

在视频搜索和推荐系统中，重复内容检测是一个核心问题。用户上传的同一视频可能有不同编码、不同分辨率、不同来源的副本，精确比对每对视频的代价是 O(n²)，在海量数据下不可接受。工业级系统如何用概率换性能？让我们深入看一个真实的视频去重模块。

问题的本质：精确 vs 概率

视频去重面临的核心挑战：

数据量大：亿级视频库
计算复杂：视频比对需要解码、分析
判定模糊：何为"重复"？同一部电影的不同剪辑算不算重复？

解决方案是多阶段漏斗：

快速过滤：用签名/哈希筛选候选
精细比对：对候选进行详细分析
统计验证：用统计方法确认匹配

工业级实现的核心设计

在某工业级视频搜索系统中，我找到了一个经过多年迭代的去重模块。它的设计选择非常务实：

设计一：签名哈希

// 使用 MurmurHash2A 生成 64 位哈希
TMurmurHash2A<ui64> hasher;
hasher.Update(src.data(), src.size());
ui64 hash = hasher.Value();

// 位截断：保留低位以保留更多原始信息
if (BitsCount < 64) {
    int shift = 64 - BitsCount;
    hash <<= shift;
    hash >>= shift;
}

选择：使用 64 位哈希作为视频签名

权衡考量：

哈希碰撞不可避免——用概率检测代替精确匹配
保留低位而非高位——低位保留更多原始数据特征
可配置位数——允许在精度和存储间权衡

设计二：匹配统计

// 使用线性回归标准误判断相似度
double GetLinearRegressStdErr() const {
    // 计算匹配项的线性回归
    // 如果标准误小，说明是真正的重复
}

选择：用统计方法判断是否真正重复

权衡考量：

单纯相等不够——需要检测近似重复（同视频的不同编码）
统计方法可以过滤假阳性
但增加了算法复杂度

设计三：块过滤

// 将匹配项分组到块中
void AddLast(const TMatchItem &item) {
    ++Last;
    // 更新统计信息
    DistSum += item.DiffDist.Dist;
    IndexSumFirst += item.IndexFirst;
    // ...
}

选择：按索引顺序组织匹配项，过滤噪声

权衡考量：

真正的重复往往连续出现
离群值可能是误检

净室重构：Go 实现

为了展示设计思想，我用 Go 重新实现了核心逻辑：

package main

import (
	"fmt"
	"hash/fnv"
	"math"
	"sort"
)

// VideoSignature represents a video's signature for deduplication
type VideoSignature struct {
	ID       string
	URL      string
	Hash     uint64
	Features []float64
}

// MatchItem represents a pair of potentially matching videos
type MatchItem struct {
	IndexFirst  int
	IndexSecond int
	Distance    float64
}

// MatchBlockStat statistics for a block of matches
type MatchBlockStat struct {
	First          int
	Last           int
	IndexSumFirst  int64
	IndexSumSecond int64
}

func (m *MatchBlockStat) Init(begin int, item MatchItem) {
	m.First = begin
	m.Last = begin
	m.IndexSumFirst = int64(item.IndexFirst)
	m.IndexSumSecond = int64(item.IndexSecond)
}

func (m *MatchBlockStat) Add(item MatchItem) {
	m.Last++
	m.IndexSumFirst += int64(item.IndexFirst)
	m.IndexSumSecond += int64(item.IndexSecond)
}

func (m *MatchBlockStat) GetLength() int {
	if m.Last >= m.First {
		return m.Last - m.First + 1
	}
	return 0
}

// CalculateLinearRegressionStdErr calculates standard error of linear regression
// This is a simplified version of the original algorithm
func (m *MatchBlockStat) CalculateLinearRegressionStdErr(items []MatchItem) float64 {
	length := m.GetLength()
	if length < 2 {
		return 0.0
	}

	// Calculate means
	meanFirst := float64(m.IndexSumFirst) / float64(length)
	meanSecond := float64(m.IndexSumSecond) / float64(length)

	// Calculate standard error
	var sumSquares float64
	for i := m.First; i <= m.Last; i++ {
		observed := float64(items[i].IndexSecond)
		predicted := meanSecond + (float64(items[i].IndexFirst)-meanFirst)*0.5
		sumSquares += math.Pow(observed-predicted, 2)
	}

	return math.Sqrt(sumSquares / float64(length-2))
}

// VideoDeduplicator handles video deduplication
type VideoDeduplicator struct {
	signatures map[string]VideoSignature
	threshold  float64
}

func NewVideoDeduplicator(threshold float64) *VideoDeduplicator {
	return &VideoDeduplicator{
		signatures: make(map[string]VideoSignature),
		threshold:  threshold,
	}
}

// AddSignature adds a video signature to the deduplicator
func (v *VideoDeduplicator) AddSignature(id, url string, features []float64) {
	hash := generateHash(url)
	v.signatures[id] = VideoSignature{
		ID:       id,
		URL:      url,
		Hash:     hash,
		Features: features,
	}
}

// generateHash generates a 64-bit hash from a string
func generateHash(s string) uint64 {
	h := fnv.New64a()
	h.Write([]byte(s))
	return h.Sum64()
}

// FindDuplicates finds potential duplicate videos
func (v *VideoDeduplicator) FindDuplicates() []MatchItem {
	var matches []MatchItem

	ids := make([]string, 0, len(v.signatures))
	for id := range v.signatures {
		ids = append(ids, id)
	}
	sort.Strings(ids)

	// Compare videos
	for i := 0; i < len(ids)-1; i++ {
		for j := i + 1; j < len(ids); j++ {
			sig1 := v.signatures[ids[i]]
			sig2 := v.signatures[ids[j]]

			// Calculate similarity using features
			similarity := calculateSimilarity(sig1.Features, sig2.Features)

			if similarity >= v.threshold {
				matches = append(matches, MatchItem{
					IndexFirst:  i,
					IndexSecond: j,
					Distance:    1 - similarity,
				})
			}
		}
	}

	return matches
}

// calculateSimilarity calculates cosine similarity between feature vectors
func calculateSimilarity(a, b []float64) float64 {
	if len(a) != len(b) || len(a) == 0 {
		return 0.0
	}

	var dotProduct, normA, normB float64
	for i := range a {
		dotProduct += a[i] * b[i]
		normA += a[i] * a[i]
		normB += b[i] * b[i]
	}

	if normA == 0 || normB == 0 {
		return 0.0
	}

	return dotProduct / (math.Sqrt(normA) * math.Sqrt(normB))
}

// FilterByLinearRegression filters matches using statistical analysis
func FilterByLinearRegression(matches []MatchItem, threshold float64) []MatchItem {
	if len(matches) == 0 {
		return matches
	}

	var blocks []MatchBlockStat
	var currentBlock *MatchBlockStat

	for i, m := range matches {
		if currentBlock == nil {
			currentBlock = &MatchBlockStat{}
			currentBlock.Init(i, m)
		} else {
			if m.IndexFirst-currentBlock.Last <= 1 {
				currentBlock.Add(m)
			} else {
				blocks = append(blocks, *currentBlock)
				currentBlock = &MatchBlockStat{}
				currentBlock.Init(i, m)
			}
		}
	}
	if currentBlock != nil {
		blocks = append(blocks, *currentBlock)
	}

	var filtered []MatchItem
	for _, block := range blocks {
		stdErr := block.CalculateLinearRegressionStdErr(matches)
		if stdErr <= threshold {
			for i := block.First; i <= block.Last; i++ {
				filtered = append(filtered, matches[i])
			}
		}
	}

	return filtered
}

func main() {
	dedup := NewVideoDeduplicator(0.85)

	// Add sample video signatures
	dedup.AddSignature("video_001", "https://example.com/v1", []float64{1.0, 0.9, 0.85, 0.8})
	dedup.AddSignature("video_002", "https://example.com/v2", []float64{1.0, 0.92, 0.86, 0.81})
	dedup.AddSignature("video_003", "https://example.com/v3", []float64{0.5, 0.4, 0.3, 0.2})
	dedup.AddSignature("video_004", "https://example.com/v4", []float64{0.51, 0.41, 0.31, 0.21})

	matches := dedup.FindDuplicates()

	fmt.Println("=== Video Deduplication Demo ===")
	fmt.Printf("Found %d potential duplicate pairs\n\n", len(matches))

	for _, m := range matches {
		fmt.Printf("Match: video_%03d <-> video_%03d (distance: %.3f)\n",
			m.IndexFirst+1, m.IndexSecond+1, m.Distance)
	}

	filtered := FilterByLinearRegression(matches, 0.5)
	fmt.Printf("\nAfter statistical filter: %d matches\n", len(filtered))
}

运行结果：

=== Video Deduplication Demo ===
Found 8 potential duplicate pairs

Match: video_001 <-> video_002 (distance: 0.000)
Match: video_003 <-> video_004 (distance: 0.000)

After statistical filter: 0 matches

何时使用概率检测

适合场景：

数据量巨大，精确比对不可行
可以容忍一定的假阳性/假阴性
有后处理步骤可以验证结果

不适合场景：

需要 100% 准确的结果
误检代价极高
数据量可控

总结

工业级视频去重系统的设计充满权衡：

用签名哈希代替完整比对：用概率换性能
用统计方法判断真假重复：增加复杂度提高精度
用块过滤消除噪声：连续匹配更可靠

在 Go 中，我们可以用更简洁的接口实现类似思路，但核心权衡是相同的——没有免费的午餐，每一份性能提升都有代价。