← Back to Blog

Large-Scale Video Deduplication: The Art of Probabilistic Trade-offs

In video search and recommendation systems, duplicate content detection is a core challenge. The same video uploaded by users may have different encodings, resolutions, or sources. Comparing every video pair exactly would be O(n²) — prohibitively expensive at scale. How do industrial systems accept a small, controlled probability of error in exchange for large performance gains? Let's dive into a real video deduplication module.

The Core Problem: Exact vs. Probabilistic

The fundamental challenge in video deduplication:

  1. Massive scale: Billions of videos
  2. Computational complexity: Video comparison requires decoding and analysis
  3. Fuzzy boundaries: What counts as a "duplicate"? Different edits of the same movie?

The solution is a multi-stage funnel:

  1. Fast filtering: Use signatures/hashes to narrow candidates
  2. Fine-grained comparison: Analyze candidates in detail
  3. Statistical verification: Use statistics to confirm matches

Core Design in Industrial Implementation

In an industrial-grade video search system, I found a deduplication module that has been refined over years. Its design choices are remarkably pragmatic:

Design One: Signature Hashing

// Use MurmurHash2A to generate 64-bit hash
TMurmurHash2A<ui64> hasher;
hasher.Update(src.data(), src.size());
ui64 hash = hasher.Value();

// Bit truncation: keep lower bits to preserve more original information
if (BitsCount < 64) {
    int shift = 64 - BitsCount;
    hash <<= shift;
    hash >>= shift;
}

Choice: Use 64-bit hash as video signature

Trade-off considerations:

  • Hash collisions are inevitable — use probabilistic detection instead of exact matching
  • Keep lower bits vs. higher bits — lower bits preserve more original data characteristics
  • Configurable bits — allows trading precision for storage

Design Two: Match Statistics

// Use linear regression standard error to judge similarity
double GetLinearRegressStdErr() const {
    // Calculate linear regression of matching items
    // If standard error is small, it's a real duplicate
}

Choice: Use statistical methods to determine true duplicates

Trade-off considerations:

  • Simple equality isn't enough — need to detect approximate duplicates (same video, different encodings)
  • Statistical methods can filter false positives
  • But increases algorithm complexity

Design Three: Block Filtering

// Organize matching items into blocks by index order
void AddLast(const TMatchItem &item) {
    ++Last;
    // Update statistics
    DistSum += item.DiffDist.Dist;
    IndexSumFirst += item.IndexFirst;
    // ...
}

Choice: Organize matches by index sequence, filter noise

Trade-off considerations:

  • True duplicates tend to appear consecutively
  • Outliers may be false positives

Clean-room Reimplementation: Go Implementation

To demonstrate the design thinking, I reimplemented the core logic in Go:

package main

import (
	"fmt"
	"hash/fnv"
	"math"
	"sort"
)

// VideoSignature is the per-video record used for duplicate detection:
// a 64-bit hash derived from the URL plus a feature vector that is
// compared with cosine similarity.
type VideoSignature struct {
	ID       string    // caller-supplied video identifier
	URL      string    // source URL; hashed via generateHash
	Hash     uint64    // FNV-1a hash of URL (not consulted by FindDuplicates in this demo)
	Features []float64 // feature vector compared by calculateSimilarity
}

// MatchItem records one candidate duplicate pair: the positions of the two
// matched videos in the sorted-ID list, and their distance (1 - cosine
// similarity).
type MatchItem struct {
	IndexFirst  int
	IndexSecond int
	Distance    float64
}

// MatchBlockStat accumulates summary statistics over a contiguous run
// ("block") of match items, identified by the inclusive index range
// [First, Last] into the original match slice.
type MatchBlockStat struct {
	First          int   // index of the block's first item in the match slice
	Last           int   // index of the block's last item (inclusive)
	IndexSumFirst  int64 // running sum of IndexFirst over the block
	IndexSumSecond int64 // running sum of IndexSecond over the block
}

// Init starts a new block at slice position begin with item as its only member.
func (m *MatchBlockStat) Init(begin int, item MatchItem) {
	m.First = begin
	m.Last = begin
	m.IndexSumFirst = int64(item.IndexFirst)
	m.IndexSumSecond = int64(item.IndexSecond)
}

// Add appends the next item (assumed to sit at position Last+1 in the match
// slice) to the block and updates the running sums.
func (m *MatchBlockStat) Add(item MatchItem) {
	m.Last++
	m.IndexSumFirst += int64(item.IndexFirst)
	m.IndexSumSecond += int64(item.IndexSecond)
}

// GetLength returns the number of items in the block, or 0 for an
// uninitialized/invalid range.
func (m *MatchBlockStat) GetLength() int {
	if m.Last >= m.First {
		return m.Last - m.First + 1
	}
	return 0
}

// CalculateLinearRegressionStdErr fits a least-squares line
// IndexSecond ≈ a + b*IndexFirst over the block's items and returns the
// residual standard error sqrt(SSE / (n-2)). A small value means the match
// indices advance together along a line — the signature of a true duplicate.
//
// Fixes over the previous version:
//   - the slope was hardcoded to 0.5; it is now estimated from the data
//     (b = Σ(x-mx)(y-my) / Σ(x-mx)²), as "linear regression" implies;
//   - length == 2 previously divided by length-2 == 0 (NaN/Inf); two points
//     determine the line exactly, so we return 0 (no residual degrees of
//     freedom), as for length < 2.
func (m *MatchBlockStat) CalculateLinearRegressionStdErr(items []MatchItem) float64 {
	length := m.GetLength()
	if length < 3 {
		// <2: nothing to fit; ==2: exact fit, zero residual error.
		return 0.0
	}
	meanFirst := float64(m.IndexSumFirst) / float64(length)
	meanSecond := float64(m.IndexSumSecond) / float64(length)

	// Least-squares slope estimate.
	var sxx, sxy float64
	for i := m.First; i <= m.Last; i++ {
		dx := float64(items[i].IndexFirst) - meanFirst
		dy := float64(items[i].IndexSecond) - meanSecond
		sxx += dx * dx
		sxy += dx * dy
	}
	slope := 0.0
	if sxx > 0 {
		// All-equal x values leave the slope undefined; fall back to a
		// horizontal line through the mean.
		slope = sxy / sxx
	}

	var sumSquares float64
	for i := m.First; i <= m.Last; i++ {
		observed := float64(items[i].IndexSecond)
		predicted := meanSecond + (float64(items[i].IndexFirst)-meanFirst)*slope
		residual := observed - predicted
		sumSquares += residual * residual
	}
	return math.Sqrt(sumSquares / float64(length-2))
}

// VideoDeduplicator stores video signatures keyed by ID and flags any pair
// whose feature-vector cosine similarity is at least threshold.
type VideoDeduplicator struct {
	signatures map[string]VideoSignature
	threshold  float64
}

// NewVideoDeduplicator builds an empty deduplicator that reports a pair as
// a candidate duplicate once its cosine similarity reaches threshold.
func NewVideoDeduplicator(threshold float64) *VideoDeduplicator {
	d := VideoDeduplicator{
		signatures: map[string]VideoSignature{},
		threshold:  threshold,
	}
	return &d
}

// AddSignature registers (or replaces) the signature stored under id,
// hashing url and keeping the feature vector for later comparison.
func (v *VideoDeduplicator) AddSignature(id, url string, features []float64) {
	sig := VideoSignature{
		ID:       id,
		URL:      url,
		Hash:     generateHash(url),
		Features: features,
	}
	v.signatures[id] = sig
}

// generateHash maps a string to its 64-bit FNV-1a digest, used as a compact
// probabilistic signature (collisions are possible but rare).
func generateHash(s string) uint64 {
	hasher := fnv.New64a()
	_, _ = hasher.Write([]byte(s)) // fnv hash writers never return an error
	return hasher.Sum64()
}

// FindDuplicates compares every pair of registered signatures and returns
// those whose cosine similarity meets the threshold. IDs are sorted first so
// the reported indices are deterministic across runs. The O(n²) pairwise scan
// is fine for a demo; at real scale it is replaced by hash-based candidate
// filtering.
func (v *VideoDeduplicator) FindDuplicates() []MatchItem {
	ids := make([]string, 0, len(v.signatures))
	for id := range v.signatures {
		ids = append(ids, id)
	}
	sort.Strings(ids)

	var matches []MatchItem
	for i, firstID := range ids {
		first := v.signatures[firstID]
		for j := i + 1; j < len(ids); j++ {
			second := v.signatures[ids[j]]
			sim := calculateSimilarity(first.Features, second.Features)
			if sim < v.threshold {
				continue
			}
			matches = append(matches, MatchItem{
				IndexFirst:  i,
				IndexSecond: j,
				Distance:    1 - sim,
			})
		}
	}
	return matches
}

// calculateSimilarity returns the cosine similarity of two equal-length
// feature vectors, or 0 when the lengths differ, the vectors are empty, or
// either vector has zero magnitude.
func calculateSimilarity(a, b []float64) float64 {
	if len(a) == 0 || len(a) != len(b) {
		return 0.0
	}
	var dot, sqA, sqB float64
	for i, av := range a {
		bv := b[i]
		dot += av * bv
		sqA += av * av
		sqB += bv * bv
	}
	if sqA == 0 || sqB == 0 {
		return 0.0
	}
	return dot / (math.Sqrt(sqA) * math.Sqrt(sqB))
}

// FilterByLinearRegression groups consecutive matches into blocks and keeps
// only the blocks whose indices fit a straight line well (residual standard
// error <= threshold). True duplicates produce runs of matches whose indices
// advance together; scattered, isolated matches are likely noise.
//
// Matches are assumed to arrive in the order produced by FindDuplicates
// (non-decreasing IndexFirst). A new block starts whenever IndexFirst jumps
// by more than 1 relative to the previous match.
func FilterByLinearRegression(matches []MatchItem, threshold float64) []MatchItem {
	if len(matches) == 0 {
		return matches
	}

	var blocks []MatchBlockStat
	var currentBlock *MatchBlockStat
	prevFirst := 0
	for i, m := range matches {
		switch {
		case currentBlock == nil:
			currentBlock = &MatchBlockStat{}
			currentBlock.Init(i, m)
		case m.IndexFirst-prevFirst <= 1:
			// Bug fix: the previous version compared m.IndexFirst against
			// currentBlock.Last, which is a position in the matches slice,
			// not a video index — mixing the two index spaces. The block
			// continuity test must use the previous match's IndexFirst.
			currentBlock.Add(m)
		default:
			blocks = append(blocks, *currentBlock)
			currentBlock = &MatchBlockStat{}
			currentBlock.Init(i, m)
		}
		prevFirst = m.IndexFirst
	}
	// matches is non-empty here, so a final open block always exists.
	blocks = append(blocks, *currentBlock)

	var filtered []MatchItem
	for _, block := range blocks {
		if block.CalculateLinearRegressionStdErr(matches) <= threshold {
			filtered = append(filtered, matches[block.First:block.Last+1]...)
		}
	}
	return filtered
}

// main demonstrates the two-stage pipeline: pairwise cosine matching
// followed by the linear-regression block filter.
func main() {
	type fixture struct {
		id, url  string
		features []float64
	}
	videos := []fixture{
		{"video_001", "https://example.com/v1", []float64{1.0, 0.9, 0.85, 0.8}},
		{"video_002", "https://example.com/v2", []float64{1.0, 0.92, 0.86, 0.81}},
		{"video_003", "https://example.com/v3", []float64{0.5, 0.4, 0.3, 0.2}},
		{"video_004", "https://example.com/v4", []float64{0.51, 0.41, 0.31, 0.21}},
	}

	dedup := NewVideoDeduplicator(0.85)
	for _, v := range videos {
		dedup.AddSignature(v.id, v.url, v.features)
	}

	matches := dedup.FindDuplicates()
	fmt.Println("=== Video Deduplication Demo ===")
	fmt.Printf("Found %d potential duplicate pairs\n\n", len(matches))
	for _, m := range matches {
		fmt.Printf("Match: video_%03d <-> video_%03d (distance: %.3f)\n", m.IndexFirst+1, m.IndexSecond+1, m.Distance)
	}

	filtered := FilterByLinearRegression(matches, 0.5)
	fmt.Printf("\nAfter statistical filter: %d matches\n", len(filtered))
}

Output:

=== Video Deduplication Demo ===
Found 6 potential duplicate pairs

Match: video_001 <-> video_002 (distance: 0.000)
Match: video_001 <-> video_003 (distance: 0.026)
Match: video_001 <-> video_004 (distance: 0.024)
Match: video_002 <-> video_003 (distance: 0.027)
Match: video_002 <-> video_004 (distance: 0.025)
Match: video_003 <-> video_004 (distance: 0.000)

After statistical filter: 0 matches

Note that all six possible pairs clear the 0.85 cosine threshold — the four demo vectors all point in roughly the same direction — yet the statistical filter rejects everything, because the pooled matches do not line up along a single regression line.

When to Use Probabilistic Detection

Good fit:

  • Massive scale where exact comparison is infeasible
  • Can tolerate some false positives/negatives
  • Has post-processing to verify results

Poor fit:

  • Need 100% accurate results
  • False positive cost is extremely high
  • Data volume is manageable

Summary

Industrial video deduplication systems are full of trade-offs:

  • Signature hashing vs. complete comparison: accept a small probability of error in exchange for performance
  • Statistical methods for true/false duplicate judgment: complexity for accuracy
  • Block filtering to eliminate noise: consecutive matches are more reliable

In Go, we can implement similar ideas with cleaner interfaces, but the core trade-offs remain the same — there's no free lunch; every performance improvement comes with a cost.