The difflib module provides classes and functions for comparing sequences. It can be used, for example, for comparing files, and can produce difference information in various formats, including HTML and context and unified diffs. For comparing directories and files, see also the filecmp module.
class difflib.SequenceMatcher: the basic idea is to find the longest contiguous matching subsequence that contains no “junk” elements (the Ratcliff and Obershelp algorithm doesn’t address junk). Its most basic functions are:
The get_opcodes function builds on the functions above to do its analysis.
Create a SequenceMatcher whose inputs are two strings or two lists:
import difflib a = ' abcd' b = 'abcd abcd' seq = difflib.SequenceMatcher(None, a, b) rate = seq.ratio() * 100 print rate
⇒ output:
71.4285714286
find_longest_match(alo, ahi, blo, bhi)
Find the longest matching block in a[alo:ahi] and b[blo:bhi] (lo: low, hi: high). Returns (i, j, k) such that a[i:i+k] is equal to b[j:j+k], where k is the length of the longest such match.
import difflib a = ' abcd' b = 'abcd abcd' seq = difflib.SequenceMatcher(None, a, b) rate = seq.ratio() * 100 print rate print seq.find_longest_match(0, 5, 0, 9) a = 'm abcd' seq.set_seq1(a) print seq.find_longest_match(0, 6, 0, 9)
⇒ output:
71.4285714286 Match(a=0, b=4, size=5) Match(a=1, b=4, size=5)
import difflib a = ' abcd' b = 'abcd abcd' seq = difflib.SequenceMatcher(None, a, b) seq2 = difflib.SequenceMatcher(lambda x: x==" ", a, b) seq3 = difflib.SequenceMatcher(difflib.IS_LINE_JUNK, a, b) print seq.find_longest_match(0, 5, 0, 9) print seq2.find_longest_match(0, 5, 0, 9) print seq3.find_longest_match(0, 5, 0, 9)
output:
Match(a=0, b=4, size=5) Match(a=1, b=0, size=4) Match(a=1, b=0, size=4)
import difflib a = ' abcd' b = 'abcd abcd' seq = difflib.SequenceMatcher(None, a, b) rate = seq.ratio() * 100 print 'matching1:' for block in seq.get_matching_blocks(): print "a[%d] and b[%d] match for %d elements" % block a = 'abced abc' seq.set_seq1(a) print 'matching2:' for block in seq.get_matching_blocks(): print "a[%d] and b[%d] match for %d elements" % block
⇒ output:
matching1: a[0] and b[4] match for 5 elements a[5] and b[9] match for 0 elements matching2: a[0] and b[0] match for 3 elements a[4] and b[3] match for 5 elements a[9] and b[9] match for 0 elements
a[0] and b[4] match for 5 elements: the 5 elements starting at a[0] are ' abcd' and the 5 elements starting at b[4] are ' abcd'
# Line-level comparison with get_opcodes().
# a and b are multi-line texts split with splitlines(1), which keeps the
# line endings, so SequenceMatcher compares whole lines here.
# For every (tag, alo, ahi, blo, bhi) opcode the script prints the tag and
# indices, then the lines a[alo:ahi] under '--from:' and the lines
# b[blo:bhi] under '--to:'. Finally it prints the ndiff delta of the same
# two line lists.
# NOTE(review): the line breaks inside the triple-quoted literals appear to
# have been lost when this snippet was flattened onto one line; the printed
# values of a and b in the output below show the intended lines.
import difflib import sys a = """ abcd abc pq ef abc mn """.splitlines(1) b = """abcd abcd ef mn """.splitlines(1) print 'a = ', a print 'b = ', b seq = difflib.SequenceMatcher(None, a, b) print '*******************************' for tag, alo, ahi, blo, bhi in seq.get_opcodes(): print '- ', tag, alo, ahi, blo, bhi, ':' print '--from:' for i in range(alo, ahi): sys.stdout.writelines(a[i]) print '--to:' for i in range(blo, bhi): sys.stdout.writelines(b[i]) result = list(difflib.ndiff(a, b)) print '*******************************' print 'normal diff:' sys.stdout.writelines(result)
output:
a = [' abcd\n', 'abc pq\n', 'ef abc\n', 'mn\n'] b = ['abcd abcd\n', 'ef\n', 'mn\n'] ******************************* - replace 0 3 0 2 : --from: abcd abc pq ef abc --to: abcd abcd ef - equal 3 4 2 3 : --from: mn --to: mn ******************************* normal diff: - abcd + abcd abcd ? ++++ + ef - abc pq - ef abc mn
# Compares the same two texts twice: first as plain strings (seq, element =
# one character), then as lists of lines after splitlines(1) (seq2,
# element = one line). For each matcher it prints a ratio, the longest
# match and every matching block together with the matched slices of a
# and b, and finally the Differ delta of the two line lists.
# NOTE(review): 'rate2' is computed with seq.ratio(), i.e. from the
# character-level matcher, not from seq2 -- which is why the output shows
# the same value for rate1 and rate2. Use seq2.ratio() to get the
# line-level ratio.
# NOTE(review): the line breaks inside the triple-quoted literals appear to
# have been lost when this snippet was flattened onto one line.
import difflib import sys a = """ abcd abc pq ef abc mn """ b = """abcd abcd ef mn """ seq = difflib.SequenceMatcher(None, a, b) rate = seq.ratio() * 100 print '*************************' print 'rate1: ',rate print 'longest_match1: ', seq.find_longest_match(0, 20, 0, 9) print 'matching blocks1:' for block in seq.get_matching_blocks(): print "a[%d] and b[%d] match for %d elements" % block print '>>>>', a[block[0]:(block[0] + block[2])] print '<<<<', b[block[1]:(block[1] + block[2])] a = a.splitlines(1) b = b.splitlines(1) seq2 = difflib.SequenceMatcher(None, a, b) rate = seq.ratio() * 100 print '*************************' print 'rate2: ',rate print 'longest_match2: ', seq2.find_longest_match(0, 4, 0, 3) print 'matching blocks2:' for block in seq2.get_matching_blocks(): print "a[%d] and b[%d] match for %d elements" % block print '>>>>', a[block[0]:(block[0] + block[2])] print '<<<<', b[block[1]:(block[1] + block[2])] d = difflib.Differ() result = list(d.compare(a, b)) print 'normal diff:' sys.stdout.writelines(result)
output:
************************* rate1: 60.0 longest_match1: Match(a=0, b=4, size=5) matching blocks1: a[0] and b[4] match for 5 elements >>>> abcd <<<< abcd a[13] and b[9] match for 3 elements >>>> ef <<<< ef a[20] and b[12] match for 4 elements >>>> mn <<<< mn a[24] and b[16] match for 0 elements >>>> <<<< ************************* rate2: 60.0 longest_match2: Match(a=3, b=2, size=1) matching blocks2: a[3] and b[2] match for 1 elements >>>> ['mn\n'] <<<< ['mn\n'] a[4] and b[3] match for 0 elements >>>> [] <<<< [] normal diff: + abcd abcd + ef - abcd - abc pq - ef abc mn
import difflib from os import path INPUT_DIR = 'opencart_47066' htmlfile1 = path.join(INPUT_DIR, 'index.html') htmlfile2 = path.join(INPUT_DIR, 'index.php@route=account%2Flogin.html') with open(htmlfile1, 'r') as f: doc1 = f.read() with open(htmlfile2, 'r') as f: doc2 = f.read() seq = difflib.SequenceMatcher(None, doc1, doc2) rate = seq.ratio() * 100 print rate for block in seq.get_matching_blocks(): print "a[%d] and b[%d] match for %d elements" % block print '>>>>', doc1[block[0]:(block[0] + block[2])] print '<<<<', doc2[block[1]:(block[1] + block[2])]
The Differ class uses the SequenceMatcher API to do its comparisons:
Some basic functions to understand:
# Create a Differ with its default settings (no junk filters passed).
differ = difflib.Differ()

# compare() takes the two sequences of lines and produces the delta.
result = differ.compare(a, b)
def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
    """Return the Differ delta between the line sequences a and b.

    linejunk and charjunk are passed unchanged to the Differ constructor;
    the result of that Differ's compare(a, b) is returned.
    """
    differ = Differ(linejunk, charjunk)
    return differ.compare(a, b)
# Runs the same line-by-line comparison two ways and prints both deltas:
# first with a Differ() built without junk filters ('normal diff:'), then
# with difflib.ndiff(a, b) called with its default arguments.
# path and pprint are imported but not used in this snippet.
# NOTE(review): the line breaks inside the triple-quoted literals appear to
# have been lost when this snippet was flattened onto one line.
import difflib from os import path from pprint import pprint import sys a = """ abcd abc pq ef abc mpq """.splitlines(1) b = """abcd abcd abc pq ef mpq """.splitlines(1) d = difflib.Differ() result = list(d.compare(a, b)) print 'normal diff:' sys.stdout.writelines(result) print 'diff with charjunk = difflib.IS_CHARACTER_JUNK:' result = difflib.ndiff(a, b) sys.stdout.writelines(result)
output:
- abcd + abcd abcd abc pq - ef abc + ef
import difflib
from os import path
from pprint import pprint
import sys

INPUT_DIR = 'opencart_47066'
htmlfile1 = path.join(INPUT_DIR, 'index.html')
htmlfile2 = path.join(INPUT_DIR, 'index.php@route=account%2Flogin.html')

# Read each page as a list of lines; splitlines(1) keeps the line endings.
with open(htmlfile1, 'r') as f:
    doc1 = f.read().splitlines(1)
with open(htmlfile2, 'r') as f:
    doc2 = f.read().splitlines(1)

# Line-by-line comparison of the two pages.
d = difflib.Differ()
result = d.compare(doc1, doc2)

# writelines() accepts any iterable of strings, so pass it the whole delta
# in one call. Calling it once per delta line, as before, made it iterate
# over the individual characters of each line.
with open('compare.html', 'wb') as f:
    f.writelines(result)
import difflib
from os import path
from pprint import pprint
import sys

INPUT_DIR = 'opencart_47066'
htmlfile1 = path.join(INPUT_DIR, 'index.html')
htmlfile2 = path.join(INPUT_DIR, 'index.php@route=account%2Flogin.html')

# Read each page as a list of lines; splitlines(1) keeps the line endings.
with open(htmlfile1, 'r') as f:
    doc1 = f.read().splitlines(1)
with open(htmlfile2, 'r') as f:
    doc2 = f.read().splitlines(1)

# Same comparison as with a Differ object, but through the ndiff()
# convenience function and its default junk filters.
result = difflib.ndiff(doc1, doc2)

# writelines() accepts any iterable of strings, so pass it the whole delta
# in one call. Calling it once per delta line, as before, made it iterate
# over the individual characters of each line.
with open('compare.html', 'wb') as f:
    f.writelines(result)
import difflib
from os import path
from pprint import pprint
import sys
import re


def load_normalized_lines(filename):
    """Read an HTML file and return its lines after whitespace clean-up.

    The clean-up removes whitespace before '/>' and '>', puts a single
    newline between adjacent tags, and strips whitespace around every
    newline. Lines are returned with their line endings (splitlines(1)).
    """
    with open(filename, 'r') as f:
        content = f.read()
    # Raw strings keep the regex escapes intact. \s already covers tabs,
    # so the former [\s\t] character class is written as plain \s.
    content = re.sub(r'\s+/>', '/>', content)
    content = re.sub(r'\s+>', '>', content)
    content = re.sub(r'>\s+<', '>\n<', content)
    content = re.sub(r'\s*\n\s*', '\n', content)
    return content.splitlines(1)


INPUT_DIR = 'opencart_47066'
htmlfile1 = path.join(INPUT_DIR, 'index.html')
htmlfile2 = path.join(INPUT_DIR, 'index.php@route=account%2Flogin.html')

# Both pages go through the same normalisation before being compared, so
# differences in tag spacing and indentation do not show up in the delta.
doc1 = load_normalized_lines(htmlfile1)
doc2 = load_normalized_lines(htmlfile2)

result = difflib.ndiff(doc1, doc2)

# writelines() accepts any iterable of strings, so pass it the whole delta
# in one call rather than once per line.
with open('compare.html', 'wb') as f:
    f.writelines(result)
lxml.html.diff uses two basic libraries:
Examples for lxml.html.diff:
# Diffs two HTML fragments with lxml.html.diff.htmldiff, encodes the result
# as UTF-8 and prints it.
# doc2 differs from doc1 in three places: the addToWishList argument is
# '30' instead of '35', the wish-list <span> text has the extra word
# 'change', and the <b>simple</b> element is gone.
# path, sys, re, etree, HTMLParser and StringIO are imported but not used
# in this snippet.
# NOTE(review): the original line breaks and indentation inside the
# triple-quoted HTML literals appear to have been lost when this snippet
# was flattened onto one line.
from os import path import sys, re from lxml.html import diff, etree, HTMLParser import codecs import StringIO doc1 = '''<div class="cart-button"> <div class="cart"> <a title="Add to cart" data-id="35;" class="button addToCart-1 "> <span>Add to cart</span> </a> </div> <div class="wishlist"> <a class="tooltip-1" title="Add to Wish List" onclick="addToWishList('35');"> <i class="icon-star"></i> <span>Add to Wish List</span> </a> <b>simple</b> </div> </div>''' doc2 = '''<div class="cart-button"> <div class="cart"> <a title="Add to cart" data-id="35;" class="button addToCart-1 "> <span>Add to cart</span> </a> </div> <div class="wishlist"> <a class="tooltip-1" title="Add to Wish List" onclick="addToWishList('30');"> <i class="icon-star"></i> <span>Add to Wish List change</span> </a> </div> </div>''' diffcontent = diff.htmldiff(doc1, doc2) diffcontent = codecs.encode(diffcontent, 'utf-8') print diffcontent
output:
<div class="cart-button"><div class="cart"><a title="Add to cart" data-id="35;" class="button addToCart-1 "><span>Add to cart</span> </a> </div> <div class="wishlist"><a class="tooltip-1" title="Add to Wish List" onclick="addToWishList('30');"><i class="icon-star"></i> <span>Add to Wish List <ins>change</ins> </span> </a> <del><b>simple</b></del> </div> </div>
from os import path import sys, re from lxml.html import diff import codecs INPUT_DIR = 'opencart_47066' htmlfile1 = path.join(INPUT_DIR, 'index.html') htmlfile2 = path.join(INPUT_DIR, 'index.php@route=account%2Flogin.html') with open(htmlfile1, 'r') as f: content = f.read() doc1 = content with open(htmlfile2, 'r') as f: content = f.read() doc2 = content diffcontent = diff.htmldiff(doc1, doc2) diffcontent = codecs.encode(diffcontent, 'utf-8') print diffcontent