python:compare
Differences
This shows you the differences between two versions of the page.
| Both sides previous revision / Previous revision / Next revision | Previous revision | ||
| python:compare [2014/07/21 02:53] – [diff 2 files] admin | python:compare [2022/10/29 16:15] (current) – external edit 127.0.0.1 | ||
|---|---|---|---|
| Line 7: | Line 7: | ||
| * **find_longest_match** | * **find_longest_match** | ||
| * **get_matching_blocks** | * **get_matching_blocks** | ||
| - | The function **get_opcodes** using above these functions for parsing | + | The function **get_opcodes** uses the functions above for parsing\\ |
| + | Create a SequenceMatcher whose inputs are two **strings or two lists** | ||
| === match ratio === | === match ratio === | ||
| * Calculate match ratio of two strings:< | * Calculate match ratio of two strings:< | ||
| Line 27: | Line 28: | ||
| find_longest_match(alo, | find_longest_match(alo, | ||
| </ | </ | ||
| - | * Example <code python> | + | * Simple |
| import difflib | import difflib | ||
| Line 45: | Line 46: | ||
| Match(a=0, b=4, size=5) | Match(a=0, b=4, size=5) | ||
| Match(a=1, b=4, size=5) | Match(a=1, b=4, size=5) | ||
| + | </ | ||
| + | * Example find_longest_match with isjunk option:< | ||
| + | import difflib | ||
| + | |||
| + | a = ' abcd' | ||
| + | b = 'abcd abcd' | ||
| + | |||
| + | seq = difflib.SequenceMatcher(None, | ||
| + | seq2 = difflib.SequenceMatcher(lambda x: x==" ", a, b) | ||
| + | seq3 = difflib.SequenceMatcher(difflib.IS_LINE_JUNK, | ||
| + | |||
| + | print seq.find_longest_match(0, | ||
| + | print seq2.find_longest_match(0, | ||
| + | print seq3.find_longest_match(0, | ||
| + | </ | ||
| + | Match(a=0, b=4, size=5) | ||
| + | Match(a=1, b=0, size=4) | ||
| + | Match(a=1, b=0, size=4) | ||
| </ | </ | ||
| === Get matching blocks === | === Get matching blocks === | ||
| Line 73: | Line 92: | ||
| a[9] and b[9] match for 0 elements | a[9] and b[9] match for 0 elements | ||
| </ | </ | ||
| - | === Math string with multilines | + | === get_opcodes |
| <code python> | <code python> | ||
| import difflib | import difflib | ||
| + | import sys | ||
| + | |||
| + | a = """ | ||
| + | abc pq | ||
| + | ef abc | ||
| + | mn | ||
| + | """ | ||
| + | b = """ | ||
| + | ef | ||
| + | mn | ||
| + | """ | ||
| + | print 'a = ', a | ||
| + | print 'b = ', b | ||
| + | seq = difflib.SequenceMatcher(None, | ||
| + | print ' | ||
| + | for tag, alo, ahi, blo, bhi in seq.get_opcodes(): | ||
| + | print '- ', tag, alo, ahi, blo, bhi, ':' | ||
| + | print ' | ||
| + | for i in range(alo, ahi): | ||
| + | sys.stdout.writelines(a[i]) | ||
| + | print ' | ||
| + | for i in range(blo, bhi): | ||
| + | sys.stdout.writelines(b[i]) | ||
| + | result = list(difflib.ndiff(a, | ||
| + | print ' | ||
| + | print ' | ||
| + | sys.stdout.writelines(result) | ||
| + | </ | ||
| + | a = [' abcd\n', | ||
| + | b = ['abcd abcd\n', | ||
| + | ******************************* | ||
| + | - replace 0 3 0 2 : | ||
| + | --from: | ||
| + | abcd | ||
| + | abc pq | ||
| + | ef abc | ||
| + | --to: | ||
| + | abcd abcd | ||
| + | ef | ||
| + | - equal 3 4 2 3 : | ||
| + | --from: | ||
| + | mn | ||
| + | --to: | ||
| + | mn | ||
| + | ******************************* | ||
| + | normal diff: | ||
| + | - abcd | ||
| + | + abcd abcd | ||
| + | ? ++++ | ||
| + | + ef | ||
| + | - abc pq | ||
| + | - ef abc | ||
| + | mn | ||
| + | </ | ||
| + | === Match string with multilines === | ||
| + | <code python> | ||
| + | import difflib | ||
| + | import sys | ||
| a = """ | a = """ | ||
| abc pq | abc pq | ||
| ef abc | ef abc | ||
| + | mn | ||
| """ | """ | ||
| b = """ | b = """ | ||
| ef | ef | ||
| + | mn | ||
| """ | """ | ||
| seq = difflib.SequenceMatcher(None, | seq = difflib.SequenceMatcher(None, | ||
| rate = seq.ratio() * 100 | rate = seq.ratio() * 100 | ||
| - | print 'rate: ', | + | print '*************************' |
| - | print 'longest_match: ', seq.find_longest_match(0, | + | print 'rate1: ', |
| - | print ' | + | print 'longest_match1: ', seq.find_longest_match(0, |
| + | print ' | ||
| for block in seq.get_matching_blocks(): | for block in seq.get_matching_blocks(): | ||
| print "a[%d] and b[%d] match for %d elements" | print "a[%d] and b[%d] match for %d elements" | ||
| print '>>>>', | print '>>>>', | ||
| - | print '<<<<', | + | print '<<<<', |
| - | rate: | + | |
| - | longest_match: Match(a=0, b=4, size=5) | + | a = a.splitlines(1) |
| - | matching | + | b = b.splitlines(1) |
| + | seq2 = difflib.SequenceMatcher(None, | ||
| + | rate = seq.ratio() * 100 | ||
| + | print ' | ||
| + | print ' | ||
| + | print ' | ||
| + | print ' | ||
| + | for block in seq2.get_matching_blocks(): | ||
| + | print "a[%d] and b[%d] match for %d elements" | ||
| + | print '>>>>', | ||
| + | print '<<<<', | ||
| + | |||
| + | |||
| + | d = difflib.Differ() | ||
| + | result = list(d.compare(a, | ||
| + | print ' | ||
| + | sys.stdout.writelines(result)</ | ||
| + | ************************* | ||
| + | rate1: | ||
| + | longest_match1: Match(a=0, b=4, size=5) | ||
| + | matching | ||
| a[0] and b[4] match for 5 elements | a[0] and b[4] match for 5 elements | ||
| >>>> | >>>> | ||
| Line 105: | Line 205: | ||
| <<<< | <<<< | ||
| ef | ef | ||
| - | a[20] and b[12] match for 1 elements | + | a[20] and b[12] match for 4 elements |
| >>>> | >>>> | ||
| + | mn | ||
| <<<< | <<<< | ||
| + | mn | ||
| - | a[21] and b[13] match for 0 elements | + | a[24] and b[16] match for 0 elements |
| >>>> | >>>> | ||
| <<<< | <<<< | ||
| + | ************************* | ||
| + | rate2: | ||
| + | longest_match2: | ||
| + | matching blocks2: | ||
| + | a[3] and b[2] match for 1 elements | ||
| + | >>>> | ||
| + | <<<< | ||
| + | a[4] and b[3] match for 0 elements | ||
| + | >>>> | ||
| + | <<<< | ||
| + | normal diff: | ||
| + | + abcd abcd | ||
| + | + ef | ||
| + | - abcd | ||
| + | - abc pq | ||
| + | - ef abc | ||
| + | mn | ||
| </ | </ | ||
| === SequenceMatcher with files === | === SequenceMatcher with files === | ||
| Line 153: | Line 272: | ||
| from pprint import pprint | from pprint import pprint | ||
| import sys | import sys | ||
| + | |||
| a = """ | a = """ | ||
| abc pq | abc pq | ||
| ef abc | ef abc | ||
| + | mpq | ||
| """ | """ | ||
| b = """ | b = """ | ||
| abc pq | abc pq | ||
| ef | ef | ||
| + | mpq | ||
| """ | """ | ||
| - | + | ||
| d = difflib.Differ() | d = difflib.Differ() | ||
| result = list(d.compare(a, | result = list(d.compare(a, | ||
| + | print ' | ||
| sys.stdout.writelines(result) | sys.stdout.writelines(result) | ||
| - | </ | + | |
| + | print 'diff with charjunk = difflib.IS_CHARACTER_JUNK:' | ||
| + | result = difflib.ndiff(a, | ||
| + | sys.stdout.writelines(result)</ | ||
| - abcd | - abcd | ||
| + abcd abcd | + abcd abcd | ||
| Line 211: | Line 336: | ||
| for line in result: | for line in result: | ||
| f.writelines(line) | f.writelines(line) | ||
| + | </ | ||
| + | * Compare 2 html files:< | ||
| + | import difflib | ||
| + | from os import path | ||
| + | from pprint import pprint | ||
| + | import sys, re | ||
| + | |||
| + | INPUT_DIR = ' | ||
| + | htmlfile1 = path.join(INPUT_DIR, | ||
| + | htmlfile2 = path.join(INPUT_DIR, | ||
| + | with open(htmlfile1, | ||
| + | content = f.read() | ||
| + | content = re.sub(' | ||
| + | content = re.sub(' | ||
| + | content = re.sub('> | ||
| + | content = re.sub(' | ||
| + | doc1 = content.splitlines(1) | ||
| + | with open(htmlfile2, | ||
| + | content = f.read() | ||
| + | content = re.sub(' | ||
| + | content = re.sub(' | ||
| + | content = re.sub('> | ||
| + | content = re.sub(' | ||
| + | doc2 = content.splitlines(1) | ||
| + | |||
| + | result = difflib.ndiff(doc1, | ||
| + | with open(' | ||
| + | for line in result: | ||
| + | f.writelines(line) | ||
| + | </ | ||
| + | ==== lxml.html.diff for comparing HTML files ==== | ||
| + | lxml.html.diff uses 2 basic libraries: | ||
| + | * difflib for comparing 2 files | ||
| + | * etree for parsing HTML | ||
| + | Examples for lxml.html.diff: | ||
| + | * Simple diff:< | ||
| + | from os import path | ||
| + | import sys, re | ||
| + | from lxml.html import diff, etree, HTMLParser | ||
| + | import codecs | ||
| + | import StringIO | ||
| + | doc1 = '''< | ||
| + | <div class=" | ||
| + | <a title=" | ||
| + | < | ||
| + | </a> | ||
| + | </ | ||
| + | <div class=" | ||
| + | <a class=" | ||
| + | <i class=" | ||
| + | < | ||
| + | </a> | ||
| + | < | ||
| + | </ | ||
| + | </ | ||
| + | doc2 = '''< | ||
| + | <div class=" | ||
| + | <a title=" | ||
| + | < | ||
| + | </a> | ||
| + | </ | ||
| + | <div class=" | ||
| + | <a class=" | ||
| + | <i class=" | ||
| + | < | ||
| + | </a> | ||
| + | </ | ||
| + | </ | ||
| + | diffcontent = diff.htmldiff(doc1, | ||
| + | diffcontent = codecs.encode(diffcontent, | ||
| + | print diffcontent | ||
| + | </ | ||
| + | <div class=" | ||
| + | </ | ||
| + | * diff 2 HTML files:< | ||
| + | from os import path | ||
| + | import sys, re | ||
| + | from lxml.html import diff | ||
| + | import codecs | ||
| + | |||
| + | INPUT_DIR = ' | ||
| + | htmlfile1 = path.join(INPUT_DIR, | ||
| + | htmlfile2 = path.join(INPUT_DIR, | ||
| + | with open(htmlfile1, | ||
| + | content = f.read() | ||
| + | doc1 = content | ||
| + | with open(htmlfile2, | ||
| + | content = f.read() | ||
| + | doc2 = content | ||
| + | diffcontent = diff.htmldiff(doc1, | ||
| + | diffcontent = codecs.encode(diffcontent, | ||
| + | print diffcontent | ||
| </ | </ | ||
| ===== filecmp ===== | ===== filecmp ===== | ||
python/compare.1405911235.txt.gz · Last modified: (external edit)
