python:compare
Differences
This shows you the differences between two versions of the page.
| Both sides previous revision / Previous revision / Next revision | Previous revision | ||
| python:compare [2014/07/21 05:28] – [diff 2 files] admin | python:compare [2022/10/29 16:15] (current) – external edit 127.0.0.1 | ||
|---|---|---|---|
| Line 7: | Line 7: | ||
| * **find_longest_match** | * **find_longest_match** | ||
| * **get_matching_blocks** | * **get_matching_blocks** | ||
| - | The function **get_opcodes** using above these functions for parsing | + | The function **get_opcodes** uses the functions above for parsing\\ |
| + | Create a SequenceMatcher whose inputs are two **strings or two lists** | ||
| === match ratio === | === match ratio === | ||
| * Calculate match ratio of two strings:< | * Calculate match ratio of two strings:< | ||
| Line 91: | Line 92: | ||
| a[9] and b[9] match for 0 elements | a[9] and b[9] match for 0 elements | ||
| </ | </ | ||
| - | === Math string with multilines | + | === get_opcodes |
| <code python> | <code python> | ||
| import difflib | import difflib | ||
| + | import sys | ||
| + | |||
| + | a = """ | ||
| + | abc pq | ||
| + | ef abc | ||
| + | mn | ||
| + | """ | ||
| + | b = """ | ||
| + | ef | ||
| + | mn | ||
| + | """ | ||
| + | print 'a = ', a | ||
| + | print 'b = ', b | ||
| + | seq = difflib.SequenceMatcher(None, | ||
| + | print ' | ||
| + | for tag, alo, ahi, blo, bhi in seq.get_opcodes(): | ||
| + | print '- ', tag, alo, ahi, blo, bhi, ':' | ||
| + | print ' | ||
| + | for i in range(alo, ahi): | ||
| + | sys.stdout.writelines(a[i]) | ||
| + | print ' | ||
| + | for i in range(blo, bhi): | ||
| + | sys.stdout.writelines(b[i]) | ||
| + | result = list(difflib.ndiff(a, | ||
| + | print ' | ||
| + | print ' | ||
| + | sys.stdout.writelines(result) | ||
| + | </ | ||
| + | a = [' abcd\n', | ||
| + | b = ['abcd abcd\n', | ||
| + | ******************************* | ||
| + | - replace 0 3 0 2 : | ||
| + | --from: | ||
| + | abcd | ||
| + | abc pq | ||
| + | ef abc | ||
| + | --to: | ||
| + | abcd abcd | ||
| + | ef | ||
| + | - equal 3 4 2 3 : | ||
| + | --from: | ||
| + | mn | ||
| + | --to: | ||
| + | mn | ||
| + | ******************************* | ||
| + | normal diff: | ||
| + | - abcd | ||
| + | + abcd abcd | ||
| + | ? ++++ | ||
| + | + ef | ||
| + | - abc pq | ||
| + | - ef abc | ||
| + | mn | ||
| + | </ | ||
| + | === Match string with multilines === | ||
| + | <code python> | ||
| + | import difflib | ||
| + | import sys | ||
| a = """ | a = """ | ||
| abc pq | abc pq | ||
| ef abc | ef abc | ||
| + | mn | ||
| """ | """ | ||
| b = """ | b = """ | ||
| ef | ef | ||
| + | mn | ||
| """ | """ | ||
| seq = difflib.SequenceMatcher(None, | seq = difflib.SequenceMatcher(None, | ||
| rate = seq.ratio() * 100 | rate = seq.ratio() * 100 | ||
| - | print 'rate: ', | + | print '*************************' |
| - | print 'longest_match: ', seq.find_longest_match(0, | + | print 'rate1: ', |
| - | print ' | + | print 'longest_match1: ', seq.find_longest_match(0, |
| + | print ' | ||
| for block in seq.get_matching_blocks(): | for block in seq.get_matching_blocks(): | ||
| print "a[%d] and b[%d] match for %d elements" | print "a[%d] and b[%d] match for %d elements" | ||
| print '>>>>', | print '>>>>', | ||
| - | print '<<<<', | + | print '<<<<', |
| - | rate: | + | |
| - | longest_match: Match(a=0, b=4, size=5) | + | a = a.splitlines(1) |
| - | matching | + | b = b.splitlines(1) |
| + | seq2 = difflib.SequenceMatcher(None, | ||
| + | rate = seq.ratio() * 100 | ||
| + | print ' | ||
| + | print ' | ||
| + | print ' | ||
| + | print ' | ||
| + | for block in seq2.get_matching_blocks(): | ||
| + | print "a[%d] and b[%d] match for %d elements" | ||
| + | print '>>>>', | ||
| + | print '<<<<', | ||
| + | |||
| + | |||
| + | d = difflib.Differ() | ||
| + | result = list(d.compare(a, | ||
| + | print ' | ||
| + | sys.stdout.writelines(result)</ | ||
| + | ************************* | ||
| + | rate1: | ||
| + | longest_match1: Match(a=0, b=4, size=5) | ||
| + | matching | ||
| a[0] and b[4] match for 5 elements | a[0] and b[4] match for 5 elements | ||
| >>>> | >>>> | ||
| Line 123: | Line 205: | ||
| <<<< | <<<< | ||
| ef | ef | ||
| - | a[20] and b[12] match for 1 elements | + | a[20] and b[12] match for 4 elements |
| >>>> | >>>> | ||
| + | mn | ||
| <<<< | <<<< | ||
| + | mn | ||
| - | a[21] and b[13] match for 0 elements | + | a[24] and b[16] match for 0 elements |
| >>>> | >>>> | ||
| <<<< | <<<< | ||
| + | ************************* | ||
| + | rate2: | ||
| + | longest_match2: | ||
| + | matching blocks2: | ||
| + | a[3] and b[2] match for 1 elements | ||
| + | >>>> | ||
| + | <<<< | ||
| + | a[4] and b[3] match for 0 elements | ||
| + | >>>> | ||
| + | <<<< | ||
| + | normal diff: | ||
| + | + abcd abcd | ||
| + | + ef | ||
| + | - abcd | ||
| + | - abc pq | ||
| + | - ef abc | ||
| + | mn | ||
| </ | </ | ||
| === SequenceMatcher with files === | === SequenceMatcher with files === | ||
| Line 247: | Line 348: | ||
| with open(htmlfile1, | with open(htmlfile1, | ||
| content = f.read() | content = f.read() | ||
| - | content = re.sub(' | + | content = re.sub(' |
| - | content = re.sub(' | + | content = re.sub(' |
| - | content = re.sub(' | + | content = re.sub('> |
| + | content = re.sub(' | ||
| doc1 = content.splitlines(1) | doc1 = content.splitlines(1) | ||
| with open(htmlfile2, | with open(htmlfile2, | ||
| content = f.read() | content = f.read() | ||
| - | content = re.sub(' | + | content = re.sub(' |
| - | content = re.sub(' | + | content = re.sub(' |
| - | content = re.sub(' | + | content = re.sub('> |
| + | content = re.sub(' | ||
| doc2 = content.splitlines(1) | doc2 = content.splitlines(1) | ||
| - | with open(' | ||
| - | for line in doc2: | ||
| - | f.writelines(line) | ||
| result = difflib.ndiff(doc1, | result = difflib.ndiff(doc1, | ||
| with open(' | with open(' | ||
| for line in result: | for line in result: | ||
| f.writelines(line) | f.writelines(line) | ||
| - | with open(' | + | </ |
| - | | + | ==== lxml.html.diff for comparing HTML files ==== |
| - | f.writelines(line) | + | lxml.html.diff uses 2 basic libraries: |
| + | * difflib for comparing 2 files | ||
| + | * etree for parsing HTML | ||
| + | Examples for lxml.html.diff: | ||
| + | * Simple diff:< | ||
| + | from os import path | ||
| + | import sys, re | ||
| + | from lxml.html import diff, etree, HTMLParser | ||
| + | import codecs | ||
| + | import StringIO | ||
| + | doc1 = '''< | ||
| + | <div class=" | ||
| + | <a title=" | ||
| + | < | ||
| + | </ | ||
| + | </ | ||
| + | <div class=" | ||
| + | <a class=" | ||
| + | <i class=" | ||
| + | < | ||
| + | </ | ||
| + | < | ||
| + | </ | ||
| + | </ | ||
| + | doc2 = '''< | ||
| + | <div class=" | ||
| + | <a title=" | ||
| + | < | ||
| + | </ | ||
| + | </ | ||
| + | <div class=" | ||
| + | <a class=" | ||
| + | <i class=" | ||
| + | < | ||
| + | </ | ||
| + | </ | ||
| + | </ | ||
| + | diffcontent = diff.htmldiff(doc1, | ||
| + | diffcontent = codecs.encode(diffcontent, | ||
| + | print diffcontent | ||
| + | </ | ||
| + | <div class=" | ||
| + | </ | ||
| + | * diff 2 HTML files:< | ||
| + | from os import path | ||
| + | import sys, re | ||
| + | from lxml.html import diff | ||
| + | import codecs | ||
| + | |||
| + | INPUT_DIR = ' | ||
| + | htmlfile1 = path.join(INPUT_DIR, | ||
| + | htmlfile2 = path.join(INPUT_DIR, 'index.php@route=account%2Flogin.html' | ||
| + | with open(htmlfile1, | ||
| + | | ||
| + | | ||
| + | with open(htmlfile2, | ||
| + | | ||
| + | doc2 = content | ||
| + | diffcontent = diff.htmldiff(doc1, | ||
| + | diffcontent = codecs.encode(diffcontent, | ||
| + | print diffcontent | ||
| </ | </ | ||
| ===== filecmp ===== | ===== filecmp ===== | ||
python/compare.1405920518.txt.gz · Last modified: (external edit)
