python:compare
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
python:compare [2014/07/21 02:45] – [Finding Matching String with SequenceMatcher] admin | python:compare [2022/10/29 16:15] (current) – external edit 127.0.0.1 | ||
---|---|---|---|
Line 7: | Line 7: | ||
* **find_longest_match** | * **find_longest_match** | ||
* **get_matching_blocks** | * **get_matching_blocks** | ||
- | The function **get_opcodes** uses the functions above for parsing | + | The function **get_opcodes** uses the functions above for parsing\\ |
+ | Create a SequenceMatcher whose inputs are two **strings or two lists** | ||
=== match ratio === | === match ratio === | ||
* Calculate match ratio of two strings:< | * Calculate match ratio of two strings:< | ||
Line 27: | Line 28: | ||
find_longest_match(alo, | find_longest_match(alo, | ||
</ | </ | ||
- | * Example <code python> | + | * Simple |
import difflib | import difflib | ||
Line 45: | Line 46: | ||
Match(a=0, b=4, size=5) | Match(a=0, b=4, size=5) | ||
Match(a=1, b=4, size=5) | Match(a=1, b=4, size=5) | ||
+ | </ | ||
+ | * Example find_longest_match with isjunk option:< | ||
+ | import difflib | ||
+ | |||
+ | a = ' abcd' | ||
+ | b = 'abcd abcd' | ||
+ | |||
+ | seq = difflib.SequenceMatcher(None, | ||
+ | seq2 = difflib.SequenceMatcher(lambda x: x==" ", a, b) | ||
+ | seq3 = difflib.SequenceMatcher(difflib.IS_LINE_JUNK, | ||
+ | |||
+ | print seq.find_longest_match(0, | ||
+ | print seq2.find_longest_match(0, | ||
+ | print seq3.find_longest_match(0, | ||
+ | </ | ||
+ | Match(a=0, b=4, size=5) | ||
+ | Match(a=1, b=0, size=4) | ||
+ | Match(a=1, b=0, size=4) | ||
</ | </ | ||
=== Get matching blocks === | === Get matching blocks === | ||
Line 73: | Line 92: | ||
a[9] and b[9] match for 0 elements | a[9] and b[9] match for 0 elements | ||
</ | </ | ||
- | === Math string with multilines | + | === get_opcodes |
<code python> | <code python> | ||
import difflib | import difflib | ||
+ | import sys | ||
+ | |||
+ | a = """ | ||
+ | abc pq | ||
+ | ef abc | ||
+ | mn | ||
+ | """ | ||
+ | b = """ | ||
+ | ef | ||
+ | mn | ||
+ | """ | ||
+ | print 'a = ', a | ||
+ | print 'b = ', b | ||
+ | seq = difflib.SequenceMatcher(None, | ||
+ | print ' | ||
+ | for tag, alo, ahi, blo, bhi in seq.get_opcodes(): | ||
+ | print '- ', tag, alo, ahi, blo, bhi, ':' | ||
+ | print ' | ||
+ | for i in range(alo, ahi): | ||
+ | sys.stdout.writelines(a[i]) | ||
+ | print ' | ||
+ | for i in range(blo, bhi): | ||
+ | sys.stdout.writelines(b[i]) | ||
+ | result = list(difflib.ndiff(a, | ||
+ | print ' | ||
+ | print ' | ||
+ | sys.stdout.writelines(result) | ||
+ | </ | ||
+ | a = [' abcd\n', | ||
+ | b = ['abcd abcd\n', | ||
+ | ******************************* | ||
+ | - replace 0 3 0 2 : | ||
+ | --from: | ||
+ | abcd | ||
+ | abc pq | ||
+ | ef abc | ||
+ | --to: | ||
+ | abcd abcd | ||
+ | ef | ||
+ | - equal 3 4 2 3 : | ||
+ | --from: | ||
+ | mn | ||
+ | --to: | ||
+ | mn | ||
+ | ******************************* | ||
+ | normal diff: | ||
+ | - abcd | ||
+ | + abcd abcd | ||
+ | ? ++++ | ||
+ | + ef | ||
+ | - abc pq | ||
+ | - ef abc | ||
+ | mn | ||
+ | </ | ||
+ | === Match string with multilines === | ||
+ | <code python> | ||
+ | import difflib | ||
+ | import sys | ||
a = """ | a = """ | ||
abc pq | abc pq | ||
ef abc | ef abc | ||
+ | mn | ||
""" | """ | ||
b = """ | b = """ | ||
ef | ef | ||
+ | mn | ||
""" | """ | ||
seq = difflib.SequenceMatcher(None, | seq = difflib.SequenceMatcher(None, | ||
rate = seq.ratio() * 100 | rate = seq.ratio() * 100 | ||
- | print 'rate: ', | + | print '*************************' |
- | print 'longest_match: ', seq.find_longest_match(0, | + | print 'rate1: ', |
- | print ' | + | print 'longest_match1: ', seq.find_longest_match(0, |
+ | print ' | ||
for block in seq.get_matching_blocks(): | for block in seq.get_matching_blocks(): | ||
print "a[%d] and b[%d] match for %d elements" | print "a[%d] and b[%d] match for %d elements" | ||
print '>>>>', | print '>>>>', | ||
- | print '<<<<', | + | print '<<<<', |
- | rate: | + | |
- | longest_match: Match(a=0, b=4, size=5) | + | a = a.splitlines(1) |
- | matching | + | b = b.splitlines(1) |
+ | seq2 = difflib.SequenceMatcher(None, | ||
+ | rate = seq.ratio() * 100 | ||
+ | print ' | ||
+ | print ' | ||
+ | print ' | ||
+ | print ' | ||
+ | for block in seq2.get_matching_blocks(): | ||
+ | print "a[%d] and b[%d] match for %d elements" | ||
+ | print '>>>>', | ||
+ | print '<<<<', | ||
+ | |||
+ | |||
+ | d = difflib.Differ() | ||
+ | result = list(d.compare(a, | ||
+ | print ' | ||
+ | sys.stdout.writelines(result)</ | ||
+ | ************************* | ||
+ | rate1: | ||
+ | longest_match1: Match(a=0, b=4, size=5) | ||
+ | matching | ||
a[0] and b[4] match for 5 elements | a[0] and b[4] match for 5 elements | ||
>>>> | >>>> | ||
Line 105: | Line 205: | ||
<<<< | <<<< | ||
ef | ef | ||
- | a[20] and b[12] match for 1 elements | + | a[20] and b[12] match for 4 elements |
>>>> | >>>> | ||
+ | mn | ||
<<<< | <<<< | ||
+ | mn | ||
- | a[21] and b[13] match for 0 elements | + | a[24] and b[16] match for 0 elements |
>>>> | >>>> | ||
<<<< | <<<< | ||
+ | ************************* | ||
+ | rate2: | ||
+ | longest_match2: | ||
+ | matching blocks2: | ||
+ | a[3] and b[2] match for 1 elements | ||
+ | >>>> | ||
+ | <<<< | ||
+ | a[4] and b[3] match for 0 elements | ||
+ | >>>> | ||
+ | <<<< | ||
+ | normal diff: | ||
+ | + abcd abcd | ||
+ | + ef | ||
+ | - abcd | ||
+ | - abc pq | ||
+ | - ef abc | ||
+ | mn | ||
</ | </ | ||
=== SequenceMatcher with files === | === SequenceMatcher with files === | ||
Line 153: | Line 272: | ||
from pprint import pprint | from pprint import pprint | ||
import sys | import sys | ||
+ | |||
a = """ | a = """ | ||
abc pq | abc pq | ||
ef abc | ef abc | ||
+ | mpq | ||
""" | """ | ||
b = """ | b = """ | ||
abc pq | abc pq | ||
ef | ef | ||
+ | mpq | ||
""" | """ | ||
- | + | ||
d = difflib.Differ() | d = difflib.Differ() | ||
result = list(d.compare(a, | result = list(d.compare(a, | ||
+ | print ' | ||
sys.stdout.writelines(result) | sys.stdout.writelines(result) | ||
- | </ | + | |
+ | print 'diff with charjunk = difflib.IS_CHARACTER_JUNK:' | ||
+ | result = difflib.ndiff(a, | ||
+ | sys.stdout.writelines(result)</ | ||
- abcd | - abcd | ||
+ abcd abcd | + abcd abcd | ||
Line 183: | Line 308: | ||
htmlfile2 = path.join(INPUT_DIR, | htmlfile2 = path.join(INPUT_DIR, | ||
with open(htmlfile1, | with open(htmlfile1, | ||
- | doc1 = f.read().splitlines() | + | doc1 = f.read().splitlines(1) |
with open(htmlfile2, | with open(htmlfile2, | ||
- | doc2 = f.read().splitlines() | + | doc2 = f.read().splitlines(1) |
d = difflib.Differ() | d = difflib.Differ() | ||
Line 203: | Line 328: | ||
htmlfile2 = path.join(INPUT_DIR, | htmlfile2 = path.join(INPUT_DIR, | ||
with open(htmlfile1, | with open(htmlfile1, | ||
- | doc1 = f.read().splitlines() | + | doc1 = f.read().splitlines(1) |
with open(htmlfile2, | with open(htmlfile2, | ||
- | doc2 = f.read().splitlines() | + | doc2 = f.read().splitlines(1) |
result = difflib.ndiff(doc1, | result = difflib.ndiff(doc1, | ||
Line 211: | Line 336: | ||
for line in result: | for line in result: | ||
f.writelines(line) | f.writelines(line) | ||
+ | </ | ||
+ | * Compare 2 html files:< | ||
+ | import difflib | ||
+ | from os import path | ||
+ | from pprint import pprint | ||
+ | import sys, re | ||
+ | |||
+ | INPUT_DIR = ' | ||
+ | htmlfile1 = path.join(INPUT_DIR, | ||
+ | htmlfile2 = path.join(INPUT_DIR, | ||
+ | with open(htmlfile1, | ||
+ | content = f.read() | ||
+ | content = re.sub(' | ||
+ | content = re.sub(' | ||
+ | content = re.sub('> | ||
+ | content = re.sub(' | ||
+ | doc1 = content.splitlines(1) | ||
+ | with open(htmlfile2, | ||
+ | content = f.read() | ||
+ | content = re.sub(' | ||
+ | content = re.sub(' | ||
+ | content = re.sub('> | ||
+ | content = re.sub(' | ||
+ | doc2 = content.splitlines(1) | ||
+ | |||
+ | result = difflib.ndiff(doc1, | ||
+ | with open(' | ||
+ | for line in result: | ||
+ | f.writelines(line) | ||
+ | </ | ||
+ | ==== lxml.html.diff for comparing HTML files ==== | ||
+ | lxml.html.diff uses 2 basic libraries: | ||
+ | * difflib for comparing 2 files | ||
+ | * etree for parsing HTML | ||
+ | Examples for lxml.html.diff: | ||
+ | * Simple diff:< | ||
+ | from os import path | ||
+ | import sys, re | ||
+ | from lxml.html import diff, etree, HTMLParser | ||
+ | import codecs | ||
+ | import StringIO | ||
+ | doc1 = '''< | ||
+ | <div class=" | ||
+ | <a title=" | ||
+ | < | ||
+ | </a> | ||
+ | </ | ||
+ | <div class=" | ||
+ | <a class=" | ||
+ | <i class=" | ||
+ | < | ||
+ | </a> | ||
+ | < | ||
+ | </ | ||
+ | </ | ||
+ | doc2 = '''< | ||
+ | <div class=" | ||
+ | <a title=" | ||
+ | < | ||
+ | </a> | ||
+ | </ | ||
+ | <div class=" | ||
+ | <a class=" | ||
+ | <i class=" | ||
+ | < | ||
+ | </a> | ||
+ | </ | ||
+ | </ | ||
+ | diffcontent = diff.htmldiff(doc1, | ||
+ | diffcontent = codecs.encode(diffcontent, | ||
+ | print diffcontent | ||
+ | </ | ||
+ | <div class=" | ||
+ | </ | ||
+ | * diff 2 HTML files:< | ||
+ | from os import path | ||
+ | import sys, re | ||
+ | from lxml.html import diff | ||
+ | import codecs | ||
+ | |||
+ | INPUT_DIR = ' | ||
+ | htmlfile1 = path.join(INPUT_DIR, | ||
+ | htmlfile2 = path.join(INPUT_DIR, | ||
+ | with open(htmlfile1, | ||
+ | content = f.read() | ||
+ | doc1 = content | ||
+ | with open(htmlfile2, | ||
+ | content = f.read() | ||
+ | doc2 = content | ||
+ | diffcontent = diff.htmldiff(doc1, | ||
+ | diffcontent = codecs.encode(diffcontent, | ||
+ | print diffcontent | ||
</ | </ | ||
===== filecmp ===== | ===== filecmp ===== |
python/compare.1405910751.txt.gz · Last modified: 2022/10/29 16:15 (external edit)