Differences

This shows you the differences between two versions of the page.

--- python:compare [2014/07/21 02:45] – [Finding Matching String with SequenceMatcher] admin
+++ python:compare [2022/10/29 16:15] (current) – external edit 127.0.0.1
@@ Line 7: / Line 7: @@
   * **find_longest_match**
   * **get_matching_blocks**
-The function **get_opcodes** using above these functions for parsing
+The function **get_opcodes** uses the functions above for parsing.\\
+Create a SequenceMatcher whose inputs are two **strings or two lists**.
 === match ratio ===
   * Calculate match ratio of two strings:<code python>
@@ Line 27: / Line 28: @@
 find_longest_match(alo, ahi, blo, bhi)
 </code>Find the longest matching block in a[alo:ahi] and b[blo:bhi] (lo: low, hi: high). Returns (i, j, k) such that a[i:i+k] is equal to b[j:j+k], where k is the largest possible match length
-    * Example <code python>
+    * Simple Example <code python>
 import difflib
@@ Line 45: / Line 46: @@
 Match(a=0, b=4, size=5)
 Match(a=1, b=4, size=5)
+</code>
+    * Example find_longest_match with isjunk option:<code python>
+import difflib
+a = ' abcd'
+b = 'abcd abcd'
+seq = difflib.SequenceMatcher(None, a, b)
+seq2 = difflib.SequenceMatcher(lambda x: x==" ", a, b)
+seq3 = difflib.SequenceMatcher(difflib.IS_LINE_JUNK, a, b)
+print seq.find_longest_match(0, 5, 0, 9)
+print seq2.find_longest_match(0, 5, 0, 9)
+print seq3.find_longest_match(0, 5, 0, 9)
+</code>output:<code>
+Match(a=0, b=4, size=5)
+Match(a=1, b=0, size=4)
+Match(a=1, b=0, size=4)
 </code>
 === Get matching blocks ===
@@ Line 73: / Line 92: @@
 a[9] and b[9] match for 0 elements
 </code> **a[0] and b[4] match for 5 elements:** 5 elements from a[0] are ' abcd' and 5 elements from b[4] are ' abcd'
-=== Math string with multilines ===
+=== get_opcodes ===
 <code python>
 import difflib
+import sys
+a = """ abcd
+abc pq
+ef abc
+mn
+""".splitlines(1)
+b = """abcd abcd
+ef
+mn
+""".splitlines(1)
+print 'a = ', a
+print 'b = ', b
+seq = difflib.SequenceMatcher(None, a, b)
+print '*******************************'
+for tag, alo, ahi, blo, bhi in seq.get_opcodes():
+    print '- ', tag, alo, ahi, blo, bhi, ':'
+    print '--from:'
+    for i in range(alo, ahi):
+        sys.stdout.writelines(a[i])
+    print '--to:'
+    for i in range(blo, bhi):
+        sys.stdout.writelines(b[i])
+result = list(difflib.ndiff(a, b))
+print '*******************************'
+print 'normal diff:'
+sys.stdout.writelines(result)
+</code>output:<code>
+a =  [' abcd\n', 'abc pq\n', 'ef abc\n', 'mn\n']
+b =  ['abcd abcd\n', 'ef\n', 'mn\n']
+*******************************
+-  replace 0 3 0 2 :
+--from:
+ abcd
+abc pq
+ef abc
+--to:
+abcd abcd
+ef
+-  equal 3 4 2 3 :
+--from:
+mn
+--to:
+mn
+*******************************
+normal diff:
+-  abcd
++ abcd abcd
+? ++++
++ ef
+- abc pq
+- ef abc
+  mn
+</code>
+=== Match string with multilines ===
+<code python>
+import difflib
+import sys
 a = """ abcd
 abc pq
 ef abc
+mn
 """
 b = """abcd abcd
 ef
+mn
 """
 seq = difflib.SequenceMatcher(None, a, b)
 rate = seq.ratio() * 100
-print 'rate: ',rate
+print '*************************'
-print 'longest_match: ', seq.find_longest_match(0, 20, 0, 9)
+print 'rate1: ',rate
-print 'matching blocks:'
+print 'longest_match1: ', seq.find_longest_match(0, 20, 0, 9)
+print 'matching blocks1:'
 for block in seq.get_matching_blocks():
     print "a[%d] and b[%d] match for %d elements" % block
     print '>>>>', a[block[0]:(block[0] + block[2])]
-    print '<<<<', b[block[1]:(block[1] + block[2])]</code>output:<code>
+    print '<<<<', b[block[1]:(block[1] + block[2])]
-rate:  52.9411764706
-longest_match:  Match(a=0, b=4, size=5)
+a = a.splitlines(1)
-matching blocks:
+b = b.splitlines(1)
+seq2 = difflib.SequenceMatcher(None, a, b)
+rate = seq.ratio() * 100
+print '*************************'
+print 'rate2: ',rate
+print 'longest_match2: ', seq2.find_longest_match(0, 4, 0, 3)
+print 'matching blocks2:'
+for block in seq2.get_matching_blocks():
+    print "a[%d] and b[%d] match for %d elements" % block
+    print '>>>>', a[block[0]:(block[0] + block[2])]
+    print '<<<<', b[block[1]:(block[1] + block[2])]
+d = difflib.Differ()
+result = list(d.compare(a, b))
+print 'normal diff:'
+sys.stdout.writelines(result)</code>output:<code>
+*************************
+rate1:  60.0
+longest_match1:  Match(a=0, b=4, size=5)
+matching blocks1:
 a[0] and b[4] match for 5 elements
 >>>>  abcd
@@ Line 105: / Line 205: @@
 <<<<
 ef
-a[20] and b[12] match for 1 elements
+a[20] and b[12] match for 4 elements
 >>>>
+mn
 <<<<
+mn
-a[21] and b[13] match for 0 elements
+a[24] and b[16] match for 0 elements
 >>>>
 <<<<
+*************************
+rate2:  60.0
+longest_match2:  Match(a=3, b=2, size=1)
+matching blocks2:
+a[3] and b[2] match for 1 elements
+>>>> ['mn\n']
+<<<< ['mn\n']
+a[4] and b[3] match for 0 elements
+>>>> []
+<<<< []
+normal diff:
++ abcd abcd
++ ef
+-  abcd
+- abc pq
+- ef abc
+  mn
 </code>
 === SequenceMatcher with files ===
@@ Line 153: / Line 272: @@
 from pprint import pprint
 import sys
 a = """ abcd
 abc pq
 ef abc
+ mpq
 """.splitlines(1)
 b = """abcd abcd
 abc pq
 ef
+mpq
 """.splitlines(1)
 d = difflib.Differ()
 result = list(d.compare(a, b))
+print 'normal diff:'
 sys.stdout.writelines(result)
-</code> output: <code>
+print 'diff with charjunk = difflib.IS_CHARACTER_JUNK:'
+result = difflib.ndiff(a, b)
+sys.stdout.writelines(result)</code> output: <code>
 -  abcd
 + abcd abcd
@@ Line 183: / Line 308: @@
 htmlfile2 = path.join(INPUT_DIR, 'index.php@route=account%2Flogin.html')
 with open(htmlfile1, 'r') as f:
-    doc1 = f.read().splitlines()
+    doc1 = f.read().splitlines(1)
 with open(htmlfile2, 'r') as f:
-    doc2 = f.read().splitlines()
+    doc2 = f.read().splitlines(1)
 d = difflib.Differ()
@@ Line 203: / Line 328: @@
 htmlfile2 = path.join(INPUT_DIR, 'index.php@route=account%2Flogin.html')
 with open(htmlfile1, 'r') as f:
-    doc1 = f.read().splitlines()
+    doc1 = f.read().splitlines(1)
 with open(htmlfile2, 'r') as f:
-    doc2 = f.read().splitlines()
+    doc2 = f.read().splitlines(1)
 result = difflib.ndiff(doc1, doc2)
@@ Line 211: / Line 336: @@
     for line in result:
         f.writelines(line)
+</code>
+  * Compare 2 html files:<code python>
+import difflib
+from os import path
+from pprint import pprint
+import sys, re
+INPUT_DIR = 'opencart_47066'
+htmlfile1 = path.join(INPUT_DIR, 'index.html')
+htmlfile2 = path.join(INPUT_DIR, 'index.php@route=account%2Flogin.html')
+with open(htmlfile1, 'r') as f:
+    content = f.read()
+    content = re.sub('[\s\t]+/>', '/>', content)
+    content = re.sub('[\s\t]+>', '>', content)
+    content = re.sub('>[\s\t]+<', '>\n<', content)
+    content = re.sub('[\s\t]*\n[\s\t]*', '\n', content)
+    doc1 = content.splitlines(1)
+with open(htmlfile2, 'r') as f:
+    content = f.read()
+    content = re.sub('[\s\t]+/>', '/>', content)
+    content = re.sub('[\s\t]+>', '>', content)
+    content = re.sub('>[\s\t]+<', '>\n<', content)
+    content = re.sub('[\s\t]*\n[\s\t]*', '\n', content)
+    doc2 = content.splitlines(1)
+result = difflib.ndiff(doc1, doc2)
+with open('compare.html', 'wb') as f:
+    for line in result:
+        f.writelines(line)
+</code>
+==== lxml.html.diff for comparing HTML files ====
+lxml.html.diff uses 2 basic libraries:
+  * difflib for comparing 2 files
+  * etree for parsing HTML
+Examples for lxml.html.diff:
+  * Simple diff:<code python>
+from os import path
+import sys, re
+from lxml.html import diff, etree, HTMLParser
+import codecs
+import StringIO
+doc1 = '''<div class="cart-button">
+<div class="cart">
+    <a title="Add to cart" data-id="35;" class="button addToCart-1 ">
+        <span>Add to cart</span>
+    </a>
+</div>
+<div class="wishlist">
+    <a class="tooltip-1" title="Add to Wish List" onclick="addToWishList('35');">
+    <i class="icon-star"></i>
+    <span>Add to Wish List</span>
+    </a>
+    <b>simple</b>
+</div>
+</div>'''
+doc2 = '''<div class="cart-button">
+<div class="cart">
+    <a title="Add to cart" data-id="35;" class="button addToCart-1 ">
+        <span>Add to cart</span>
+    </a>
+</div>
+<div class="wishlist">
+    <a class="tooltip-1" title="Add to Wish List" onclick="addToWishList('30');">
+    <i class="icon-star"></i>
+    <span>Add to Wish List change</span>
+    </a>
+</div>
+</div>'''
+diffcontent = diff.htmldiff(doc1, doc2)
+diffcontent = codecs.encode(diffcontent, 'utf-8')
+print diffcontent
+</code>output:<code html>
+<div class="cart-button"><div class="cart"><a title="Add to cart" data-id="35;" class="button addToCart-1 "><span>Add to cart</span> </a> </div> <div class="wishlist"><a class="tooltip-1" title="Add to Wish List" onclick="addToWishList('30');"><i class="icon-star"></i> <span>Add to Wish List <ins>change</ins> </span> </a> <del><b>simple</b></del> </div> </div>
+</code>
+  * diff 2 HTML files:<code python>
+from os import path
+import sys, re
+from lxml.html import diff
+import codecs
+INPUT_DIR = 'opencart_47066'
+htmlfile1 = path.join(INPUT_DIR, 'index.html')
+htmlfile2 = path.join(INPUT_DIR, 'index.php@route=account%2Flogin.html')
+with open(htmlfile1, 'r') as f:
+    content = f.read()
+    doc1 = content
+with open(htmlfile2, 'r') as f:
+    content = f.read()
+    doc2 = content
+diffcontent = diff.htmldiff(doc1, doc2)
+diffcontent = codecs.encode(diffcontent, 'utf-8')
+print diffcontent
 </code>
 ===== filecmp =====