User Tools

Site Tools


python:compare

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revisionPrevious revision
Next revision
Previous revision
Last revisionBoth sides next revision
python:compare [2014/07/21 03:45] – [longest match ratio] adminpython:compare [2014/07/22 14:10] – [lxml.html.diff for comparing HTML files] admin
Line 7: Line 7:
   * **find_longest_match**   * **find_longest_match**
   * **get_matching_blocks**   * **get_matching_blocks**
-The function **get_opcodes** using above these functions for parsing+The function **get_opcodes** uses the functions above for parsing\\ 
 +Create a SequenceMatcher whose inputs are two **strings or two lists**
 === match ratio === === match ratio ===
   * Calculate match ratio of two strings:<code python>   * Calculate match ratio of two strings:<code python>
Line 54: Line 55:
 seq = difflib.SequenceMatcher(None, a, b) seq = difflib.SequenceMatcher(None, a, b)
 seq2 = difflib.SequenceMatcher(lambda x: x==" ", a, b) seq2 = difflib.SequenceMatcher(lambda x: x==" ", a, b)
 +seq3 = difflib.SequenceMatcher(difflib.IS_LINE_JUNK, a, b)
  
 print seq.find_longest_match(0, 5, 0, 9) print seq.find_longest_match(0, 5, 0, 9)
 print seq2.find_longest_match(0, 5, 0, 9) print seq2.find_longest_match(0, 5, 0, 9)
 +print seq3.find_longest_match(0, 5, 0, 9)
 </code>output:<code> </code>output:<code>
 Match(a=0, b=4, size=5) Match(a=0, b=4, size=5)
 +Match(a=1, b=0, size=4)
 Match(a=1, b=0, size=4) Match(a=1, b=0, size=4)
 </code> </code>
Line 88: Line 92:
 a[9] and b[9] match for 0 elements a[9] and b[9] match for 0 elements
 </code> **a[0] and b[4] match for 5 elements:** the 5 elements starting at a[0] are ' abcd' and the 5 elements starting at b[4] are ' abcd' </code> **a[0] and b[4] match for 5 elements:** the 5 elements starting at a[0] are ' abcd' and the 5 elements starting at b[4] are ' abcd'
-=== Math string with multilines ===+=== get_opcodes ===
 <code python> <code python>
 import difflib import difflib
 +import sys
 + 
 +a = """ abcd
 +abc pq
 +ef abc
 +mn
 +""".splitlines(1)
 +b = """abcd abcd
 +ef
 +mn
 +""".splitlines(1)
 +print 'a = ', a
 +print 'b = ', b
 +seq = difflib.SequenceMatcher(None, a, b)
 +print '*******************************'
 +for tag, alo, ahi, blo, bhi in seq.get_opcodes():
 +    print '- ', tag, alo, ahi, blo, bhi, ':'
 +    print '--from:'
 +    for i in range(alo, ahi):
 +        sys.stdout.writelines(a[i])
 +    print '--to:'
 +    for i in range(blo, bhi):
 +        sys.stdout.writelines(b[i])
 +result = list(difflib.ndiff(a, b))
 +print '*******************************'
 +print 'normal diff:'
 +sys.stdout.writelines(result)
 +</code>output:<code>
 +a =  [' abcd\n', 'abc pq\n', 'ef abc\n', 'mn\n']
 +b =  ['abcd abcd\n', 'ef\n', 'mn\n']
 +*******************************
 +-  replace 0 3 0 2 :
 +--from:
 + abcd
 +abc pq
 +ef abc
 +--to:
 +abcd abcd
 +ef
 +-  equal 3 4 2 3 :
 +--from:
 +mn
 +--to:
 +mn
 +*******************************
 +normal diff:
 +-  abcd
 ++ abcd abcd
 +? ++++
 ++ ef
 +- abc pq
 +- ef abc
 +  mn
 +</code>
 +=== Match string with multilines ===
 +<code python>
 +import difflib
 +import sys
    
 a = """ abcd  a = """ abcd 
 abc pq abc pq
 ef abc ef abc
 +mn
 """ """
 b = """abcd abcd b = """abcd abcd
 ef ef
 +mn
 """ """
    
 seq = difflib.SequenceMatcher(None, a, b) seq = difflib.SequenceMatcher(None, a, b)
 rate = seq.ratio() * 100 rate = seq.ratio() * 100
-print 'rate: ',rate +print '*************************' 
-print 'longest_match: ', seq.find_longest_match(0, 20, 0, 9) +print 'rate1: ',rate 
-print 'matching blocks:'+print 'longest_match1: ', seq.find_longest_match(0, 20, 0, 9) 
 +print 'matching blocks1:'
 for block in seq.get_matching_blocks(): for block in seq.get_matching_blocks():
     print "a[%d] and b[%d] match for %d elements" % block     print "a[%d] and b[%d] match for %d elements" % block
     print '>>>>', a[block[0]:(block[0] + block[2])]     print '>>>>', a[block[0]:(block[0] + block[2])]
-    print '<<<<', b[block[1]:(block[1] + block[2])]</code>output:<code> +    print '<<<<', b[block[1]:(block[1] + block[2])] 
-rate:  52.9411764706 + 
-longest_match:  Match(a=0, b=4, size=5) +a = a.splitlines(1) 
-matching blocks:+b = b.splitlines(1) 
 +seq2 = difflib.SequenceMatcher(None, a, b) 
 +rate = seq.ratio() * 100 
 +print '*************************' 
 +print 'rate2: ',rate 
 +print 'longest_match2: ', seq2.find_longest_match(0, 4, 0, 3) 
 +print 'matching blocks2:' 
 +for block in seq2.get_matching_blocks(): 
 +    print "a[%d] and b[%d] match for %d elements" % block 
 +    print '>>>>', a[block[0]:(block[0] + block[2])] 
 +    print '<<<<', b[block[1]:(block[1] + block[2])] 
 + 
 + 
 +d = difflib.Differ() 
 +result = list(d.compare(a, b)) 
 +print 'normal diff:' 
 +sys.stdout.writelines(result)</code>output:<code> 
 +************************* 
 +rate1:  60.0 
 +longest_match1:  Match(a=0, b=4, size=5) 
 +matching blocks1:
 a[0] and b[4] match for 5 elements a[0] and b[4] match for 5 elements
 >>>>  abcd >>>>  abcd
Line 120: Line 205:
 <<<< <<<<
 ef ef
-a[20] and b[12] match for 1 elements+a[20] and b[12] match for 4 elements
 >>>> >>>>
 +mn
  
 <<<< <<<<
 +mn
  
-a[21] and b[13] match for 0 elements+a[24] and b[16] match for 0 elements
 >>>> >>>>
 <<<< <<<<
 +*************************
 +rate2:  60.0
 +longest_match2:  Match(a=3, b=2, size=1)
 +matching blocks2:
 +a[3] and b[2] match for 1 elements
 +>>>> ['mn\n']
 +<<<< ['mn\n']
 +a[4] and b[3] match for 0 elements
 +>>>> []
 +<<<< []
 +normal diff:
 ++ abcd abcd
 ++ ef
 +-  abcd
 +- abc pq
 +- ef abc
 +  mn
 </code> </code>
 === SequenceMatcher with files === === SequenceMatcher with files ===
Line 232: Line 336:
     for line in result:     for line in result:
         f.writelines(line)         f.writelines(line)
 +</code>
 +  * Compare 2 html files:<code python>
 +import difflib
 +from os import path
 +from pprint import pprint
 +import sys, re 
 + 
 +INPUT_DIR = 'opencart_47066'
 +htmlfile1 = path.join(INPUT_DIR, 'index.html')
 +htmlfile2 = path.join(INPUT_DIR, 'index.php@route=account%2Flogin.html')
 +with open(htmlfile1, 'r') as f:
 +    content = f.read()
 +    content = re.sub('[\s\t]+/>', '/>', content)
 +    content = re.sub('[\s\t]+>', '>', content)
 +    content = re.sub('>[\s\t]+<', '>\n<', content)
 +    content = re.sub('[\s\t]*\n[\s\t]*', '\n', content)
 +    doc1 = content.splitlines(1)
 +with open(htmlfile2, 'r') as f:
 +    content = f.read()
 +    content = re.sub('[\s\t]+/>', '/>', content)
 +    content = re.sub('[\s\t]+>', '>', content)
 +    content = re.sub('>[\s\t]+<', '>\n<', content)
 +    content = re.sub('[\s\t]*\n[\s\t]*', '\n', content)
 +    doc2 = content.splitlines(1)
 +
 +result = difflib.ndiff(doc1, doc2)
 +with open('compare.html', 'wb') as f:
 +    for line in result:
 +        f.writelines(line)
 +</code>
 +==== lxml.html.diff for comparing HTML files ====
 +lxml.html.diff uses 2 basic libraries:
 +  * difflib for comparing 2 files
 +  * etree for parsing HTML
 +Examples for lxml.html.diff:
 +  * Simple diff:<code python>
 +from os import path
 +import sys, re
 +from lxml.html import diff, etree, HTMLParser
 +import codecs
 +import StringIO
 +doc1 = '''<div class="cart-button">
 +<div class="cart">
 +    <a title="Add to cart" data-id="35;" class="button addToCart-1 "> 
 +        <span>Add to cart</span>
 +    </a>
 +</div> 
 +<div class="wishlist">
 +    <a class="tooltip-1" title="Add to Wish List" onclick="addToWishList('35');">
 +    <i class="icon-star"></i>
 +    <span>Add to Wish List</span>
 +    </a>
 +    <b>simple</b>
 +</div>
 +</div>'''
 +doc2 = '''<div class="cart-button">
 +<div class="cart">
 +    <a title="Add to cart" data-id="35;" class="button addToCart-1 "> 
 +        <span>Add to cart</span>
 +    </a>
 +</div> 
 +<div class="wishlist">
 +    <a class="tooltip-1" title="Add to Wish List" onclick="addToWishList('30');">
 +    <i class="icon-star"></i>
 +    <span>Add to Wish List change</span>
 +    </a>
 +</div>
 +</div>'''
 +diffcontent = diff.htmldiff(doc1, doc2)
 +diffcontent = codecs.encode(diffcontent, 'utf-8')
 +print diffcontent
 +</code>output:<code html>
 +<div class="cart-button"><div class="cart"><a title="Add to cart" data-id="35;" class="button addToCart-1 "><span>Add to cart</span> </a> </div> <div class="wishlist"><a class="tooltip-1" title="Add to Wish List" onclick="addToWishList('30');"><i class="icon-star"></i> <span>Add to Wish List <ins>change</ins> </span> </a> <del><b>simple</b></del> </div> </div>
 +</code>
 +  * diff 2 HTML files:<code python>
 +from os import path
 +import sys, re
 +from lxml.html import diff
 +import codecs
 + 
 +INPUT_DIR = 'opencart_47066'
 +htmlfile1 = path.join(INPUT_DIR, 'index.html')
 +htmlfile2 = path.join(INPUT_DIR, 'index.php@route=account%2Flogin.html')
 +with open(htmlfile1, 'r') as f:
 +    content = f.read()
 +    doc1 = content
 +with open(htmlfile2, 'r') as f:
 +    content = f.read()
 +    doc2 = content
 +diffcontent = diff.htmldiff(doc1, doc2)
 +diffcontent = codecs.encode(diffcontent, 'utf-8')
 +print diffcontent 
 </code> </code>
 ===== filecmp ===== ===== filecmp =====
python/compare.txt · Last modified: 2022/10/29 16:15 by 127.0.0.1