|
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Walk-through of common XPath expressions, run against a small HTML snippet
with lxml. Each query result is printed so the output can be compared with
the expression that produced it.

@author: Xianhuan (闲欢)
"""
from lxml import etree

# NOTE: the last <li> below is deliberately left without its closing </li>
# tag. The note has to live out here as a Python comment: written inside the
# string it becomes part of the HTML itself, and a literal "</li>" in it would
# close the very element it describes as unclosed.
text = '''
<div>
 <ul id='ultest'>
 <li class="item-0"><a href="link1.html">first item</a></li>
 <li class="item-1"><a href="link2.html">second item</a></li>
 <li class="item-inactive"><a href="link3.html">third item</a></li>
 <li class="item-1"><a href="link4.html"><span>fourth item</span></a></li>
 <li class="item-0"><a href="link5.html">fifth item</a>
 </ul>
 </div>
'''

# etree.HTML() parses the markup and returns the root element, which is the
# starting point for the XPath queries below.
page = etree.HTML(text)
print(type(page))
# tostring() returns bytes by default, so this prints a bytes literal.
print(etree.tostring(page))

# nodename: child elements of the context node with that tag name.
# The context node is the parsed root, so only its direct children are
# considered and a nested <ul> is not matched.
print(page.xpath("ul"))

# /  : absolute path, starting at the document root.
print(page.xpath("/html"))

# // : match at any depth in the document.
print(page.xpath("//li"))

# .  : the context node itself.
ul = page.xpath("//ul")
print(ul)
print(ul[0].xpath("."))
print(ul[0].xpath("./li"))

# .. : the parent of the context node.
print(ul[0].xpath(".."))

# @  : attribute selection.
print(ul[0].xpath("@id"))

# Predicates
# The third <li>.
print(page.xpath('//ul/li[3]'))
# The last <li>.
print(page.xpath('//ul/li[last()]'))
# The second-to-last <li>.
print(page.xpath('//ul/li[last()-1]'))
# <li> elements whose position is less than 3 (i.e. the first two).
print(page.xpath('//ul/li[position()<3]'))
# <li> elements that have a class attribute.
print(page.xpath('//li[@class]'))
# <li> elements whose class attribute equals "item-inactive".
print(page.xpath("//li[@class='item-inactive']"))


# Extracting text
# text(): text nodes that are direct children of the selected <a> elements.
print(page.xpath('//ul/li/a/text()'))
# string(): string value of the first matched node, descendants included.
print(page.xpath('string(//ul)'))

# Wildcards
# *  : any element child of an <li>.
print(page.xpath('//li/*'))
# @* : any attribute of an <li>.
print(page.xpath('//li/@*'))

# |  : union of two node-sets.
print(page.xpath("//li|//a"))

# Functions
# contains(): the class attribute contains the given substring.
print(page.xpath("//*[contains(@class, 'item-inactive')]"))

# starts-with(): the class attribute begins with the given prefix.
print(page.xpath("//*[starts-with(@class, 'item-inactive')]"))


# Axes
# ancestor: every ancestor element of the first <li>.
print(page.xpath('//li[1]/ancestor::*'))
# attribute: every attribute of the first <li>.
print(page.xpath('//li[1]/attribute::*'))
# child: <a> children of the first <li> whose href is "link1.html".
print(page.xpath('//li[1]/child::a[@href="link1.html"]'))
# descendant: <span> elements nested anywhere inside the fourth <li>.
print(page.xpath('//li[4]/descendant::span'))
# following: the second element after the fourth <li> in document order.
print(page.xpath('//li[4]/following::*[2]'))
# following-sibling: sibling elements that come after the fourth <li>.
print(page.xpath('//li[4]/following-sibling::*'))
0 commit comments