BeautifulSoup (2) Search Methods




[ HTML ]

<html>
  <head>
      <title>The Dormouse's story</title>
  </head>
  <body>
    <p class="title">
      <b>The Dormouse's story</b>
    </p>
    <p class="story">Once upon a time there were three little sisters; and their names were
      <a href="http://example.com/elsie" id="link1">Elsie</a>,
      <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
      <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
      and they lived at the bottom of a well.
    </p>
    <p class="story">
      ...
    </p>
  </body>
</html>


[ Python Code ]


=> The HTML document above is parsed into a BeautifulSoup object and stored in the variable soup.
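
A minimal sketch of that setup (the variable name html_doc and the "html.parser" choice are assumptions, not from the original post):

from bs4 import BeautifulSoup

html_doc = """
<html>
  <head>
      <title>The Dormouse's story</title>
  </head>
  <body>
    <p class="title">
      <b>The Dormouse's story</b>
    </p>
    <p class="story">Once upon a time there were three little sisters; and their names were
      <a href="http://example.com/elsie" id="link1">Elsie</a>,
      <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
      <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
      and they lived at the bottom of a well.
    </p>
    <p class="story">
      ...
    </p>
  </body>
</html>
"""

# parse the HTML above and keep the resulting BeautifulSoup object in soup
soup = BeautifulSoup(html_doc, "html.parser")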



[ Search Methods ]

tag.next_element / tag.next_elements

>>> soup.find(class_="story")                # the first tag whose class is "story" (a p tag)

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" id="link1">Elsie</a>,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>


>>> soup.find(class_="story").next_element            # the single next element after the p tag

'Once upon a time there were three little sisters; and their names were\n'


>>> list(soup.find(class_="story").next_elements)            # every element that comes after the p tag

['Once upon a time there were three little sisters; and their names were\n',<a href="http://example.com/elsie" id="link1">Elsie</a>, 'Elsie', ',\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

'Lacie', ' and\n', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, 'Tillie', ';\nand they lived at the bottom of a well.', '\n', <p class="story">...</p>, '...', '\n']


.find() / .find_all()

find() => returns the first match, or None if nothing matches

find_all() => returns a list of matches, or [] if nothing matches
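
For example, searching for a tag name that does not appear in the document (the "table" tag below is just an arbitrary example) shows the difference:

>>> print(soup.find("table"))            # no match: find() gives None
None

>>> soup.find_all("table")               # no match: find_all() gives an empty list
[]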

find_all() can also search with regular expressions (import re is required!)


>>> import re

>>> soup.find_all(["a","b"])                # tags whose name is a or b

[<b>The Dormouse's story</b>, <a href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.find_all(class_=re.compile("^s"))    # tags whose class starts with "s"

[ <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" id="link1">Elsie</a>,<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;and they lived at the bottom of a well.</p>, 

 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, 

 <p class="story">...</p>]


>>> soup.find_all(re.compile("a"))            # 태그명에 a를 포함하고 있는 태그 find_all("tagName")응용

[<head><title>The Dormouse's story</title></head>,

 <a href="http://example.com/elsie" id="link1">Elsie</a>, 

 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.find_all(re.compile("^a"))        # 태그명이 a로 시작하는 태그

[<a href="http://example.com/elsie" id="link1">Elsie</a>,

 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


find_all

find_all(name, attrs, recursive, string, limit, **kwargs)

[1] name ( find_all("tagName") )

>>> soup.find_all("a")

[<a href="http://example.com/elsie" id="link1">Elsie</a>, 

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


[2] attrs

>>> soup.find_all(id="link2")            # all tags whose id is "link2"

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


>>> soup.find_all(class_="sister")       # class value=sister인 태그 모두 ( class 가 아니라 class_ 로 사용 )

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,

 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.find_all(href=re.compile("elsie"))            # all tags whose href contains "elsie"

[<a href="http://example.com/elsie" id="link1">Elsie</a>]


>>> soup.find_all("a",attrs={ "class" : "sister" })    # "a" 태그중에서 class="sister" 존재하는 모든 태그

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.find_all(attrs={ "class" : "title" })            # all tags that have class="title"

[<p class="title"><b>The Dormouse's story</b></p>]


[3] string

>>> soup.find_all("a",string="Elsie")            # a태그 중 해당 태그의 텍스트에 "Elsie" 문자가 존재하는 a태그

[<a href="http://example.com/elsie" id="link1">Elsie</a>]


>>> soup.find_all(string=["Tillie", "Elsie", "Lacie"])            # strings matching "Tillie", "Elsie", or "Lacie"

['Elsie', 'Lacie', 'Tillie']


>>> soup.find_all(string=re.compile("Dor"))            # 모든 태그 중 해당 태그의 텍스트에 "Dor" 문자를 포함하고 있는 태그

["The Dormouse's story", "The Dormouse's story"]


>>> soup.find_all(string=re.compile("^O"))            # 모든 태그 중 해당 태그의 텍스트가 "O" 문자로 시작하는 태그

['Once upon a time there were three little sisters; and their names were\n']


[4] limit

>>> soup.find_all("a")            # 존재하는 a태그 모두 반환                           

[<a href="http://example.com/elsie" id="link1">Elsie</a>,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.find_all("a",limit=2)           # a태그 검색결과 3개중 2개만 반환

[<a href="http://example.com/elsie" id="link1">Elsie</a>,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


[5] recursive

>>> soup.find_all("title")            # recursive=True ( 기본값 )

[<title>The Dormouse's story</title>]


>>> soup.find_all("title",recursive=False)            # recursive=False ( 후손이 아닌 직계 자손만 검색 )

[]
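
As a rough sketch of why: if the search starts from a tag whose direct child is <title> (soup.head here), recursive=False does find it:

>>> soup.head.find_all("title", recursive=False)            # <title> is a direct child of <head>
[<title>The Dormouse's story</title>]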



find_parent & find_parents

find_parent(name, attrs, string, **kwargs) / find_parents(name, attrs, string, limit, **kwargs)

>>> soup.find("a",string="Elsie")

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

>>> soup.find("a",string="Elsie").find_parent()                # a태그의 직계 부모 태그 검색 (한 개)              

<p class="story">Once upon a time there were three little sisters; and their names were

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well.

</p>


>>> soup.find("a",string="Elsie").find_parents()            # 해당태그의 부모태그 모두검색

[(p tag)<p class="story">Once upon a time there were three little sisters; and their names were

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>, 

(body tag)<body>

.... (omitted)

</body>,

(html tag)<html><head><title>The Dormouse's story</title></head>

<body>

...(omitted)

</body></html>, 

(the document itself)<html><head><title>The Dormouse's story</title></head>

<body>

...(omitted)

</body></html>]


find_next_sibling() & find_next_siblings()

find_next_sibling(name, attrs, string, **kwargs) / find_next_siblings(name, attrs, string, limit, **kwargs)

>>> soup.find("a")

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


>>> soup.find("a").find_next_sibling()            # 해당 태그의 형제 태그 하나 검색

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>


>>> soup.find("a").find_next_siblings()            # 해당 태그의 형제 태그 모두 검색

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


find_next_sibling() : searches siblings that come after the given tag (downward in the document)

<-> find_previous_sibling() : searches siblings that come before the given tag (upward in the document)
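
A quick sketch of find_previous_sibling(), based on the document above:

>>> soup.find("a", id="link3").find_previous_sibling("a")            # the a tag just before Tillie
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>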


find_all_next()

works by following tag.next_element repeatedly, collecting every matching element that comes after the tag
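
A rough example: starting from the first a tag, filtering the forward walk by tag name returns the remaining a tags.

>>> soup.find("a").find_all_next("a")
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]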


find_all_previous()

works by following tag.previous_element repeatedly, collecting every matching element that comes before the tag



[ CSS Selector ]

select() returns a list of every matching tag, like find_all()

>>> soup.select("a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


* nth-of-type(index)    : picks the 1st, 2nd, 3rd, ... tag of that type among several matching tags

>>> soup.select("a:nth-of-type(1)")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


>>> soup.select("a:nth-of-type(2)")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


>>> soup.select("a:nth-of-type(3)")

[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


# id 

>>> soup.select("#link1")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


# class

>>> soup.select(".sister")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


# child (direct child)

>>> soup.select("html > head > title")            # <html><head><title> ...

[<title>The Dormouse's story</title>]


>>> soup.select("body > p > #link1")            # <body><p><a id="link1">

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


# descendant


>>> soup.select("html body a")             # <html><body><p><a>

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,

 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.select("html body a:nth-of-type(1)")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


# sibling

>>> soup.select("#link1 ~ .sister")            # id= link1인 태그와 형제관계 중 sister클래스인 모든 태그

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.select("#link1 + .sister")            # id= link1인 태그와 형제관계 중 sister클래스인 첫번째 태그

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


>>> soup.select("#link1 + .sister + .sister")            # id= link1인 태그와 형제관계 중 sister클래스인 첫번째 태그의 다음형제

[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


Attribute selectors

>>> soup.select("[class~=sister]")            # class속성값에 "sister"를 포함하는 태그

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,

 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,

 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.select("a[href]")            # a 태그 중, href 속성이 존재하는 태그

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.select('a[href "http://example.com/elsie"]')            # a 태그 중, 특정값이 href속성값에 일치하는 태그 

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


>>> soup.select('a[href^="http://example.com"]')            # a tags whose href starts with the given string

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.select('a[href$="lacie"]')            # a tags whose href ends with the given string

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


>>> soup.select('a[href*="el"]')            # a tags whose href contains the given string

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]




[ Output ]

.strings & .stripped_strings

>>> for txt in soup.strings:
...     print(txt)


\n\n\n

The Dormouse's story

\n\n\n\n

The Dormouse's story

\n\n

Once upon a time there were three little sisters; and their names were

\n

Elsie

,

\n

Lacie

 and

\n

Tillie

;

and they lived at the bottom of a well.

\n\n

...\n\n


When the document contains a lot of whitespace (\n) like this, iterating over stripped_strings removes it.


>>> for txt in soup.stripped_strings:
...     print(txt)


The Dormouse's story

The Dormouse's story

Once upon a time there were three little sisters; and their names were

Elsie

,

Lacie

and

Tillie

;

and they lived at the bottom of a well.

...


* Applying stripped_strings (all of the text can be collected into a list)

>>> [text for text in soup.stripped_strings]

["The Dormouse's story", "The Dormouse's story", 'Once upon a time there were three little sisters; and their names were', 'Elsie', ',', 'Lacie', 'and', 'Tillie', ';\nand they lived at the bottom of a well.', '...']


.prettify()

>>> soup.prettify()            # returns the document as a string, preserving the HTML structure

'<html>\n <head>\n  <title>\n   The Dormouse\'s story\n  </title>\n </head>\n <body>\n  <p class="title">\n   <b>\n    The Dormouse\'s story\n   </b>\n  </p>\n  <p class="story">\n   Once upon a time there were three little sisters; and their names were\n   <a class="sister" href="http://example.com/elsie" id="link1">\n    Elsie\n   </a>\n   ,\n   <a class="sister" href="http://example.com/lacie" id="link2">\n    Lacie\n   </a>\n   and\n   <a class="sister" href="http://example.com/tillie" id="link3">\n    Tillie\n   </a>\n   ;\nand they lived at the bottom of a well.\n  </p>\n  <p class="story">\n   ...\n  </p>\n </body>\n</html>'


* A formatter controls how text is escaped on output. The french_soup variable below is added here for context (the original post did not define the input); with formatter="html", special characters are converted to HTML entities. The exact wrapper tags in the output depend on the parser in use.

french_soup = BeautifulSoup("<p>Il a dit &lt;&lt;Sacré bleu!&gt;&gt;</p>")
print(french_soup.prettify(formatter="html"))
# <html>
#  <body>
#   <p>
#    Il a dit &lt;&lt;Sacr&eacute; bleu!&gt;&gt;
#   </p>
#  </body>
# </html>

link_soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>')
print(link_soup.a.encode(formatter=None))
# <a href="http://example.com/?foo=val1&bar=val2">A link</a>



[ Format ]

encode() & decode() , str()

>>> soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

>>> type(soup.a)

<class 'bs4.element.Tag'>


>>> soup.a.encode()

b'<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>'

>>> type(soup.a.encode())

<class 'bytes'>


>>> str(soup.a)

'<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>'

>>> type(str(soup.a))

<class 'str'>


>>> soup.a.decode()

'<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>'

>>> type(soup.a.decode())

<class 'str'>


.get_text()

>>> soup.get_text("★")

"\n★The Dormouse's story★\n★\n★The Dormouse's story★\n★Once upon a time there were three little sisters; and their names were\n★Elsie★,\n★Lacie★ and\n★Tillie★;\nand they lived at the bottom of a well.★\n★...★\n"

>>> soup.get_text("★",strip=True)            # 텍스트들 사이의 \n 제거

"The Dormouse's story★The Dormouse's story★Once upon a time there were three little sisters; and their names were★Elsie★,★Lacie★and★Tillie★;\nand they lived at the bottom of a well.★..."



[ Encoding ]

>>> markup = b'''
 <html>
  <head>
   <meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" />
  </head>
  <body>
   <p>Sacr\xe9 bleu!</p>
  </body>
 </html>
'''

>>> soup3 = BeautifulSoup(markup)

>>> soup3.prettify("utf-8")

b'<html>\n <head>\n  <meta content="text/html; charset=utf-8" http-equiv="Content-type"/>\n </head>\n <body>\n  <p>\n   Sacr\xc3\xa9 bleu!\n  </p>\n </body>\n</html>\n'


>>> soup3.prettify("latin-1")

b'<html>\n <head>\n  <meta content="text/html; charset=latin-1" http-equiv="Content-type"/>\n </head>\n <body>\n  <p>\n   Sacr\xe9 bleu!\n  </p>\n </body>\n</html>\n'


>>> soup3.prettify("ascii")

b'<html>\n <head>\n  <meta content="text/html; charset=ascii" http-equiv="Content-type"/>\n </head>\n <body>\n  <p>\n   Sacr&#233; bleu!\n  </p>\n </body>\n</html>\n'