BeautifulSoup (2) Search Methods




[ HTML ]

<html>
  <head>
      <title>The Dormouse's story</title>
  </head>
  <body>
    <p class="title">
      <b>The Dormouse's story</b>
    </p>
    <p class="story">Once upon a time there were three little sisters; and their names were
      <a href="http://example.com/elsie" id="link1">Elsie</a>,
      <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
      <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
      and they lived at the bottom of a well.
    </p>
    <p class="story">
      ...
    </p>
  </body>
</html>


[ Python Code ]


=> The HTML document above is parsed into a BeautifulSoup object and stored in the variable soup.
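
A minimal sketch of that setup (the variable name html_doc and the "html.parser" choice are assumptions, not from the original post):

from bs4 import BeautifulSoup

html_doc = """
<html>
  <head>
      <title>The Dormouse's story</title>
  </head>
  <body>
    <p class="title">
      <b>The Dormouse's story</b>
    </p>
    <p class="story">Once upon a time there were three little sisters; and their names were
      <a href="http://example.com/elsie" id="link1">Elsie</a>,
      <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
      <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
      and they lived at the bottom of a well.
    </p>
    <p class="story">
      ...
    </p>
  </body>
</html>
"""

# parse the HTML above and keep the resulting BeautifulSoup object in soup
soup = BeautifulSoup(html_doc, "html.parser")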



[ Search Methods ]

tag.next_element / tag.next_elements

>>> soup.find(class_="story")                # the first tag whose class is "story" (a p tag)

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" id="link1">Elsie</a>,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>


>>> soup.find(class_="story").next_element            # the single next element after the p tag

'Once upon a time there were three little sisters; and their names were\n'


>>> list(soup.find(class_="story").next_elements)            # every element that comes after the p tag

['Once upon a time there were three little sisters; and their names were\n',<a href="http://example.com/elsie" id="link1">Elsie</a>, 'Elsie', ',\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

'Lacie', ' and\n', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, 'Tillie', ';\nand they lived at the bottom of a well.', '\n', <p class="story">...</p>, '...', '\n']


.find() / .find_all()

find() => returns the first match, or None if nothing matches

find_all() => returns a list of matches, or [] if nothing matches
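
For example, searching for a tag name that does not appear in the document (the "table" tag below is just an arbitrary example) shows the difference:

>>> print(soup.find("table"))            # no match: find() gives None
None

>>> soup.find_all("table")               # no match: find_all() gives an empty list
[]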

find_all() can also search with regular expressions (import re is required!)


>>> import re

>>> soup.find_all(["a","b"])                # tags whose name is a or b

[<b>The Dormouse's story</b>, <a href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.find_all(class_=re.compile("^s"))    # tags whose class starts with "s"

[ <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" id="link1">Elsie</a>,<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;and they lived at the bottom of a well.</p>, 

 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, 

 <p class="story">...</p>]


>>> soup.find_all(re.compile("a"))            # 태그명에 a를 포함하고 있는 태그 find_all("tagName")응용

[<head><title>The Dormouse's story</title></head>,

 <a href="http://example.com/elsie" id="link1">Elsie</a>, 

 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.find_all(re.compile("^a"))        # 태그명이 a로 시작하는 태그

[<a href="http://example.com/elsie" id="link1">Elsie</a>,

 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


find_all

find_all(name, attrs, recursive, string, limit, **kwargs)

[1] name ( find_all("tagName") )

>>> soup.find_all("a")

[<a href="http://example.com/elsie" id="link1">Elsie</a>, 

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


[2] attrs

>>> soup.find_all(id="link2")            # all tags whose id is "link2"

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


>>> soup.find_all(class_="sister")       # class value=sister인 태그 모두 ( class 가 아니라 class_ 로 사용 )

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,

 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.find_all(href=re.compile("elsie"))            # all tags whose href contains "elsie"

[<a href="http://example.com/elsie" id="link1">Elsie</a>]


>>> soup.find_all("a",attrs={ "class" : "sister" })    # "a" 태그중에서 class="sister" 존재하는 모든 태그

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.find_all(attrs={ "class" : "title" })            # all tags that have class="title"

[<p class="title"><b>The Dormouse's story</b></p>]


[3] string

>>> soup.find_all("a",string="Elsie")            # a태그 중 해당 태그의 텍스트에 "Elsie" 문자가 존재하는 a태그

[<a href="http://example.com/elsie" id="link1">Elsie</a>]


>>> soup.find_all(string=["Tillie", "Elsie", "Lacie"])            # strings matching "Tillie", "Elsie", or "Lacie"

['Elsie', 'Lacie', 'Tillie']


>>> soup.find_all(string=re.compile("Dor"))            # 모든 태그 중 해당 태그의 텍스트에 "Dor" 문자를 포함하고 있는 태그

["The Dormouse's story", "The Dormouse's story"]


>>> soup.find_all(string=re.compile("^O"))            # 모든 태그 중 해당 태그의 텍스트가 "O" 문자로 시작하는 태그

['Once upon a time there were three little sisters; and their names were\n']


[4] limit

>>> soup.find_all("a")            # 존재하는 a태그 모두 반환                           

[<a href="http://example.com/elsie" id="link1">Elsie</a>,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.find_all("a",limit=2)           # a태그 검색결과 3개중 2개만 반환

[<a href="http://example.com/elsie" id="link1">Elsie</a>,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


[5] recursive

>>> soup.find_all("title")            # recursive=True ( 기본값 )

[<title>The Dormouse's story</title>]


>>> soup.find_all("title",recursive=False)            # recursive=False ( 후손이 아닌 직계 자손만 검색 )

[]
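
As a rough sketch of why: if the search starts from a tag whose direct child is <title> (soup.head here), recursive=False does find it:

>>> soup.head.find_all("title", recursive=False)            # <title> is a direct child of <head>
[<title>The Dormouse's story</title>]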



find_parent & find_parents

find_parent(name, attrs, string, **kwargs) / find_parents(name, attrs, string, limit, **kwargs)

>>> soup.find("a",string="Elsie")

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

>>> soup.find("a",string="Elsie").find_parent()                # a태그의 직계 부모 태그 검색 (한 개)              

<p class="story">Once upon a time there were three little sisters; and their names were

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well.

</p>


>>> soup.find("a",string="Elsie").find_parents()            # 해당태그의 부모태그 모두검색

[(p tag)<p class="story">Once upon a time there were three little sisters; and their names were

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>, 

(body tag)<body>

.... (omitted)

</body>,

(html tag)<html><head><title>The Dormouse's story</title></head>

<body>

...(omitted)

</body></html>, 

(the document itself)<html><head><title>The Dormouse's story</title></head>

<body>

...(omitted)

</body></html>]


find_next_sibling() & find_next_siblings()

find_next_sibling(name, attrs, string, **kwargs) / find_next_siblings(name, attrs, string, limit, **kwargs)

>>> soup.find("a")

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


>>> soup.find("a").find_next_sibling()            # 해당 태그의 형제 태그 하나 검색

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>


>>> soup.find("a").find_next_siblings()            # 해당 태그의 형제 태그 모두 검색

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


find_next_sibling() : searches siblings that come after the given tag (downward in the document)

<-> find_previous_sibling() : searches siblings that come before the given tag (upward in the document)
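
A quick sketch of find_previous_sibling(), based on the document above:

>>> soup.find("a", id="link3").find_previous_sibling("a")            # the a tag just before Tillie
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>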


find_all_next()

works by following tag.next_element repeatedly, collecting every matching element that comes after the tag
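
A rough example: starting from the first a tag, filtering the forward walk by tag name returns the remaining a tags.

>>> soup.find("a").find_all_next("a")
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]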


find_all_previous()

works by following tag.previous_element repeatedly, collecting every matching element that comes before the tag



[ CSS Selector ]

select() returns a list of every matching tag, like find_all()

>>> soup.select("a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


* nth-of-type(index)    : picks the 1st, 2nd, 3rd, ... tag of that type among several matching tags

>>> soup.select("a:nth-of-type(1)")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


>>> soup.select("a:nth-of-type(2)")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


>>> soup.select("a:nth-of-type(3)")

[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


# id 

>>> soup.select("#link1")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


# class

>>> soup.select(".sister")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


# child (direct child)

>>> soup.select("html > head > title")            # <html><head><title> ...

[<title>The Dormouse's story</title>]


>>> soup.select("body > p > #link1")            # <body><p><a id="link1">

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


# descendant


>>> soup.select("html body a")             # <html><body><p><a>

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,

 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.select("html body a:nth-of-type(1)")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


# sibling

>>> soup.select("#link1 ~ .sister")            # id= link1인 태그와 형제관계 중 sister클래스인 모든 태그

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.select("#link1 + .sister")            # id= link1인 태그와 형제관계 중 sister클래스인 첫번째 태그

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


>>> soup.select("#link1 + .sister + .sister")            # id= link1인 태그와 형제관계 중 sister클래스인 첫번째 태그의 다음형제

[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


Attribute selectors

>>> soup.select("[class~=sister]")            # class속성값에 "sister"를 포함하는 태그

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,

 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,

 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.select("a[href]")            # a 태그 중, href 속성이 존재하는 태그

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.select('a[href "http://example.com/elsie"]')            # a 태그 중, 특정값이 href속성값에 일치하는 태그 

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


>>> soup.select('a[href^="http://example.com"]')            # a tags whose href starts with the given string

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


>>> soup.select('a[href$="lacie"]')            # a tags whose href ends with the given string

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


>>> soup.select('a[href*="el"]')            # a tags whose href contains the given string

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]




[ Output ]

.strings & .stripped_strings

>>> for txt in soup.strings:
...     print(txt)


\n\n\n

The Dormouse's story

\n\n\n\n

The Dormouse's story

\n\n

Once upon a time there were three little sisters; and their names were

\n

Elsie

,

\n

Lacie

 and

\n

Tillie

;

and they lived at the bottom of a well.

\n\n

...\n\n


When the document contains a lot of whitespace (\n) like this, iterating over stripped_strings removes it.


>>> for txt in soup.stripped_strings:
...     print(txt)


The Dormouse's story

The Dormouse's story

Once upon a time there were three little sisters; and their names were

Elsie

,

Lacie

and

Tillie

;

and they lived at the bottom of a well.

...


* Applying stripped_strings (all of the text can be collected into a list)

>>> [text for text in soup.stripped_strings]

["The Dormouse's story", "The Dormouse's story", 'Once upon a time there were three little sisters; and their names were', 'Elsie', ',', 'Lacie', 'and', 'Tillie', ';\nand they lived at the bottom of a well.', '...']


.prettify()

>>> soup.prettify()            # returns the document as a string, preserving the HTML structure

'<html>\n <head>\n  <title>\n   The Dormouse\'s story\n  </title>\n </head>\n <body>\n  <p class="title">\n   <b>\n    The Dormouse\'s story\n   </b>\n  </p>\n  <p class="story">\n   Once upon a time there were three little sisters; and their names were\n   <a class="sister" href="http://example.com/elsie" id="link1">\n    Elsie\n   </a>\n   ,\n   <a class="sister" href="http://example.com/lacie" id="link2">\n    Lacie\n   </a>\n   and\n   <a class="sister" href="http://example.com/tillie" id="link3">\n    Tillie\n   </a>\n   ;\nand they lived at the bottom of a well.\n  </p>\n  <p class="story">\n   ...\n  </p>\n </body>\n</html>'


* A formatter controls how text is escaped on output. The french_soup variable below is added here for context (the original post did not define the input); with formatter="html", special characters are converted to HTML entities. The exact wrapper tags in the output depend on the parser in use.

french_soup = BeautifulSoup("<p>Il a dit &lt;&lt;Sacré bleu!&gt;&gt;</p>")
print(french_soup.prettify(formatter="html"))
# <html>
#  <body>
#   <p>
#    Il a dit &lt;&lt;Sacr&eacute; bleu!&gt;&gt;
#   </p>
#  </body>
# </html>

link_soup = BeautifulSoup('<a href="http://example.com/?foo=val1&bar=val2">A link</a>')
print(link_soup.a.encode(formatter=None))
# <a href="http://example.com/?foo=val1&bar=val2">A link</a>



[ Format ]

encode() & decode() , str()

>>> soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

>>> type(soup.a)

<class 'bs4.element.Tag'>


>>> soup.a.encode()

b'<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>'

>>> type(soup.a.encode())

<class 'bytes'>


>>> str(soup.a)

'<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>'

>>> type(str(soup.a))

<class 'str'>


>>> soup.a.decode()

'<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>'

>>> type(soup.a.decode())

<class 'str'>


.get_text()

>>> soup.get_text("★")

"\n★The Dormouse's story★\n★\n★The Dormouse's story★\n★Once upon a time there were three little sisters; and their names were\n★Elsie★,\n★Lacie★ and\n★Tillie★;\nand they lived at the bottom of a well.★\n★...★\n"

>>> soup.get_text("★",strip=True)            # 텍스트들 사이의 \n 제거

"The Dormouse's story★The Dormouse's story★Once upon a time there were three little sisters; and their names were★Elsie★,★Lacie★and★Tillie★;\nand they lived at the bottom of a well.★..."



[ Encoding ]

>>> markup = b'''
 <html>
  <head>
   <meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" />
  </head>
  <body>
   <p>Sacr\xe9 bleu!</p>
  </body>
 </html>
'''

>>> soup3 = BeautifulSoup(markup)

>>> soup3.prettify("utf-8")

b'<html>\n <head>\n  <meta content="text/html; charset=utf-8" http-equiv="Content-type"/>\n </head>\n <body>\n  <p>\n   Sacr\xc3\xa9 bleu!\n  </p>\n </body>\n</html>\n'


>>> soup3.prettify("latin-1")

b'<html>\n <head>\n  <meta content="text/html; charset=latin-1" http-equiv="Content-type"/>\n </head>\n <body>\n  <p>\n   Sacr\xe9 bleu!\n  </p>\n </body>\n</html>\n'


>>> soup3.prettify("ascii")

b'<html>\n <head>\n  <meta content="text/html; charset=ascii" http-equiv="Content-type"/>\n </head>\n <body>\n  <p>\n   Sacr&#233; bleu!\n  </p>\n </body>\n</html>\n'