2018. 1. 18. 19:51ㆍPython/Programming
[ HTML ]
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title">
<b>The Dormouse's story</b>
</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">
...
</p>
</body>
</html>
[ Python Code ]
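>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup(html, "html.parser") # html : 위 [ HTML ] 문서를 담은 문자열
=> 해당 html코드의 문서를 BeautifulSoup 객체로서 soup 변수에 저장한다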
[ 검색 메서드 ]
tag.next_element / tag.next_elements
>>> soup.find(class_="story") # class명이 story인 p태그를 찾았다
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
>>> soup.find(class_="story").next_element # p태그의 다음 요소 하나
'Once upon a time there were three little sisters; and their names were\n'
>>> list(soup.find(class_="story").next_elements) # p태그의 다음 요소 모두
['Once upon a time there were three little sisters; and their names were\n',<a href="http://example.com/elsie" id="link1">Elsie</a>, 'Elsie', ',\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
'Lacie', ' and\n', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, 'Tillie', ';\nand they lived at the bottom of a well.', '\n', <p class="story">...</p>, '...', '\n']
.find() / .find_all()
find() => result or None 반환
find_all() => result or [] 반환
find_all() 함수는 정규식을 사용하여 검색할 수 있다 ( import re 필수 !)
>>> import re
>>> soup.find_all(["a","b"]) # 태그명이 a 와 b인 태그
[<b>The Dormouse's story</b>, <a href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
>>> soup.find_all(class_=re.compile("^s")) # class명이 s로 시작하는 태그
[ <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" id="link1">Elsie</a>,<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;and they lived at the bottom of a well.</p>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>,
<p class="story">...</p>]
>>> soup.find_all(re.compile("a")) # 태그명에 a를 포함하고 있는 태그 find_all("tagName")응용
[<head><title>The Dormouse's story</title></head>,
<a href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
>>> soup.find_all(re.compile("^a")) # 태그명이 a로 시작하는 태그
[<a href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
find_all
find_all( name, attrs, recursive, string, limit, **kwargs )
[1] name ( find_all("tagName") )
>>> soup.find_all("a")
[<a href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
[2] attrs
>>> soup.find_all(id="link2") # id value=link2 인 태그 모두
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
>>> soup.find_all(class_="sister") # class value=sister인 태그 모두 ( class 가 아니라 class_ 로 사용 )
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
>>> soup.find_all(href=re.compile("elsie")) # href value에 elsie를 포함하는 태그 모두
[<a href="http://example.com/elsie" id="link1">Elsie</a>]
>>> soup.find_all("a",attrs={ "class" : "sister" }) # "a" 태그중에서 class="sister" 가 존재하는 모든 태그
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
>>> soup.find_all(attrs={ "class" : "title" }) # class="title" 이 존재하는 모든 태그
[<p class="title"><b>The Dormouse's story</b></p>]
[3] string
>>> soup.find_all("a",string="Elsie") # a태그 중 해당 태그의 텍스트에 "Elsie" 문자가 존재하는 a태그
[<a href="http://example.com/elsie" id="link1">Elsie</a>]
>>> soup.find_all(string=["Tillie", "Elsie", "Lacie"]) # 문서 내 텍스트 중 Tillie or Elsie or Lacie 와 정확히 일치하는 문자열 모두
['Elsie', 'Lacie', 'Tillie']
>>> soup.find_all(string=re.compile("Dor")) # 모든 태그 중 해당 태그의 텍스트에 "Dor" 문자를 포함하고 있는 태그
["The Dormouse's story", "The Dormouse's story"]
>>> soup.find_all(string=re.compile("^O")) # 모든 태그 중 해당 태그의 텍스트가 "O" 문자로 시작하는 태그
['Once upon a time there were three little sisters; and their names were\n']
[4] limit
>>> soup.find_all("a") # 존재하는 a태그 모두 반환
[<a href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
>>> soup.find_all("a",limit=2) # a태그 검색결과 3개중 2개만 반환
[<a href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
[5] recursive
>>> soup.find_all("title") # recursive=True ( 기본값 )
[<title>The Dormouse's story</title>]
>>> soup.find_all("title",recursive=False) # recursive=False ( 후손이 아닌 직계 자손만 검색 )
[]
find_parent & find_parents
find_parent( name, attrs, string, **kwargs ) / find_parents( name, attrs, string, limit, **kwargs )
>>> soup.find("a",string="Elsie")
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
>>> soup.find("a",string="Elsie").find_parent() # a태그의 직계 부모 태그 검색 (한 개)
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well.
</p>
>>> soup.find("a",string="Elsie").find_parents() # 해당태그의 부모태그 모두검색
[(p태그)<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>,
(body태그)<body>
.... (생략)
</body>,
(html태그)<html><head><title>The Dormouse's story</title></head>
<body>
...(생략)
</body></html>,
(DOC문서)<html><head><title>The Dormouse's story</title></head>
<body>
...(생략)
</body></html>]
find_next_sibling() & find_next_siblings()
find_next_sibling( name, attrs, string, **kwargs ) / find_next_siblings( name, attrs, string, limit, **kwargs )
>>> soup.find("a")
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
>>> soup.find("a").find_next_sibling() # 해당 태그의 형제 태그 하나 검색
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
>>> soup.find("a").find_next_siblings() # 해당 태그의 형제 태그 모두 검색
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
find_next_sibling() : 해당 태그를 기준으로 아래 방향으로 태그를 검색
<-> find_previous_sibling() : 해당 태그를 기준으로 윗 방향으로 태그를 검색
find_all_next()
tag.next_element 반복사용
find_all_previous()
tag.previous_element 반복사용
[ CSS Selector ]
select() 는 find_all() 처럼 조건에 맞는 모든 태그를 리스트로 반환한다 ( 단, 검색 조건은 CSS 선택자로 지정 )
>>> soup.select("a")
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
* nth-of-type(index) : 여러개의 태그 중에서 1,2,3 ... 번째 태그를 가져올 수 있다
>>> soup.select("a:nth-of-type(1)")
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
>>> soup.select("a:nth-of-type(2)")
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
>>> soup.select("a:nth-of-type(3)")
[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# id
>>> soup.select("#link1")
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# class
>>> soup.select(".sister")
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# 자식
>>> soup.select("html > head > title") # <html><head><title> ...
[<title>The Dormouse's story</title>]
>>> soup.select("body > p > #link1") # <body><p><a id="link1">
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# 후손
>>> soup.select("html body a") # <html><body><p><a>
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
>>> soup.select("html body a:nth-of-type(1)")
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# 형제
>>> soup.select("#link1 ~ .sister") # id= link1인 태그와 형제관계 중 sister클래스인 모든 태그
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
>>> soup.select("#link1 + .sister") # id= link1인 태그와 형제관계 중 sister클래스인 첫번째 태그
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
>>> soup.select("#link1 + .sister + .sister") # id= link1인 태그와 형제관계 중 sister클래스인 첫번째 태그의 다음형제
[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
속성 선택자 활용 ( 정규식과 비슷한 패턴 매칭 : ~= , ^= , $= , *= )
>>> soup.select("[class~=sister]") # class속성값에 "sister"를 포함하는 태그
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
>>> soup.select("a[href]") # a 태그 중, href 속성이 존재하는 태그
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
>>> soup.select('a[href = "http://example.com/elsie"]') # a 태그 중, 특정값이 href속성값에 일치하는 태그
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
>>> soup.select('a[href ^= "http://example.com"]') # a 태그 중, href속성값의 시작 문자열
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
>>> soup.select('a[href $= "lacie"]') # a태그 중, href속성값의 마지막 문자열
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
>>> soup.select('a[href *= "el"]') # a태그 중, href속성값에 특정 문자열이 존재하는지
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
[ Output ]
.strings & .stripped_strings ( 함수가 아닌 속성 )
>>> for txt in soup.strings:
print(txt)
\n\n\n
The Dormouse's story
\n\n\n\n
The Dormouse's story
\n\n
Once upon a time there were three little sisters; and their names were
\n
Elsie
,
\n
Lacie
and
\n
Tillie
;
and they lived at the bottom of a well.
\n\n
...\n\n
해당 문서에서 \n(white space)가 많이 존재할 경우 stripped_strings 속성을 사용하면 제거할 수 있다
>>> for txt in soup.stripped_strings:
print(txt)
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie
,
Lacie
and
Tillie
;
and they lived at the bottom of a well.
...
* stripped_strings 응용 ( 모든 텍스트들을 배열로 옮길 수 있다 )
>>> [text for text in soup.stripped_strings]
["The Dormouse's story", "The Dormouse's story", 'Once upon a time there were three little sisters; and their names were', 'Elsie', ',', 'Lacie', 'and', 'Tillie', ';\nand they lived at the bottom of a well.', '...']
.prettify()
>>> soup.prettify() # html형식을 살려서 옮겨 올 수 있다
'<html>\n <head>\n <title>\n The Dormouse\'s story\n </title>\n </head>\n <body>\n <p class="title">\n <b>\n The Dormouse\'s story\n </b>\n </p>\n <p class="story">\n Once upon a time there were three little sisters; and their names were\n <a class="sister" href="http://example.com/elsie" id="link1">\n Elsie\n </a>\n ,\n <a class="sister" href="http://example.com/lacie" id="link2">\n Lacie\n </a>\n and\n <a class="sister" href="http://example.com/tillie" id="link3">\n Tillie\n </a>\n ;\nand they lived at the bottom of a well.\n </p>\n <p class="story">\n ...\n </p>\n </body>\n</html>'
* html형식을 살려서 출력할 수 있다
[ Format ]
encode() & decode() , str()
>>> soup.a
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
>>> type(soup.a)
<class 'bs4.element.Tag'>
>>> soup.a.encode()
b'<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>'
>>> type(soup.a.encode())
<class 'bytes'>
>>> str(soup.a)
'<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>'
>>> type(str(soup.a))
<class 'str'>
>>> soup.a.decode()
'<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>'
>>> type(soup.a.decode())
<class 'str'>
.get_text()
>>> soup.get_text("★")
"\n★The Dormouse's story★\n★\n★The Dormouse's story★\n★Once upon a time there were three little sisters; and their names were\n★Elsie★,\n★Lacie★ and\n★Tillie★;\nand they lived at the bottom of a well.★\n★...★\n"
>>> soup.get_text("★",strip=True) # 텍스트들 사이의 \n 제거
"The Dormouse's story★The Dormouse's story★Once upon a time there were three little sisters; and their names were★Elsie★,★Lacie★and★Tillie★;\nand they lived at the bottom of a well.★..."
[ Encoding ]
>>> markup = b'''
<html>
<head>
<meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type" />
</head>
<body>
<p>Sacr\xe9 bleu!</p>
</body>
</html>
'''
>>> soup3 = BeautifulSoup(markup)
>>> soup3.prettify("utf-8")
b'<html>\n <head>\n <meta content="text/html; charset=utf-8" http-equiv="Content-type"/>\n </head>\n <body>\n <p>\n Sacr\xc3\xa9 bleu!\n </p>\n </body>\n</html>\n'
>>> soup3.prettify("latin-1")
b'<html>\n <head>\n <meta content="text/html; charset=latin-1" http-equiv="Content-type"/>\n </head>\n <body>\n <p>\n Sacr\xe9 bleu!\n </p>\n </body>\n</html>\n'
>>> soup3.prettify("ascii")
b'<html>\n <head>\n <meta content="text/html; charset=ascii" http-equiv="Content-type"/>\n </head>\n <body>\n <p>\n Sacr&#233; bleu!\n </p>\n </body>\n</html>\n'
'Python > Programming' 카테고리의 다른 글
BeautifulSoup 예제2 음원사이트 Genie 차트 순위 긁어오기 (0) | 2018.01.24 |
---|---|
Python을 이용한 이미지 다운로드 ( urlretrieve ) (1) | 2018.01.24 |
BeautifulSoup 예제1 네이버 실시간 검색어 긁어오기 (0) | 2018.01.24 |
BeautifulSoup (1) 기본 메서드 (0) | 2018.01.16 |
Python HTTP Request & Response 실습( urllib module ) (0) | 2018.01.16 |