Após a aula em que o professor demonstrou como extrair o HTML de um site com o código abaixo:
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Download the course's demo page and print its parsed HTML tree.
url = 'https://alura-site-scraping.herokuapp.com/hello-world.php'

# The context manager closes the HTTP response as soon as the body has been
# read, even if read() raises, instead of leaving the connection open.
with urlopen(url) as response:
    html = response.read()

soup = BeautifulSoup(html, 'html.parser')

# Example of narrowing the output to a single element. BeautifulSoup spells
# the CSS-class filter `class_` because `class` is a reserved word in Python
# (writing `class=` is a SyntaxError). find() returns None when nothing
# matches, so check the result before calling get_text() on it.
# print(soup.find('div', class_="dataTables_scroll").get_text())
print(soup)
me aventurei a extrair o HTML da home page de um site de competições de Magic: The Gathering:
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Download the MTG Melee home page and print its parsed HTML tree.
url = 'https://mtgmelee.com/'

# NOTE(review): on the reporter's machine this call raised
# urllib.error.URLError wrapping ssl.SSLCertVerificationError
# ("certificate verify failed: certificate has expired"). Certificate
# verification is deliberately left enabled here. If the site's certificate
# is valid in a browser, the local CA store is presumably stale -- TODO
# confirm -- and the fix is to update it or to pass
# `context=ssl.create_default_context(cafile=<up-to-date CA bundle>)` to
# urlopen(), not to switch verification off.
#
# The context manager closes the HTTP response as soon as the body has been
# read, even if read() raises, instead of leaving the connection open.
with urlopen(url) as response:
    html = response.read()

soup = BeautifulSoup(html, 'html.parser')

# Example of narrowing the output to a single element. BeautifulSoup spells
# the CSS-class filter `class_` because `class` is a reserved word in Python
# (writing `class=` is a SyntaxError). find() returns None when nothing
# matches, so check the result before calling get_text() on it.
# print(soup.find('div', class_="dataTables_scroll").get_text())
print(soup)
Entretanto, o código está retornando um erro que, pela mensagem, indica uma falha na verificação do certificado SSL do site (certificado expirado):
C:\Users\mfsra_m8bvnzn\PycharmProjects\MTGproj\venv\Scripts\python.exe C:/Users/mfsra_m8bvnzn/PycharmProjects/MTGproj/scraper.py
Traceback (most recent call last):
File "C:\Users\mfsra_m8bvnzn\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 1348, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "C:\Users\mfsra_m8bvnzn\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 1276, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\mfsra_m8bvnzn\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 1322, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "C:\Users\mfsra_m8bvnzn\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 1271, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "C:\Users\mfsra_m8bvnzn\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 1031, in _send_output
self.send(msg)
File "C:\Users\mfsra_m8bvnzn\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 969, in send
self.connect()
File "C:\Users\mfsra_m8bvnzn\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 1448, in connect
self.sock = self._context.wrap_socket(self.sock,
File "C:\Users\mfsra_m8bvnzn\AppData\Local\Programs\Python\Python310\lib\ssl.py", line 512, in wrap_socket
return self.sslsocket_class._create(
File "C:\Users\mfsra_m8bvnzn\AppData\Local\Programs\Python\Python310\lib\ssl.py", line 1070, in _create
self.do_handshake()
File "C:\Users\mfsra_m8bvnzn\AppData\Local\Programs\Python\Python310\lib\ssl.py", line 1341, in do_handshake
self._sslobj.do_handshake()
ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:997)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\mfsra_m8bvnzn\PycharmProjects\MTGproj\scraper.py", line 6, in <module>
response = urlopen(url)
File "C:\Users\mfsra_m8bvnzn\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 216, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\mfsra_m8bvnzn\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 519, in open
response = self._open(req, data)
File "C:\Users\mfsra_m8bvnzn\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 536, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "C:\Users\mfsra_m8bvnzn\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 496, in _call_chain
result = func(*args)
File "C:\Users\mfsra_m8bvnzn\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 1391, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "C:\Users\mfsra_m8bvnzn\AppData\Local\Programs\Python\Python310\lib\urllib\request.py", line 1351, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:997)>
Eu não sei como corrigir esse problema.