327 lines
12 KiB
Python
327 lines
12 KiB
Python
"""Tests for the fetch MCP server."""
|
|
|
|
import pytest
|
|
from unittest.mock import AsyncMock, patch, MagicMock
|
|
from mcp.shared.exceptions import McpError
|
|
|
|
from mcp_server_fetch.server import (
|
|
extract_content_from_html,
|
|
get_robots_txt_url,
|
|
check_may_autonomously_fetch_url,
|
|
fetch_url,
|
|
DEFAULT_USER_AGENT_AUTONOMOUS,
|
|
)
|
|
|
|
|
|
class TestGetRobotsTxtUrl:
    """Tests for get_robots_txt_url function.

    Each case feeds one page URL to ``get_robots_txt_url`` and compares the
    returned string with the robots.txt URL expected for that site.
    """

    def test_simple_url(self):
        """A single-segment path is expected to map to /robots.txt."""
        expected = "https://example.com/robots.txt"
        assert get_robots_txt_url("https://example.com/page") == expected

    def test_url_with_path(self):
        """A deep path is expected to be dropped in favour of /robots.txt."""
        page_url = "https://example.com/some/deep/path/page.html"
        expected = "https://example.com/robots.txt"
        assert get_robots_txt_url(page_url) == expected

    def test_url_with_query_params(self):
        """Query parameters are expected to be absent from the result."""
        page_url = "https://example.com/page?foo=bar&baz=qux"
        expected = "https://example.com/robots.txt"
        assert get_robots_txt_url(page_url) == expected

    def test_url_with_port(self):
        """An explicit port is expected to be kept in the result."""
        expected = "https://example.com:8080/robots.txt"
        assert get_robots_txt_url("https://example.com:8080/page") == expected

    def test_url_with_fragment(self):
        """A fragment is expected to be absent from the result."""
        expected = "https://example.com/robots.txt"
        assert get_robots_txt_url("https://example.com/page#section") == expected

    def test_http_url(self):
        """The http scheme is expected to be kept (not upgraded to https)."""
        expected = "http://example.com/robots.txt"
        assert get_robots_txt_url("http://example.com/page") == expected
|
|
|
|
|
|
class TestExtractContentFromHtml:
    """Tests for extract_content_from_html function."""

    def test_simple_html(self):
        """Paragraph text of a small, well-formed page shows up in the output."""
        page = """
        <html>
        <head><title>Test Page</title></head>
        <body>
            <article>
                <h1>Hello World</h1>
                <p>This is a test paragraph.</p>
            </article>
        </body>
        </html>
        """
        extracted = extract_content_from_html(page)
        # readabilipy may extract different parts depending on the content
        assert "test paragraph" in extracted

    def test_html_with_links(self):
        """The text of a hyperlink survives extraction.

        Only the anchor text is asserted here; the exact markdown link syntax
        in the output is not checked.
        """
        page = """
        <html>
        <body>
            <article>
                <p>Visit <a href="https://example.com">Example</a> for more.</p>
            </article>
        </body>
        </html>
        """
        extracted = extract_content_from_html(page)
        assert "Example" in extracted

    def test_empty_content_returns_error(self):
        """An empty HTML string yields output containing an ``<error>`` tag."""
        extracted = extract_content_from_html("")
        assert "<error>" in extracted
|
|
|
|
|
|
class TestCheckMayAutonomouslyFetchUrl:
    """Tests for check_may_autonomously_fetch_url function.

    Every test patches ``httpx.AsyncClient`` and stubs the client so that any
    ``get`` call returns a canned robots.txt response; no real HTTP request is
    issued by the test itself.
    """

    @staticmethod
    def _stub_robots_response(mock_client_class, status_code, text=None):
        """Wire the patched ``AsyncClient`` class to serve one canned response.

        Args:
            mock_client_class: The mock yielded by ``patch("httpx.AsyncClient")``.
            status_code: Value assigned to the fake response's ``status_code``.
            text: Optional robots.txt body. When omitted, ``text`` is left as
                the attribute MagicMock creates on first access.

        Returns:
            The ``AsyncMock`` client whose ``get`` returns the fake response.
        """
        response = MagicMock()
        response.status_code = status_code
        if text is not None:
            response.text = text

        client = AsyncMock()
        client.get = AsyncMock(return_value=response)
        # Entering the client instance with ``async with`` yields ``client``;
        # leaving it returns None so exceptions are not suppressed.
        instance = mock_client_class.return_value
        instance.__aenter__ = AsyncMock(return_value=client)
        instance.__aexit__ = AsyncMock(return_value=None)
        return client

    @pytest.mark.asyncio
    async def test_allows_when_robots_txt_404(self):
        """Test that fetching is allowed when robots.txt returns 404."""
        with patch("httpx.AsyncClient") as mock_client_class:
            self._stub_robots_response(mock_client_class, 404)

            # Should not raise
            await check_may_autonomously_fetch_url(
                "https://example.com/page",
                DEFAULT_USER_AGENT_AUTONOMOUS,
            )

    @pytest.mark.asyncio
    async def test_blocks_when_robots_txt_401(self):
        """Test that fetching is blocked when robots.txt returns 401."""
        with patch("httpx.AsyncClient") as mock_client_class:
            self._stub_robots_response(mock_client_class, 401)

            with pytest.raises(McpError):
                await check_may_autonomously_fetch_url(
                    "https://example.com/page",
                    DEFAULT_USER_AGENT_AUTONOMOUS,
                )

    @pytest.mark.asyncio
    async def test_blocks_when_robots_txt_403(self):
        """Test that fetching is blocked when robots.txt returns 403."""
        with patch("httpx.AsyncClient") as mock_client_class:
            self._stub_robots_response(mock_client_class, 403)

            with pytest.raises(McpError):
                await check_may_autonomously_fetch_url(
                    "https://example.com/page",
                    DEFAULT_USER_AGENT_AUTONOMOUS,
                )

    @pytest.mark.asyncio
    async def test_allows_when_robots_txt_allows_all(self):
        """Test that fetching is allowed when robots.txt allows all."""
        with patch("httpx.AsyncClient") as mock_client_class:
            self._stub_robots_response(
                mock_client_class, 200, text="User-agent: *\nAllow: /"
            )

            # Should not raise
            await check_may_autonomously_fetch_url(
                "https://example.com/page",
                DEFAULT_USER_AGENT_AUTONOMOUS,
            )

    @pytest.mark.asyncio
    async def test_blocks_when_robots_txt_disallows_all(self):
        """Test that fetching is blocked when robots.txt disallows all."""
        with patch("httpx.AsyncClient") as mock_client_class:
            self._stub_robots_response(
                mock_client_class, 200, text="User-agent: *\nDisallow: /"
            )

            with pytest.raises(McpError):
                await check_may_autonomously_fetch_url(
                    "https://example.com/page",
                    DEFAULT_USER_AGENT_AUTONOMOUS,
                )
|
|
|
|
|
|
class TestFetchUrl:
    """Tests for fetch_url function.

    Every test patches ``httpx.AsyncClient`` and stubs the client so that any
    ``get`` call returns a canned response built by ``_fake_response``.
    """

    @staticmethod
    def _fake_response(status_code, text=None, content_type=None):
        """Build a MagicMock standing in for an HTTP response.

        Args:
            status_code: Value assigned to ``status_code``.
            text: Optional body assigned to ``text``; left unset when None.
            content_type: Optional value for the ``content-type`` header. When
                None, ``headers`` is left unset.

        Returns:
            The configured ``MagicMock``.
        """
        response = MagicMock()
        response.status_code = status_code
        if text is not None:
            response.text = text
        if content_type is not None:
            response.headers = {"content-type": content_type}
        return response

    @staticmethod
    def _wire_client(mock_client_class, response):
        """Make the patched ``AsyncClient`` class yield a client serving ``response``.

        Args:
            mock_client_class: The mock yielded by ``patch("httpx.AsyncClient")``.
            response: Object returned by the stub client's ``get``.

        Returns:
            The ``AsyncMock`` client.
        """
        client = AsyncMock()
        client.get = AsyncMock(return_value=response)
        # Entering the client instance with ``async with`` yields ``client``;
        # leaving it returns None so exceptions are not suppressed.
        instance = mock_client_class.return_value
        instance.__aenter__ = AsyncMock(return_value=client)
        instance.__aexit__ = AsyncMock(return_value=None)
        return client

    @pytest.mark.asyncio
    async def test_fetch_html_page(self):
        """Test fetching an HTML page returns string content and an empty prefix."""
        html = """
        <html>
        <body>
            <article>
                <h1>Test Page</h1>
                <p>Hello World</p>
            </article>
        </body>
        </html>
        """
        response = self._fake_response(200, text=html, content_type="text/html")

        with patch("httpx.AsyncClient") as mock_client_class:
            self._wire_client(mock_client_class, response)

            content, prefix = await fetch_url(
                "https://example.com/page",
                DEFAULT_USER_AGENT_AUTONOMOUS,
            )

            # HTML is processed, so we check it returns something
            assert isinstance(content, str)
            assert prefix == ""

    @pytest.mark.asyncio
    async def test_fetch_html_page_raw(self):
        """Test fetching an HTML page with force_raw=True returns original HTML."""
        html_content = "<html><body><h1>Test</h1></body></html>"
        response = self._fake_response(
            200, text=html_content, content_type="text/html"
        )

        with patch("httpx.AsyncClient") as mock_client_class:
            self._wire_client(mock_client_class, response)

            content, prefix = await fetch_url(
                "https://example.com/page",
                DEFAULT_USER_AGENT_AUTONOMOUS,
                force_raw=True,
            )

            assert content == html_content
            assert "cannot be simplified" in prefix

    @pytest.mark.asyncio
    async def test_fetch_json_returns_raw(self):
        """Test fetching JSON content returns raw content."""
        json_content = '{"key": "value"}'
        response = self._fake_response(
            200, text=json_content, content_type="application/json"
        )

        with patch("httpx.AsyncClient") as mock_client_class:
            self._wire_client(mock_client_class, response)

            content, prefix = await fetch_url(
                "https://api.example.com/data",
                DEFAULT_USER_AGENT_AUTONOMOUS,
            )

            assert content == json_content
            assert "cannot be simplified" in prefix

    @pytest.mark.asyncio
    async def test_fetch_404_raises_error(self):
        """Test that 404 response raises McpError."""
        response = self._fake_response(404)

        with patch("httpx.AsyncClient") as mock_client_class:
            self._wire_client(mock_client_class, response)

            with pytest.raises(McpError):
                await fetch_url(
                    "https://example.com/notfound",
                    DEFAULT_USER_AGENT_AUTONOMOUS,
                )

    @pytest.mark.asyncio
    async def test_fetch_500_raises_error(self):
        """Test that 500 response raises McpError."""
        response = self._fake_response(500)

        with patch("httpx.AsyncClient") as mock_client_class:
            self._wire_client(mock_client_class, response)

            with pytest.raises(McpError):
                await fetch_url(
                    "https://example.com/error",
                    DEFAULT_USER_AGENT_AUTONOMOUS,
                )

    @pytest.mark.asyncio
    async def test_fetch_with_proxy(self):
        """Test that proxy URL is passed to client."""
        response = self._fake_response(
            200, text='{"data": "test"}', content_type="application/json"
        )

        with patch("httpx.AsyncClient") as mock_client_class:
            self._wire_client(mock_client_class, response)

            await fetch_url(
                "https://example.com/data",
                DEFAULT_USER_AGENT_AUTONOMOUS,
                proxy_url="http://proxy.example.com:8080",
            )

            # Verify AsyncClient was called with proxy
            # NOTE(review): this pins the keyword name ``proxies``. Because
            # AsyncClient is mocked, the test cannot detect whether the
            # installed httpx still accepts that keyword (newer releases use
            # ``proxy``) -- confirm against fetch_url and the pinned httpx.
            mock_client_class.assert_called_once_with(
                proxies="http://proxy.example.com:8080"
            )
|