Browse Source

# feat:爬取文章的脚本;
- 将对应链接的title和.article文章内容爬取保存到文件中

yang yi 1 week ago
parent
commit
7e32d7fdb5
1 changed file with 78 additions and 0 deletions
  1. 78 0
      文章/artUtil.py

+ 78 - 0
文章/artUtil.py

@@ -0,0 +1,78 @@
+import re
+import urllib.request
+from markdownify import markdownify as md
+from bs4 import BeautifulSoup
+
+
+def get(url, timeout=30):
+    """Fetch ``url`` and return the response body as text.
+
+    Args:
+        url: Address to request.
+        timeout: Seconds to wait on the connection before giving up.
+
+    Returns:
+        The decoded body on success. No exception propagates on failure:
+        the return value is ``"HTTP Error <code>"`` for an HTTP error
+        status, or ``str(exception)`` for any other error, so callers must
+        be prepared to receive an error message instead of HTML.
+    """
+    try:
+        request = urllib.request.Request(url)
+        # The context manager closes the connection even if read() fails.
+        with urllib.request.urlopen(request, timeout=timeout) as response:
+            raw = response.read()
+            declared = response.headers.get_content_charset()
+        try:
+            return raw.decode("utf-8")
+        except UnicodeDecodeError:
+            # Not UTF-8: fall back to the charset the server declared, if
+            # any; otherwise re-raise so the decode error is reported below.
+            if not declared:
+                raise
+            return raw.decode(declared)
+    except urllib.error.HTTPError as e:
+        return f"HTTP Error {e.code}"
+    except Exception as e:
+        return str(e)
+
+
+def htmlHandler(html):
+    """Extract the page title and the ``.article`` fragment from ``html``.
+
+    Args:
+        html: HTML document as a string.
+
+    Returns:
+        A ``(title, content)`` tuple. ``title`` is the stripped text of the
+        ``<title>`` element, or ``"No Title"`` when that element is missing
+        or has no text. ``content`` is the HTML of the first element
+        matching the CSS selector ``.article``, or the unmodified input
+        when nothing matches.
+    """
+    soup = BeautifulSoup(html, 'html.parser')
+
+    # get_text() rather than .string: .string is None when <title> is empty
+    # or contains nested markup, and a None title breaks file naming later.
+    title = soup.title.get_text(strip=True) if soup.title else ""
+    if not title:
+        title = "No Title"
+
+    # Keep only the article fragment when the page has one.
+    article_element = soup.select_one('.article')
+    content = str(article_element) if article_element is not None else html
+
+    return title, content
+
+
+def html2markdown(html):
+    """Convert an HTML string to Markdown using markdownify.
+
+    Headings are emitted in ATX style, list bullets are ``-``, and the
+    ``script`` and ``style`` tags are passed as markdownify's ``strip`` list.
+    """
+    options = {
+        "heading_style": "ATX",
+        "bullets": "-",
+        "strip": ['script', 'style'],
+    }
+    return md(html, **options)
+
+
+def postMarkdownHandler(markdown, base_url="https://www-new.gwng.edu.cn/"):
+    """Rewrite Markdown image links so that they are absolute URLs.
+
+    Every ``![alt](src)`` occurrence has ``src`` resolved against
+    ``base_url``. Links that are already absolute (``http://``,
+    ``https://``, ``data:`` and so on) are left untouched, protocol-relative
+    links (``//host/path``) receive the scheme of ``base_url``, and relative
+    paths are joined onto it. An optional image title after the link
+    (``![alt](src "title")``) is preserved.
+
+    Args:
+        markdown: Markdown text to process.
+        base_url: URL that relative image paths are resolved against. It
+            should end with ``/`` so that its last path segment is kept.
+
+    Returns:
+        The Markdown text with rewritten image links.
+    """
+    from urllib.parse import urljoin
+
+    def add_base_url(match):
+        alt_text = match.group(1)
+        # Separate the link target from an optional trailing title.
+        src, sep, title = match.group(2).partition(' ')
+        return f'![{alt_text}]({urljoin(base_url, src)}{sep}{title})'
+
+    # Match the Markdown image syntax ![alt](src)
+    pattern = r'!\[(.*?)\]\((.*?)\)'
+    return re.sub(pattern, add_base_url, markdown)
+
+
+def save(filename, content):
+    """Write ``content`` to ``<filename>.md`` as UTF-8 in the working directory.
+
+    The name is sanitized before use: surrounding whitespace is removed and
+    path separators, reserved characters (``\\ / * ? : " < > |``) and
+    control characters such as newlines are replaced with ``_``. When
+    nothing usable remains (``None`` or a blank name) the file is named
+    ``untitled.md``.
+
+    Args:
+        filename: Desired base name without extension, typically a page title.
+        content: Text to write.
+    """
+    import re
+    # Coerce first: the title may be None or a str subclass from the parser.
+    name = "" if filename is None else str(filename).strip()
+    # Replace characters that are illegal or unsafe in file names.
+    name = re.sub(r'[\\/*?:"<>|\x00-\x1f]', '_', name)
+    if not name:
+        name = "untitled"
+    with open(f"{name}.md", "w", encoding="utf-8") as f:
+        f.write(content)
+
+# Before each run, set url to the address of the article to download.
+url = 'https://www-new.gwng.edu.cn/xkxy/2020/0724/c1287a48158/page.psp'
+html = get(url)
+title, content = htmlHandler(html)
+# Convert the extracted HTML to Markdown, then post-process the image links.
+markdown = postMarkdownHandler(html2markdown(content))
+save(title, markdown)