# gwng / gwng-pythonweb

import re
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
from markdownify import markdownify as md


def get(url):
    """Fetch *url* and return the response body as text.

    The body is decoded with the charset declared in the response's
    Content-Type header, falling back to UTF-8 when none is declared or the
    declared name is not a codec Python knows.

    Failures are reported in-band rather than raised: an HTTP error status
    yields ``"HTTP Error <code>"`` and any other exception yields its
    ``str()``.

    Args:
        url: Address to fetch; passed to ``urllib.request.Request``.

    Returns:
        The decoded body, or an error description string on failure.
    """
    try:
        request = urllib.request.Request(url)
        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(request) as response:
            body = response.read()
            charset = response.headers.get_content_charset() or "utf-8"
        try:
            return body.decode(charset)
        except LookupError:
            # The declared charset is not a known codec; assume UTF-8.
            return body.decode("utf-8")
    except urllib.error.HTTPError as e:
        return f"HTTP Error {e.code}"
    except Exception as e:
        # Deliberate best-effort: callers receive the message as text.
        return str(e)


def htmlHandler(html):
    """Extract the page title and the article markup from an HTML document.

    Args:
        html: HTML source of the page as a string.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the whitespace-trimmed
        text of the ``<title>`` element, or ``"No Title"`` when the element
        is missing or has no text. ``content`` is the serialized first
        element matching the ``.article`` CSS selector, or the unmodified
        ``html`` when nothing matches.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Extract the title. ``Tag.string`` is None for an empty <title> or one
    # with several child nodes, so use get_text(), which always returns a
    # str, and fall back to the placeholder when nothing usable remains.
    title_tag = soup.title
    title_text = title_tag.get_text().strip() if title_tag is not None else ""
    title = title_text or "No Title"

    # Extract the .article part; fall back to the whole document.
    article_element = soup.select_one('.article')
    if article_element is not None:
        content = str(article_element)
    else:
        content = html

    return title, content


def html2markdown(html):
    """Convert an HTML fragment to Markdown, dropping scripts and styles.

    Headings are rendered in ATX style (``# Heading``) and bullet lists use
    ``-`` as the marker.

    Args:
        html: HTML markup as a string.

    Returns:
        The Markdown produced by markdownify.
    """
    # markdownify's ``strip`` option only skips converting the listed tags
    # and keeps their text, so JavaScript/CSS source would end up in the
    # output. Remove those elements, content included, before converting.
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all(['script', 'style']):
        tag.decompose()

    return md(str(soup),
              heading_style="ATX",
              bullets="-")


def postMarkdownHandler(markdown, base_url="https://www-new.gwng.edu.cn/"):
    """Rewrite Markdown image links so that they are absolute URLs.

    Every ``![alt](src)`` occurrence has its ``src`` resolved against
    *base_url*. Sources that already carry a scheme (``http:``, ``https:``,
    ``data:`` ...) are left untouched, protocol-relative sources
    (``//host/path``) inherit the scheme of *base_url*, and site-relative or
    bare relative paths are joined onto *base_url*.

    Args:
        markdown: Markdown text to process.
        base_url: URL that relative image sources are resolved against.

    Returns:
        The Markdown text with image sources made absolute.
    """
    def add_base_url(match):
        alt_text = match.group(1)
        # Markdown allows a title after the URL (`src "title"`); only the
        # part before the first space is the URL to resolve.
        img_url, sep, rest = match.group(2).partition(" ")
        absolute = urllib.parse.urljoin(base_url, img_url)
        return f'![{alt_text}]({absolute}{sep}{rest})'

    # Match the Markdown image syntax ![alt](src).
    pattern = r'!\[(.*?)\]\((.*?)\)'
    return re.sub(pattern, add_base_url, markdown)


def save(filename, content):
    """Write *content* to ``<filename>.md`` in the current directory.

    The name is made safe for use as a file name first: surrounding
    whitespace is trimmed, and path separators, characters reserved on
    common file systems, and control characters (including newlines) are
    replaced with ``_``. If nothing usable remains, ``untitled`` is used.

    Args:
        filename: Desired file name without extension, e.g. a page title.
        content: Text to write; the file is encoded as UTF-8.
    """
    # Trim first so stray whitespace around a scraped title does not end up
    # in the file name; None is treated like an empty name.
    stem = "" if filename is None else str(filename).strip()
    # Replace characters that are illegal in file names.
    stem = re.sub(r'[\\/*?:"<>|\x00-\x1f]', '_', stem)
    if not stem:
        stem = "untitled"
    with open(f"{stem}.md", "w", encoding="utf-8") as f:
        f.write(content)

# Before each run, change `url` to the address of the article to convert.
url = 'https://www-new.gwng.edu.cn/xkxy/2020/0724/c1287a48158/page.psp'

# Run the pipeline only when executed as a script, so that importing this
# module does not trigger a network request or write a file.
if __name__ == "__main__":
    # Fetch the page, pick out the title and article markup, convert the
    # article to Markdown, make image links absolute, and save the result
    # as "<title>.md".
    html = get(url)
    title, content = htmlHandler(html)
    markdown = html2markdown(content)
    markdown = postMarkdownHandler(markdown)
    save(title, markdown)