Coverage for src/crawler/by_source/mta_crawler.py: 11% (68 statements)
coverage.py v7.12.0, created at 2026-02-02 15:55 +0000
from bs4 import BeautifulSoup
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.matching_crawler import MatchingCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


# Issue titles in the archive look like:
#   volume 10 issue 2 (2025)
#   volume 1 No 1 (2016)
class MtaCrawler(MatchingCrawler):
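    """Crawler for the journal "Minimax Theory and its Applications" (https://journalmta.com).

    The site appears to be an OJS installation (``obj_issue_summary`` / ``obj_article_summary``
    markup), so parsing is done with CSS selectors over the rendered HTML pages.
    """
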
    source_name = "Minimax Theory and its Applications website"
    source_domain = "MTA"
    source_website = "https://journalmta.com"

    def parse_collection_content(self, content):
        """Parse the issues archive page and return the list of xissues."""
        xissues = []
        soup = BeautifulSoup(content, "html5lib")
        xissues_tags = soup.select(".issues_archive .obj_issue_summary .title")
        for tag in xissues_tags:
            issue_dict = regex_to_dict(
                pattern=r"volume (?P<volume>\d+) (issue|No) (?P<issue>\d+) \((?P<year>\d{4})\)",
                value=cleanup_str(tag.text),
            )
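            # Illustrative result (assuming regex_to_dict maps the named groups to a dict,
            # as the lookups below imply):
            #   "volume 10 issue 2 (2025)" -> {"volume": "10", "issue": "2", "year": "2025"}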
            xissue = self.create_xissue(
                self.get_str_attr(tag, "href"),
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict["issue"],
            )
            xissues.append(xissue)
        return xissues

    def parse_issue_content(self, content, xissue):
        """Parse an issue page: publication date and the list of article summaries."""
        soup = BeautifulSoup(content, "html5lib")

        # published
        published_tag = soup.select_one(".published .value")
        if published_tag:
            xissue.date_published = cleanup_str(published_tag.text)

        for i, article_tag in enumerate(soup.select(".articles .obj_article_summary .title a")):
            xarticle = create_articledata()
            xarticle.pid = f"a{i}"
            xarticle.url = self.get_str_attr(article_tag, "href")
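
            # Page range as displayed on the summary, split on an en dash
            # (e.g. "1–12" -> fpage "1", lpage "12"; the example is illustrative).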
            pages_tag = article_tag.select_one(".pages")
            if pages_tag:
                pages = pages_tag.text.split("–")
                if len(pages) < 2:
                    raise ValueError(f"Couldn't parse pages: {pages_tag.text}")
                xarticle.fpage = pages[0]
                xarticle.lpage = pages[1]
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Parse an article page: title, authors, keywords, abstract and PDF link."""
        # TODO: use citation_meta instead?
        soup = BeautifulSoup(content, "html5lib")
        soup = soup.select_one(".obj_article_details")
        if not soup:
            raise ValueError("Couldn't find article page")

        # title
        title_tag = soup.select_one(".page_title")
        if not title_tag:
            raise ValueError("Couldn't find article title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        # authors
        authors_tag = soup.select_one(".authors .name")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        for author in cleanup_str(authors_tag.text).split(", "):
            xarticle.contributors.append(create_contributor(role="author", string_name=author))

        # keywords
        keywords_tag = soup.select_one(".keywords .value")
        if not keywords_tag:
            raise ValueError("Couldn't find keywords")
        for kwd in cleanup_str(keywords_tag.text).removesuffix(".").split(", "):
            xarticle.kwds.append(create_subj(value=kwd))

        # abstract
        abstract_tag = soup.select_one(".abstract")
        if not abstract_tag:
            raise ValueError("Couldn't find abstract")
        abstract_header = abstract_tag.select_one(".label")
        if abstract_header:
            abstract_header.decompose()
        # test = CkeditorParser(
        #     html_value=abstract_tag.text,
        #     mml_formulas="",
        # )

        # abstract = create_abstract(
        #     lang="en",
        #     value_xml=get_abstract_xml(test.value_xml, lang="en"),
        #     value_tex=test.value_tex,
        #     value_html=test.value_html,
        # )
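        # Fallback: the abstract is stored as plain text only; the commented-out
        # CkeditorParser path above would instead produce XML/TeX/HTML variants.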
        xarticle.abstracts.append(create_abstract(value_tex=cleanup_str(abstract_tag.text)))

        pdf_tag = soup.select_one(".pdf")
        if not pdf_tag:
            self.logger.warning(f"Couldn't find pdf for {xarticle.url}")
            return
        pdf_url = self.get_str_attr(pdf_tag, "href")
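        # The ".pdf" link points at the article viewer page (.../article/view/...);
        # rewriting "view" to "download" is assumed to give the direct PDF URL.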
        pdf_url = pdf_url.replace("article/view/", "article/download/")
        add_pdf_link_to_xarticle(xarticle, pdf_url)

        return xarticle