Coverage for src / crawler / by_source / mta_crawler.py: 11%

68 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-02-02 15:55 +0000

1from bs4 import BeautifulSoup 

2from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj 

3 

4from crawler.matching_crawler import MatchingCrawler 

5from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict 

6 

7 

8# volume 10 issue 2 (2025) 

9# volume 1 No 1 (2016) 

class MtaCrawler(MatchingCrawler):
    """Crawler for the "Minimax Theory and its Applications" journal website.

    The site is an OJS (Open Journal Systems) instance; the CSS selectors used
    below (``obj_issue_summary``, ``obj_article_summary``,
    ``obj_article_details`` …) come from the default OJS frontend theme.
    """

    source_name = "Minimax Theory and its Applications website"
    source_domain = "MTA"
    source_website = "https://journalmta.com"

    def parse_collection_content(self, content):
        """Parse the journal archive page and return the list of issues.

        Issue titles come in two observed shapes (see examples):
        "volume 10 issue 2 (2025)" and "volume 1 No 1 (2016)" — both are
        matched by the regex below.

        :param content: HTML of the archive page.
        :return: list of xissue objects created via ``self.create_xissue``.
        """
        xissues = []
        soup = BeautifulSoup(content, "html5lib")
        xissues_tags = soup.select(".issues_archive .obj_issue_summary .title")
        for tag in xissues_tags:
            issue_dict = regex_to_dict(
                pattern=r"volume (?P<volume>\d+) (issue|No) (?P<issue>\d+) \((?P<year>\d{4})\)",
                value=cleanup_str(tag.text),
            )
            xissue = self.create_xissue(
                self.get_str_attr(tag, "href"),
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict["issue"],
            )

            xissues.append(xissue)
        return xissues

    def parse_issue_content(self, content, xissue):
        """Parse an issue's table-of-contents page.

        Fills in ``xissue.date_published`` (when present) and appends one
        xarticle per article summary, with its url and page range.

        :param content: HTML of the issue page.
        :param xissue: the xissue object to populate (mutated in place).
        :raises ValueError: when a ``.pages`` element cannot be split into
            a first/last page pair.
        """
        soup = BeautifulSoup(content, "html5lib")

        # published
        published_tag = soup.select_one(".published .value")
        if published_tag:
            xissue.date_published = cleanup_str(published_tag.text)

        # Iterate over whole article summaries rather than just the title
        # anchors: in the OJS markup ".pages" is a sibling of ".title", not a
        # descendant of the <a> tag, so searching inside the anchor (as the
        # previous version did) could never find the page range.
        for i, summary_tag in enumerate(soup.select(".articles .obj_article_summary")):
            title_link = summary_tag.select_one(".title a")
            if not title_link:
                # Summary without a title link: nothing to crawl for it.
                continue
            xarticle = create_articledata()
            xarticle.pid = f"a{i}"
            xarticle.url = self.get_str_attr(title_link, "href")

            pages_tag = summary_tag.select_one(".pages")
            if pages_tag:
                # Page ranges use an en dash, e.g. "1–23".
                pages = pages_tag.text.split("–")
                if len(pages) < 2:
                    raise ValueError(f"Couldn't parse pages : {pages_tag.text}")
                xarticle.fpage = pages[0]
                xarticle.lpage = pages[1]
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Parse an article landing page into ``xarticle``.

        Extracts title, authors, keywords, abstract and the PDF link.

        :param content: HTML of the article page.
        :param xissue: enclosing issue (unused here, kept for the crawler API).
        :param xarticle: the xarticle object to populate (mutated in place).
        :param url: article url (unused here, kept for the crawler API).
        :return: the populated ``xarticle``, or ``None`` when no PDF link is
            found (a warning is logged in that case).
        :raises ValueError: when a mandatory element (details container,
            title, authors or keywords) is missing.
        """
        # TODO : use citation_meta instead ?
        soup = BeautifulSoup(content, "html5lib")
        soup = soup.select_one(".obj_article_details")
        if not soup:
            raise ValueError("Couldn't find article page")

        # title
        title_tag = soup.select_one(".page_title")
        if not title_tag:
            raise ValueError("Couldn't find article title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        # authors — a single comma-separated string of names.
        authors_tag = soup.select_one(".authors .name")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        for author in cleanup_str(authors_tag.text).split(", "):
            xarticle.contributors.append(create_contributor(role="author", string_name=author))

        # keywords — comma-separated, usually ending with a period.
        keywords_tag = soup.select_one(".keywords .value")
        if not keywords_tag:
            raise ValueError("Couldn't find keywords")
        for kwd in cleanup_str(keywords_tag.text).removesuffix(".").split(", "):
            xarticle.kwds.append(create_subj(value=kwd))

        # abstract — drop the "Abstract" label so only the text remains.
        abstract_tag = soup.select_one(".abstract")
        if not abstract_tag:
            raise ValueError("Couldn't find abstract")
        abstract_header = abstract_tag.select_one(".label")
        if abstract_header:
            abstract_header.decompose()
        xarticle.abstracts.append(create_abstract(value_tex=cleanup_str(abstract_tag.text)))

        # pdf — OJS serves the file via ".../article/download/..." while the
        # link on the page points at ".../article/view/...".
        pdf_tag = soup.select_one(".pdf")
        if not pdf_tag:
            self.logger.warning(f"Couldn't find pdf for {xarticle.url}")
            return
        pdf_url = self.get_str_attr(pdf_tag, "href")
        pdf_url = pdf_url.replace("article/view/", "article/download/")
        add_pdf_link_to_xarticle(xarticle, pdf_url)

        return xarticle