Coverage for src/crawler/by_source/emis_hoa_crawler.py: 12%

120 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.jats.builder.citation import get_article_title_xml, get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class Emis_hoaCrawler(BaseCollectionCrawler):
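    """
    Crawler for collections hosted on the European Mathematical Information Service
    (https://www.emis.de).
    """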

    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_HOA"
    source_website = "https://www.emis.de"

    issue_re = regex.compile(
        r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
    )
    doi_re = regex.compile(r"(?:doi:|http:\/\/dx\.doi\.org\/)(?P<doi>10\.[\w\.\/]+\/[\w\.\/]+)")

    verify = False

    def parse_collection_content(self, content):
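        """
        Parses the collection page and collects issues from every linked volume page.
        """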

        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        a_tags = soup.select("a[href^='Volume']")
        for a_tag in a_tags:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse volume link")
            xissues.extend(self.prefetch_aaa_issues(urljoin(self.collection_url, href)))
        return xissues

    # Volumes and issues are defined inside the issue pages
    # No choice but to fetch everything in parse_collection_content
    def prefetch_aaa_issues(self, link: str):
        """
        Parses one AAA Volume page and returns an iterable of issues
        """
        xissues_dict: dict[str, IssueData] = {}
        content = self.download_file(link)
        soup = BeautifulSoup(content, "html5lib")
        article_tags = soup.select("#ctl00_ContentPlaceHolder1_LblArticles li")
        for index, a_tag in enumerate(article_tags):
            href = a_tag.select_one("a").get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")

            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(link, href)

            issue_search = self.issue_re.search(cleanup_str(a_tag.text))
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_data = issue_search.groupdict()

            issue_pid = self.get_issue_pid(
                self.collection_id,
                issue_data["year"],
                issue_data["volume"],
                issue_data.get("number", None),
            )

            if issue_pid not in xissues_dict:
                xissues_dict[issue_pid] = self.create_xissue(
                    link,
                    year=issue_data["year"],
                    volume_number=issue_data["volume"],
                    issue_number=issue_data.get("number", None),
                )

            xissues_dict[issue_pid].articles.append(xarticle)
        return xissues_dict.values()

    def parse_article_content(self, content, xissue, xarticle, url):
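        """
        Parses an article page: title, authors, DOI, abstract, references and PDF link.
        """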

        soup = BeautifulSoup(content, "html5lib")

        title_tag = soup.select_one(".middle_content h2")
        xarticle.title_tex = cleanup_str(title_tag.text)

        authors_tag = soup.select_one(".middle_content h1, .middle_content .author_gp")

        if authors_tag:
            # Remove affiliations from author links
            # RFE : parse author affiliations
            sup_tags = authors_tag.select("sup")
            for sup in sup_tags:
                sup.decompose()
            del sup_tags

            authors_str = cleanup_str(authors_tag.text)
            authors_str = authors_str.replace(", and ", ", ")
            for author in authors_str.split(", "):
                if cleanup_str(author) == "":
                    raise ValueError("Invalid author")
                xarticle.contributors.append(create_contributor(role="author", string_name=author))

        doi_tag = soup.select_one(".middle_content pre")
        if doi_tag:
            doi_search = self.doi_re.search(doi_tag.text)
            if doi_search:
                doi = doi_search.group("doi")
                xarticle.doi = doi

        abstract_header = soup.select_one("h4:-soup-contains-own('Abstract')")
        if abstract_header:
            abstract_tag = abstract_header.parent.select_one("p")
            if abstract_tag:
                xarticle.abstracts.append(
                    create_abstract(
                        tag="abstract",
                        lang=xarticle.lang,
                        value_tex=escape(cleanup_str(abstract_tag.text)),
                    )
                )

        references_header = soup.select_one("h4:-soup-contains-own('References')")
        if references_header:
            references_tags = references_header.parent.select("ol > li")
            bibitems = []
            for ref_tag in references_tags:
                bibitems.append(self.parse_ref(ref_tag))
            if len(bibitems) > 0:
                xarticle.abstracts.append(JatsBase.compile_refs(bibitems))

        pdf_tag = soup.select_one("a.full_text_pdf")

        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse pdf href")
        add_pdf_link_to_xarticle(xarticle, urljoin(url, pdf_href))
        return xarticle

    def parse_ref(self, tag: Tag):
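        """
        Serializes one bibliography <li> entry into a JATS reference.
        """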

        value_xml = ""
        for el in tag.children:
            if isinstance(el, Tag):
                if el.name == "i":
                    value_xml += get_article_title_xml(cleanup_str(el.text))
                    continue

                if "reflinks" in (el.get("class", []) or []):
                    value_xml += self.parse_ref_reflinks(el)
                    continue
                continue

            if isinstance(el, str):
                value_xml += el
                continue

        return JatsBase.bake_ref(value_xml)

    def parse_ref_reflinks(self, reflink_tag: Tag):
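        """
        Converts the external links of a reference (zbMATH, DOI or plain URL) into ext-link XML.
        """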

        value_xml = ""
        for link in reflink_tag.children:
            if not isinstance(link, Tag):
                continue
            ref_href = link.get("href")
            if not isinstance(ref_href, str):
                continue
            ref_href = escape(ref_href)
            if ref_href.startswith("http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix(
                        "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"
                    ),
                    "zbl-item-id",
                )
                continue
            if ref_href.startswith("http://dx.doi.org/"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix("http://dx.doi.org/"),
                    "doi",
                )
                continue
            value_xml += get_ext_link_xml(
                ref_href,
                escape(cleanup_str(link.text)),
                "uri",
            )

        return value_xml
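
The 12% figure suggests the parsing methods above are largely or entirely unexercised by the current test suite. Below is a minimal sketch of a fixture-based test for parse_article_content that would raise coverage, intended to run under pytest. The HTML snippet, the dummy URL, the inferred import path and the object.__new__ shortcut around the (unshown) BaseCollectionCrawler constructor are illustrative assumptions; it also assumes cleanup_str only normalizes whitespace and that add_pdf_link_to_xarticle works without network access.

from ptf.model_data import create_articledata

from crawler.by_source.emis_hoa_crawler import Emis_hoaCrawler

# Hypothetical fixture: a stripped-down article page containing only the parts
# parse_article_content looks for (title, authors, DOI block, PDF link).
ARTICLE_HTML = """
<div class="middle_content">
    <h2>A sample title</h2>
    <h1>Jane Doe, John Smith</h1>
    <pre>doi:10.1155/2010/123456</pre>
    <a class="full_text_pdf" href="123456.pdf">Full-text PDF</a>
</div>
"""


def test_parse_article_content():
    # parse_article_content only uses class-level attributes and helpers,
    # so the BaseCollectionCrawler constructor is bypassed for this sketch.
    crawler = object.__new__(Emis_hoaCrawler)
    xarticle = create_articledata()

    # xissue is unused inside parse_article_content, so None is passed.
    result = crawler.parse_article_content(
        ARTICLE_HTML, None, xarticle, "https://www.emis.de/"
    )

    assert result.title_tex == "A sample title"
    assert result.doi == "10.1155/2010/123456"
    assert len(result.contributors) == 2

A similar fixture with an 'Abstract' h4 and an ordered list of references would also cover parse_ref and parse_ref_reflinks.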