Coverage for src/crawler/by_source/emis_hoa_crawler.py: 13%

120 statements  

coverage.py v7.9.0, created at 2025-09-16 12:41 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.jats.builder.references import get_article_title_xml, get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class Emis_hoaCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_HOA"
    source_website = "https://www.emis.de"

    issue_re = regex.compile(
        r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
    )
    doi_re = regex.compile(r"(?:doi:|http:\/\/dx\.doi\.org\/)(?P<doi>10\.[\w\.\/]+\/[\w\.\/]+)")

    verify = False

    @classmethod
    def get_view_id(cls):
        return "EMIS"

    def parse_collection_content(self, content):
        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        a_tags = soup.select("a[href^='Volume']")
        for a_tag in a_tags:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse volume link")
            xissues.extend(self.prefetch_aaa_issues(urljoin(self.collection_url, href)))
        return xissues

    # Volumes and issues are defined inside the issue pages.
    # No choice but to fetch everything in parse_collection_content.
    def prefetch_aaa_issues(self, link: str):
        """
        Parses one AAA Volume page and returns an iterable of issues
        """
        xissues_dict: dict[str, IssueData] = {}
        content = self.download_file(link)
        soup = BeautifulSoup(content, "html5lib")
        article_tags = soup.select("#ctl00_ContentPlaceHolder1_LblArticles li")
        for index, a_tag in enumerate(article_tags):
            href = a_tag.select_one("a").get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")

            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(link, href)

            issue_search = self.issue_re.search(cleanup_str(a_tag.text))
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_data = issue_search.groupdict()

            issue_pid = self.get_issue_pid(
                self.collection_id,
                issue_data["year"],
                issue_data["volume"],
                issue_data.get("number", None),
            )

            if issue_pid not in xissues_dict:
                xissues_dict[issue_pid] = self.create_xissue(
                    link,
                    year=issue_data["year"],
                    volume_number=issue_data["volume"],
                    issue_number=issue_data.get("number", None),
                )

            xissues_dict[issue_pid].articles.append(xarticle)
        return xissues_dict.values()

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html5lib")

        title_tag = soup.select_one(".middle_content h2")
        xarticle.title_tex = cleanup_str(title_tag.text)

        authors_tag = soup.select_one(".middle_content h1, .middle_content .author_gp")

        if authors_tag:
            # Remove affiliations from author links
            # RFE: parse author affiliations
            sup_tags = authors_tag.select("sup")
            for sup in sup_tags:
                sup.decompose()
            del sup_tags

            authors_str = cleanup_str(authors_tag.text)
            # str.replace returns a new string: assign the result, otherwise ", and " is never stripped
            authors_str = authors_str.replace(", and ", ", ")
            for author in authors_str.split(", "):
                if cleanup_str(author) == "":
                    raise ValueError("Invalid author")
                xarticle.contributors.append(create_contributor(role="author", string_name=author))

        doi_tag = soup.select_one(".middle_content pre")
        if doi_tag:
            doi_search = self.doi_re.search(doi_tag.text)
            if doi_search:
                doi = doi_search.group("doi")
                xarticle.doi = doi

        abstract_header = soup.select_one("h4:-soup-contains-own('Abstract')")
        if abstract_header:
            abstract_tag = abstract_header.parent.select_one("p")
            if abstract_tag:
                xarticle.abstracts.append(
                    create_abstract(
                        lang=xarticle.lang,
                        value_tex=escape(cleanup_str(abstract_tag.text)),
                    )
                )

        references_header = soup.select_one("h4:-soup-contains-own('References')")
        if references_header:
            references_tags = references_header.parent.select("ol > li")

            for ref_tag in references_tags:
                xarticle.bibitems.append(self.parse_ref(ref_tag))

        pdf_tag = soup.select_one("a.full_text_pdf")

        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse pdf href")
        add_pdf_link_to_xarticle(xarticle, urljoin(url, pdf_href))
        return xarticle

    def parse_ref(self, tag: Tag):
        value_xml = ""
        for el in tag.children:
            if isinstance(el, Tag):
                if el.name == "i":
                    value_xml += get_article_title_xml(cleanup_str(el.text))
                    continue

                if "reflinks" in (el.get("class", None) or []):
                    value_xml += self.parse_ref_reflinks(el)
                    continue
                continue

            if isinstance(el, str):
                value_xml += el
                continue

        return JatsBase.bake_ref(value_xml)

    def parse_ref_reflinks(self, reflink_tag: Tag):
        value_xml = ""
        for link in reflink_tag.children:
            if not isinstance(link, Tag):
                continue
            ref_href = link.get("href")
            if not isinstance(ref_href, str):
                continue
            ref_href = escape(ref_href)
            if ref_href.startswith("http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix(
                        "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"
                    ),
                    "zbl-item-id",
                )
                continue
            if ref_href.startswith("http://dx.doi.org/"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix("http://dx.doi.org/"),
                    "doi",
                )
                continue
            value_xml += get_ext_link_xml(
                ref_href,
                escape(cleanup_str(link.text)),
                "uri",
            )

        return value_xml
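
For reference, a minimal sketch (not part of the module above, assuming only the third-party regex package it imports) of how issue_re and doi_re resolve typical listing strings; the sample volume string and DOI are hypothetical:

import regex

issue_re = regex.compile(
    r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
)
doi_re = regex.compile(r"(?:doi:|http:\/\/dx\.doi\.org\/)(?P<doi>10\.[\w\.\/]+\/[\w\.\/]+)")

# Hypothetical listing entry; real pages follow the same "Volume N (YYYY), Issue M" shape.
issue_match = issue_re.search("Volume 2007 (2007), Issue 2")
if issue_match:
    print(issue_match.groupdict())  # {'volume': '2007', 'year': '2007', 'number': '2'}

# Hypothetical DOI string; matches either the "doi:" prefix or a dx.doi.org URL.
doi_match = doi_re.search("doi:10.1155/2007/12345")
if doi_match:
    print(doi_match.group("doi"))  # 10.1155/2007/12345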