Coverage for src/crawler/by_source/emis_hoa_crawler.py: 14%

121 statements

coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.jats.builder.references import get_article_title_xml, get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import get_issue_pid
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

class Emis_hoaCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_HOA"
    source_website = "https://www.emis.de"

    issue_re = regex.compile(
        r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
    )
    doi_re = regex.compile(r"(?:doi:|http:\/\/dx\.doi\.org\/)(?P<doi>10\.[\w\.\/]+\/[\w\.\/]+)")

    verify = False
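
    # Illustrative examples of what the two patterns above are expected to capture
    # (hypothetical sample strings, not taken from the site):
    #   issue_re.search("Volume 2014 (2014), Issue 1").groupdict()
    #     -> {"volume": "2014", "year": "2014", "number": "1"}
    #   doi_re.search("doi:10.1155/2014/365025").group("doi")
    #     -> "10.1155/2014/365025"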

    @classmethod
    def get_view_id(cls):
        return "EMIS"

    def parse_collection_content(self, content):
        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        a_tags = soup.select("a[href^='Volume']")
        for a_tag in a_tags:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse volume link")
            xissues.extend(self.prefetch_aaa_issues(urljoin(self.collection_url, href)))
        return xissues

    # Volumes and issues are defined inside the issue pages
    # No choice but to fetch everything in parse_collection_content
    def prefetch_aaa_issues(self, link: str):
        """
        Parses one AAA Volume page and returns an iterable of issues
        """
        xissues_dict: dict[str, IssueData] = {}
        content = self.download_file(link)
        soup = BeautifulSoup(content, "html5lib")
        article_tags = soup.select("#ctl00_ContentPlaceHolder1_LblArticles li")
        for index, a_tag in enumerate(article_tags):
            article_link = a_tag.select_one("a")
            if article_link is None:
                raise ValueError("Couldn't find article link")
            href = article_link.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")

            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(link, href)

            issue_search = self.issue_re.search(cleanup_str(a_tag.text))
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_data = issue_search.groupdict()

            issue_pid = get_issue_pid(
                self.collection_id,
                issue_data["year"],
                issue_data["volume"],
                issue_data.get("number", None),
            )

            if issue_pid not in xissues_dict:
                xissues_dict[issue_pid] = self.create_xissue(
                    link,
                    year=issue_data["year"],
                    volume_number=issue_data["volume"],
                    issue_number=issue_data.get("number", None),
                )

            xissues_dict[issue_pid].articles.append(xarticle)
        return xissues_dict.values()

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html5lib")

        title_tag = soup.select_one(".middle_content h2")
        if title_tag is None:
            raise ValueError("Couldn't parse article title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        authors_tag = soup.select_one(".middle_content h1, .middle_content .author_gp")

        if authors_tag:
            # Remove affiliations from author links
            # RFE: parse author affiliations
            sup_tags = authors_tag.select("sup")
            for sup in sup_tags:
                sup.decompose()
            del sup_tags

            authors_str = cleanup_str(authors_tag.text)
            authors_str = authors_str.replace(", and ", ", ")
            for author in authors_str.split(", "):
                if cleanup_str(author) == "":
                    raise ValueError("Invalid author")
                xarticle.contributors.append(create_contributor(role="author", string_name=author))

        doi_tag = soup.select_one(".middle_content pre")
        if doi_tag:
            doi_search = self.doi_re.search(doi_tag.text)
            if doi_search:
                doi = doi_search.group("doi")
                xarticle.doi = doi

        abstract_header = soup.select_one("h4:-soup-contains-own('Abstract')")
        if abstract_header:
            abstract_tag = abstract_header.parent.select_one("p")
            if abstract_tag:
                xarticle.abstracts.append(
                    create_abstract(
                        lang=xarticle.lang,
                        value_tex=escape(cleanup_str(abstract_tag.text)),
                    )
                )

        references_header = soup.select_one("h4:-soup-contains-own('References')")
        if references_header:
            references_tags = references_header.parent.select("ol > li")

            for ref_tag in references_tags:
                xarticle.bibitems.append(self.parse_ref(ref_tag))

        pdf_tag = soup.select_one("a.full_text_pdf")
        if pdf_tag is None:
            raise ValueError("Couldn't find pdf link")

        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse pdf href")
        add_pdf_link_to_xarticle(xarticle, urljoin(url, pdf_href))
        return xarticle
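
    # Reference handling: each <li> in the article's reference list is walked child by
    # child below. Italic (<i>) segments are treated as the cited title, elements with
    # the "reflinks" class are turned into external links, plain text is copied through
    # as-is, and the assembled string is baked into a JATS reference by JatsBase.bake_ref.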

    def parse_ref(self, tag: Tag):
        value_xml = ""
        for el in tag.children:
            if isinstance(el, Tag):
                if el.name == "i":
                    value_xml += get_article_title_xml(cleanup_str(el.text))
                    continue

                if "reflinks" in (el.get("class", None) or []):
                    value_xml += self.parse_ref_reflinks(el)
                    continue
                continue

            if isinstance(el, str):
                value_xml += el
                continue

        return JatsBase.bake_ref(value_xml)

    def parse_ref_reflinks(self, reflink_tag: Tag):
        value_xml = ""
        for link in reflink_tag.children:
            if not isinstance(link, Tag):
                continue
            ref_href = link.get("href")
            if not isinstance(ref_href, str):
                continue
            ref_href = escape(ref_href)
            if ref_href.startswith("http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix(
                        "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"
                    ),
                    "zbl-item-id",
                )
                continue
            if ref_href.startswith("http://dx.doi.org/"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix("http://dx.doi.org/"),
                    "doi",
                )
                continue
            value_xml += get_ext_link_xml(
                ref_href,
                escape(cleanup_str(link.text)),
                "uri",
            )

        return value_xml
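
    # Summary of the branches above: Zentralblatt query URLs become "zbl-item-id"
    # ext-links, dx.doi.org URLs become "doi" ext-links, and any other href falls
    # back to a plain "uri" ext-link labelled with the link's visible text.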