Coverage for src/crawler/by_source/emis_aaa_crawler.py: 11%

119 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.jats.builder.citation import get_article_title_xml, get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class Emis_aaaCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_AAA"
    source_website = "https://www.emis.de"

    issue_re = regex.compile(
        r"Volume (?P<volume>\d+) \((?P<year>\d+)\)(?:, Issue (?P<number>\d+))?"
    )
    doi_re = regex.compile(r"(?:doi:|http:\/\/dx\.doi\.org\/)(?P<doi>10\.[\w\.\/]+\/[\w\.\/]+)")
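    # Illustrative note: issue_re matches headings such as "Volume 12 (2007)" or
    # "Volume 12 (2007), Issue 3" (the issue number is optional); doi_re accepts DOIs
    # given either as "doi:10.…" or as "http://dx.doi.org/10.…" links.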

    def parse_collection_content(self, content):
        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        a_tags = soup.select("a[href^='Volume']")
        for a_tag in a_tags:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse volume link")
            xissues.extend(self.prefetch_aaa_issues(urljoin(self.collection_url, href)))
        return xissues

    # Volumes and issues are defined inside the issue pages.
    # No choice but to fetch everything in parse_collection_content.
    def prefetch_aaa_issues(self, link: str):
        """
        Parses one AAA volume page and returns an iterable of issues.
        """
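        # Assumption based on the selectors and regex below: each <li> in the article
        # list links to an article page and its text carries the issue heading matched
        # by issue_re (e.g. "Volume 12 (2007), Issue 3").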

        xissues_dict: dict[str, IssueData] = {}
        content = self.download_file(link)
        soup = BeautifulSoup(content, "html5lib")
        article_tags = soup.select("#ctl00_ContentPlaceHolder1_LblArticles li")
        for index, a_tag in enumerate(article_tags):
            href = a_tag.select_one("a").get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")

            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(link, href)

            issue_search = self.issue_re.search(cleanup_str(a_tag.text))
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_data = issue_search.groupdict()

            issue_pid = self.get_issue_pid(
                self.collection_id,
                issue_data["year"],
                issue_data["volume"],
                issue_data.get("number", None),
            )

            if issue_pid not in xissues_dict:
                xissues_dict[issue_pid] = self.create_xissue(
                    link,
                    year=issue_data["year"],
                    volume_number=issue_data["volume"],
                    issue_number=issue_data.get("number", None),
                )

            xissues_dict[issue_pid].articles.append(xarticle)
        return xissues_dict.values()

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html5lib")

        title_tag = soup.select_one(".middle_content h2")
        xarticle.title_tex = cleanup_str(title_tag.text)

        authors_tag = soup.select_one(".middle_content h1, .middle_content .author_gp")

        if authors_tag:
            # Remove affiliations from author links
            # RFE: parse author affiliations
            sup_tags = authors_tag.select("sup")
            for sup in sup_tags:
                sup.decompose()
            del sup_tags

            authors_str = cleanup_str(authors_tag.text)
            # str.replace returns a new string, so the result must be assigned back
            authors_str = authors_str.replace(", and ", ", ")
            for author in authors_str.split(", "):
                if cleanup_str(author) == "":
                    raise ValueError("Invalid author")
                xarticle.contributors.append(create_contributor(role="author", string_name=author))

        doi_tag = soup.select_one(".middle_content pre")
        if doi_tag:
            doi_search = self.doi_re.search(doi_tag.text)
            if doi_search:
                doi = doi_search.group("doi")
                xarticle.doi = doi

        abstract_header = soup.select_one("h4:-soup-contains-own('Abstract')")
        if abstract_header:
            abstract_tag = abstract_header.parent.select_one("p")
            if abstract_tag:
                xarticle.abstracts.append(
                    create_abstract(
                        tag="abstract",
                        lang=xarticle.lang,
                        value_tex=escape(cleanup_str(abstract_tag.text)),
                    )
                )

        references_header = soup.select_one("h4:-soup-contains-own('References')")
        if references_header:
            references_tags = references_header.parent.select("ol > li")
            bibitems = []
            for ref_tag in references_tags:
                bibitems.append(self.parse_ref(ref_tag))
            if len(bibitems) > 0:
                xarticle.abstracts.append(JatsBase.compile_refs(bibitems))

        pdf_tag = soup.select_one("a.full_text_pdf")

        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse pdf href")
        add_pdf_link_to_xarticle(xarticle, urljoin(url, pdf_href))
        return xarticle

    def parse_ref(self, tag: Tag):
        value_xml = ""
        for el in tag.children:
            if isinstance(el, Tag):
                if el.name == "i":
                    value_xml += get_article_title_xml(cleanup_str(el.text))
                    continue

                if "reflinks" in (el.get("class", []) or []):
                    value_xml += self.parse_ref_reflinks(el)
                    continue
                continue

            if isinstance(el, str):
                value_xml += el
                continue

        return JatsBase.bake_ref(value_xml)

    def parse_ref_reflinks(self, reflink_tag: Tag):
        value_xml = ""
        for link in reflink_tag.children:
            if not isinstance(link, Tag):
                continue
            ref_href = link.get("href")
            if not isinstance(ref_href, str):
                continue
            ref_href = escape(ref_href)
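            # Known targets are mapped to typed ext-links: Zentralblatt MATH search
            # URLs become "zbl-item-id", dx.doi.org URLs become "doi", and any other
            # link is kept as a plain "uri".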

            if ref_href.startswith("http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix(
                        "http://www.zentralblatt-math.org/zmath/en/advanced/?q=an:"
                    ),
                    "zbl-item-id",
                )
                continue
            if ref_href.startswith("http://dx.doi.org/"):
                value_xml += get_ext_link_xml(
                    ref_href,
                    ref_href.removeprefix("http://dx.doi.org/"),
                    "doi",
                )
                continue
            value_xml += get_ext_link_xml(
                ref_href,
                escape(cleanup_str(link.text)),
                "uri",
            )

        return value_xml