Coverage for src/crawler/by_source/dmlbul_crawler.py: 96%

81 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

  1  import json
  2
  3  import regex
  4  from ptf.model_data import (
  5      ArticleData,
  6      IssueData,
  7      create_abstract,
  8      create_articledata,
  9      create_contributor,
 10      create_subj,
 11  )
 12
 13  from crawler.base_crawler import BaseCollectionCrawler
 14  from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
 15
 16
 17  # TODO : PID FCAA_2006_9_2_a5 : errata on the same page as article (2 pdfs)
 18  # Handle this by creating another article as an errata https://www.numdam.org/item/SEDP_1971-1972____A1_0/
 19  class DmlbulCrawler(BaseCollectionCrawler):
 20      source_name = "Bulgarian Digital Mathematics Library"
 21      source_domain = "DMLBUL"
 22      source_website = "https://buldml.math.bas.bg"
 23
 24      # Regexes to detect issue metadata, from most specific to least
 25      issue_regexes: list[str] = [
 26          # MEM
 27          r"[\w ]+, Vol\. (?P<volume>\d+), No (?P<number>[\d\-\w]+), \((?P<year>\d+)\)(?:, +(?P<pagestart>\d+)p-(?P<pageend>\d+)p)?",
 28          r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?), (?P<pagestart>\d+)(?: ?[\-−–] ?(?P<pageend>\d+))?\.",
 29          # SMJ2
 30          r"Volume (?P<volume>\d+), Number (?P<number>[\d\w\-]+), (?P<year>\d+), pp. (?P<pagestart>\d+)[\-−](?P<pageend>\d+)",
 31          r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?).",
 32      ]
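These patterns are tried in order until one matches (see the loop in extract_issue_list below). A minimal sketch of how the third pattern (SMJ2) decomposes a citation; the citation string here is made up for illustration, not taken from the live API:

    import regex

    pattern = (
        r"Volume (?P<volume>\d+), Number (?P<number>[\d\w\-]+), "
        r"(?P<year>\d+), pp. (?P<pagestart>\d+)[\-−](?P<pageend>\d+)"
    )
    m = regex.search(pattern, "Volume 12, Number 3, 2004, pp. 101-120")
    assert m is not None
    print(m.groupdict())
    # {'volume': '12', 'number': '3', 'year': '2004',
    #  'pagestart': '101', 'pageend': '120'}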

 33
 34      def parse_collection_content(self, content):
 35          """
 36          Parse the collection JSON from the DMLBUL API and return a list of xissues.
 37          Each xissue has its pid/volume/number/year metadata + its url.
 38          """
 39          data = json.loads(content)
 40          id = data["_id"]
 41          articles = self.query_article_list(id)
 42          issues = self.extract_issue_list(articles)
 43          return issues

 44
 45      def query_article_list(self, id: str):
 46          articles_data = []
 47          count = float("inf")
 48          while len(articles_data) < count:
 49              request = self.session.post(
 50                  "https://buldml.math.bas.bg/api/object/search",
 51                  json={
 52                      "search": [
 53                          {"match": {"_t": "ct:Item", "category._rr": id, "*oid": ["category._rr"]}}
 54                      ],
 55                      "skip": len(articles_data),
 56                      "limit": min(200, count - len(articles_data)),
 57                      "add": {"identifier": "$identifier", "_alias": "$_alias"},
 58                      "sort": {"issue-date.year": 1, "_id": 1},
 59                  },
 60              )
 61              str_data = self.decode_response(request)
 62              data = json.loads(str_data)
 63              count = data["count"]
 64              articles_data.extend(data["data"])
 65          return articles_data
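query_article_list is plain offset pagination: skip advances by the number of records already collected, and the server-reported count bounds the loop. A self-contained sketch of the same pattern, with a stubbed fetch function standing in for the HTTP call (the stub and its data are hypothetical):

    def fetch_page(skip: int, limit: int) -> dict:
        # Stub standing in for the POST to /api/object/search.
        records = [{"n": i} for i in range(450)]
        return {"count": len(records), "data": records[skip : skip + limit]}

    def fetch_all(page_size: int = 200) -> list[dict]:
        items: list[dict] = []
        count = float("inf")
        while len(items) < count:
            page = fetch_page(skip=len(items), limit=page_size)
            count = page["count"]  # server-reported total
            items.extend(page["data"])
            if not page["data"]:  # avoid an infinite loop if the server under-reports
                break
        return items

    assert len(fetch_all()) == 450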

 66
 67      def extract_issue_list(self, article_list: list[dict]):
 68          issues: dict[str, IssueData] = {}
 69          for _a in article_list:
 70              article = create_articledata()
 71              issue_str = cleanup_str(_a["identifier"]["citation"]["_v"])
 72
 73              issue_re = None
 74              re_index = 0
 75              MAX_RETRIES = len(self.issue_regexes)
 76              while issue_re is None and re_index < MAX_RETRIES:
 77                  issue_re = regex.search(self.issue_regexes[re_index], issue_str)
 78                  re_index += 1
 79
 80              if not issue_re:
 81                  self.logger.error(
 82                      f"[{self.source_domain}] {self.collection_id} : Article cannot be parsed",
 83                      extra={"input": _a},
 84                  )
 85                  continue
 86              issue_dict = issue_re.groupdict()
 87
 88              if issue_dict.get("pagestart", None):
 89                  article.fpage = issue_dict["pagestart"]
 90              if issue_dict.get("pageend", None):
 91                  article.lpage = issue_dict["pageend"]
 92
 93              article.url = "https://buldml.math.bas.bg/api/object/a/" + _a["_alias"]["_v"]
 94
 95              pid = self.get_issue_pid(
 96                  self.collection_id,
 97                  issue_dict["year"],
 98                  issue_dict["volume"],
 99                  issue_dict.get("number", None),
100              )
101              if pid in issues:
102                  issue = issues[pid]
103              else:
104                  issue = self.create_xissue(
105                      None, issue_dict["year"], issue_dict["volume"], issue_dict.get("number", None)
106                  )
107                  issues[pid] = issue
108
109              article.pid = "a" + str(len(issues[pid].articles))
110              issue.articles.append(article)
111          return list(issues.values())
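The method groups articles under a shared issue pid with the usual dict-of-lists pattern, numbering articles a0, a1, ... within each issue. A reduced sketch of that grouping (the pids and titles are hypothetical):

    from collections import defaultdict

    records = [
        {"pid": "COLL_2004_12_3", "title": "A"},
        {"pid": "COLL_2004_12_3", "title": "B"},
        {"pid": "COLL_2005_13_1", "title": "C"},
    ]
    issues: dict[str, list[dict]] = defaultdict(list)
    for rec in records:
        rec["article_pid"] = "a" + str(len(issues[rec["pid"]]))  # a0, a1, ... per issue
        issues[rec["pid"]].append(rec)
    assert [r["article_pid"] for r in records] == ["a0", "a1", "a0"]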

112
113      def parse_article_content(
114          self,
115          content: str,
116          xissue: IssueData,
117          xarticle: ArticleData,
118          url: str,
119      ):
120          data = json.loads(content)
121          # title
122          xarticle.title_tex = data["title"]["_v"]
123          # lang
124          xarticle.lang = data["language"]["_v"]
125          if xarticle.lang == "other":
126              xarticle.lang = "und"
127          # authors
128          for _a in data["authors"]["_v"]:
129              author = create_contributor(string_name=_a, role="author")
130              xarticle.contributors.append(author)
131          # abstracts
132          if "abstract" in data:
133              xarticle.abstracts.append(
134                  create_abstract(
135                      lang=xarticle.lang, value_tex=data["abstract"]["_v"], tag="abstract"
136                  )
137              )
138          # keywords
139          if "subject" in data:  # ↛ 144: never jumped to line 144 (condition always true)
140              for _k in data["subject"]["_v"]:
141                  keyword = create_subj(lang=xarticle.lang, value=_k)
142                  xarticle.kwds.append(keyword)
143          # pdf
144          if not xarticle.url:  # ↛ 145: never jumped to line 145 (condition never true)
145              raise ValueError(
146                  f"[{self.source_domain}] {self.collection_id} : Article doesn't have a URL"
147              )
148          if "uri" in data["identifier"]:
149              url = data["identifier"]["uri"]["_v"]
150          else:
151              url = "https://hdl.handle.net/10525/" + data["_alias"]["_v"]
152              self.logger.info(
153                  f"Article URL not found, falling back to handle.net. Please check link validity: {url}",
154                  extra={"pid": xarticle.pid},
155              )
156          xarticle.url = url
157          if len(data["media"]) == 0:  # ↛ 158: never jumped to line 158 (condition never true)
158              raise ValueError(f"[{self.source_domain}] {self.collection_id} : PDF Not found")
159          for media in data["media"]:
160              add_pdf_link_to_xarticle(
161                  xarticle, "https://buldml.math.bas.bg/asset/default/" + media["file"]["_v"]
162              )
163          return xarticle
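The URL resolution above prefers the identifier.uri stored on the record and otherwise falls back to a handle.net link built from _alias. A reduced sketch of just that branch, exercised with hypothetical records:

    def resolve_url(data: dict) -> str:
        # Prefer the URI stored on the record; otherwise build a handle.net link.
        if "uri" in data["identifier"]:
            return data["identifier"]["uri"]["_v"]
        return "https://hdl.handle.net/10525/" + data["_alias"]["_v"]

    assert resolve_url(
        {"identifier": {"uri": {"_v": "https://example.org/x"}}, "_alias": {"_v": "1234"}}
    ) == "https://example.org/x"
    assert resolve_url(
        {"identifier": {}, "_alias": {"_v": "1234"}}
    ) == "https://hdl.handle.net/10525/1234"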