Coverage for src/crawler/by_source/dmlbul_crawler.py: 96%

81 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-08-29 13:43 +0000

1import json 

2 

3import regex 

4from ptf.model_data import ( 

5 ArticleData, 

6 IssueData, 

7 create_abstract, 

8 create_articledata, 

9 create_contributor, 

10 create_subj, 

11) 

12 

13from crawler.base_crawler import BaseCollectionCrawler 

14from crawler.utils import add_pdf_link_to_xarticle, cleanup_str 

15 

16 

17# TODO : PID FCAA_2006_9_2_a5 : errata on the same page as article (2 pdfs) 

18# Handle this by creating another article as an errata https://www.numdam.org/item/SEDP_1971-1972____A1_0/ 

class DmlbulCrawler(BaseCollectionCrawler):
    """Crawler for the Bulgarian Digital Mathematics Library (buldml.math.bas.bg).

    Collections are fetched through the site's JSON object API: the collection
    page yields an internal object id, the search endpoint is paged through to
    list all articles, and citation strings are parsed with `issue_regexes` to
    group articles into issues.
    """

    source_name = "Bulgarian Digital Mathematics Library"
    source_domain = "DMLBUL"
    source_website = "https://buldml.math.bas.bg"

    # Regexes to detect issue metadata, from most specific to least.
    # Named groups: year (required by downstream code), volume, number,
    # pagestart, pageend.  Note the dash classes also include the Unicode
    # minus (−) and en dash (–) seen in some citations.
    issue_regexes: list[str] = [
        # MEM
        r"[\w ]+, Vol\. (?P<volume>\d+), No (?P<number>[\d\-\w]+), \((?P<year>\d+)\)(?:, +(?P<pagestart>\d+)p-(?P<pageend>\d+)p)?",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?), (?P<pagestart>\d+)(?: ?[\-−–] ?(?P<pageend>\d+))?\.",
        # SMJ2
        r"Volume (?P<volume>\d+), Number (?P<number>[\d\w\-]+), (?P<year>\d+), pp. (?P<pagestart>\d+)[\-−](?P<pageend>\d+)",
        # NOTE(review): trailing "." is an unescaped any-char — presumably a
        # literal dot was intended (r"\."); left as-is to preserve matching.
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?).",
    ]

    def parse_collection_content(self, content):
        """Parse the collection JSON page and return the list of xissues.

        Each xissue carries its pid/volume/number/year metadata; article urls
        are attached to the xarticles grouped under it.

        (Docstring previously said "Annals of Math" — a copy-paste leftover
        from another crawler; this crawler targets buldml.math.bas.bg.)
        """
        data = json.loads(content)
        # Renamed from `id` to avoid shadowing the builtin.
        collection_oid = data["_id"]
        articles = self.query_article_list(collection_oid)
        return self.extract_issue_list(articles)

    def query_article_list(self, id: str):
        """Page through the search API and return all raw article records.

        The endpoint is polled in chunks of up to 200 entries until the
        server-reported total `count` is reached.

        NOTE: parameter is named `id` (shadows the builtin) — kept unchanged
        because the name is part of the call interface.
        """
        articles_data = []
        count = float("inf")  # unknown until the first response
        while len(articles_data) < count:
            request = self.session.post(
                "https://buldml.math.bas.bg/api/object/search",
                json={
                    "search": [
                        {"match": {"_t": "ct:Item", "category._rr": id, "*oid": ["category._rr"]}}
                    ],
                    "skip": len(articles_data),
                    "limit": min(200, count - len(articles_data)),
                    "add": {"identifier": "$identifier", "_alias": "$_alias"},
                    # Stable sort so pagination windows don't shift between calls.
                    "sort": {"issue-date.year": 1, "_id": 1},
                },
            )
            str_data = self.decode_response(request)
            data = json.loads(str_data)
            count = data["count"]
            # FIX: if the API reports a count larger than it actually returns,
            # an empty page would previously spin forever; stop instead.
            if not data["data"]:
                break
            articles_data.extend(data["data"])
        return articles_data

    def extract_issue_list(self, article_list: list[dict]):
        """Group raw article records into xissues keyed by their issue pid.

        The citation string of each record is matched against `issue_regexes`
        (most specific first); records that match no regex are logged and
        skipped.
        """
        issues: dict[str, IssueData] = {}
        for _a in article_list:
            article = create_articledata()
            issue_str = cleanup_str(_a["identifier"]["citation"]["_v"])

            # First regex that matches wins (patterns are ordered most
            # specific to least).
            issue_re = None
            for pattern in self.issue_regexes:
                issue_re = regex.search(pattern, issue_str)
                if issue_re:
                    break

            if not issue_re:
                self.logger.error(
                    f"[{self.source_domain}] {self.collection_id} : Article cannot be parsed",
                    extra={"input": _a},
                )
                continue
            issue_dict = issue_re.groupdict()

            # Page range is optional in some citation formats.
            if issue_dict.get("pagestart", None):
                article.fpage = issue_dict["pagestart"]
            if issue_dict.get("pageend", None):
                article.lpage = issue_dict["pageend"]

            article.url = "https://buldml.math.bas.bg/api/object/a/" + _a["_alias"]["_v"]

            pid = self.get_issue_pid(
                self.collection_id,
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict.get("number", None),
            )
            if pid in issues:
                issue = issues[pid]
            else:
                issue = self.create_xissue(
                    None, issue_dict["year"], issue_dict["volume"], issue_dict.get("number", None)
                )
                issues[pid] = issue

            # Article pids are positional within their issue: a0, a1, ...
            article.pid = "a" + str(len(issue.articles))
            issue.articles.append(article)
        return list(issues.values())

    def parse_article_content(
        self,
        content: str,
        xissue: IssueData,
        xarticle: ArticleData,
        url: str,
    ):
        """Fill `xarticle` from the article's JSON object and return it.

        Raises ValueError when the article has no url or no PDF media.
        """
        data = json.loads(content)
        # title
        xarticle.title_tex = data["title"]["_v"]
        # lang
        xarticle.lang = data["language"]["_v"]
        if xarticle.lang == "other":
            xarticle.lang = "und"  # "undetermined" per ISO 639-2
        # authors
        for _a in data["authors"]["_v"]:
            author = create_contributor(string_name=_a, role="author")
            xarticle.contributors.append(author)
        # abstracts
        if "abstract" in data:
            xarticle.abstracts.append(
                create_abstract(lang=xarticle.lang, value_tex=data["abstract"]["_v"])
            )
        # keywords
        if "subject" in data:
            for _k in data["subject"]["_v"]:
                keyword = create_subj(lang=xarticle.lang, value=_k)
                xarticle.kwds.append(keyword)
        # pdf
        if not xarticle.url:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article doesn't have an url"
            )
        if "uri" in data["identifier"]:
            url = data["identifier"]["uri"]["_v"]
        else:
            # Fall back to a handle.net link built from the object alias.
            url = "https://hdl.handle.net/10525/" + data["_alias"]["_v"]
            self.logger.info(
                f"Article URL not found, falling back with handle.net. Please check link validity : {url}",
                extra={"pid": xarticle.pid},
            )
        xarticle.url = url
        if len(data["media"]) == 0:
            raise ValueError(f"[{self.source_domain}] {self.collection_id} : PDF Not found")
        for media in data["media"]:
            add_pdf_link_to_xarticle(
                xarticle, "https://buldml.math.bas.bg/asset/default/" + media["file"]["_v"]
            )
        return xarticle