Coverage for src/crawler/by_source/dmlbul_crawler.py: 94%

83 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

1import json 

2 

3import regex 

4from ptf.model_data import ( 

5 ArticleData, 

6 IssueData, 

7 create_abstract, 

8 create_articledata, 

9 create_contributor, 

10 create_subj, 

11) 

12 

13from crawler.base_crawler import BaseCollectionCrawler 

14from crawler.utils import add_pdf_link_to_xarticle, cleanup_str 

15 

16 

class DmlbulCrawler(BaseCollectionCrawler):
    """Crawler for the Bulgarian Digital Mathematics Library (buldml.math.bas.bg).

    Collections and articles are fetched through the site's JSON API: the
    collection object's ``_id`` is a category id, used to page through the
    article list, from which issues are reconstructed by parsing each
    article's citation string.
    """

    source_name = "Bulgarian Digital Mathematics Library"
    source_domain = "DMLBUL"
    source_website = "https://buldml.math.bas.bg"

    periode_begin = 0
    periode_end = 9999

    # Regexes to detect issue metadata, from most specific to least
    issue_regexes: list[str] = [
        # MEM
        r"[\w ]+, Vol\. (?P<volume>\d+), No (?P<number>[\d\-\w]+), \((?P<year>\d+)\)(?:, +(?P<pagestart>\d+)p-(?P<pageend>\d+)p)?",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?), (?P<pagestart>\d+)(?: ?[\-−–] ?(?P<pageend>\d+))?\.",
        # SMJ2
        r"Volume (?P<volume>\d+), Number (?P<number>[\d\w\-]+), (?P<year>\d+), pp. (?P<pagestart>\d+)[\-−](?P<pageend>\d+)",
        r"\((?P<year>\d+)\).+, (?:(?P<volume>\d+)(?:\((?P<number>[\d\w\-−–]+)\))?).",
    ]

    def parse_collection_content(self, content):
        """Parse the collection JSON and return its list of xissues.

        ``content`` is the JSON document of the collection object; its
        ``_id`` is the category id used to query the full article list,
        which is then grouped into issues.
        """
        data = json.loads(content)
        # Avoid shadowing the `id` builtin in this scope.
        category_id = data["_id"]
        articles = self.query_article_list(category_id)
        return self.extract_issue_list(articles)

    def query_article_list(self, id: str):
        """Page through the search API and return all article summaries.

        Results are requested 200 at a time (``skip``/``limit``) until the
        server-reported ``count`` is reached.
        """
        articles_data: list[dict] = []
        count = float("inf")  # unknown until the first response arrives
        while len(articles_data) < count:
            request = self.session.post(
                "https://buldml.math.bas.bg/api/object/search",
                json={
                    "search": [
                        {"match": {"_t": "ct:Item", "category._rr": id, "*oid": ["category._rr"]}}
                    ],
                    "skip": len(articles_data),
                    "limit": min(200, count - len(articles_data)),
                    "add": {"identifier": "$identifier", "_alias": "$_alias"},
                    "sort": {"issue-date.year": 1, "_id": 1},
                },
            )
            str_data = self.decode_response(request)
            data = json.loads(str_data)
            count = data["count"]
            if not data["data"]:
                # Defensive guard: if the server reports a `count` larger
                # than the rows it actually returns, an empty page would
                # otherwise make this loop re-request the same offset forever.
                break
            articles_data.extend(data["data"])
        return articles_data

    def extract_issue_list(self, article_list: list[dict]):
        """Group article summaries into issues keyed by their issue pid.

        Each article's citation string is matched against ``issue_regexes``
        (most specific first) to recover year/volume/number and an optional
        page range.

        Raises:
            ValueError: if no regex matches an article's citation string.
        """
        issues: dict[str, IssueData] = {}
        for raw_article in article_list:
            article = create_articledata()
            citation = cleanup_str(raw_article["identifier"]["citation"]["_v"])

            # Try patterns in order; stop at the first match.
            issue_re = None
            for pattern in self.issue_regexes:
                issue_re = regex.search(pattern, citation)
                if issue_re is not None:
                    break

            if not issue_re:
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Article cannot be parsed"
                )
            issue_dict = issue_re.groupdict()

            # Page range is optional (only some citation formats carry it).
            if issue_dict.get("pagestart"):
                article.fpage = issue_dict["pagestart"]
            if issue_dict.get("pageend"):
                article.lpage = issue_dict["pageend"]

            article.url = "https://buldml.math.bas.bg/api/object/a/" + raw_article["_alias"]["_v"]

            pid = self.get_issue_pid(
                self.collection_id,
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict.get("number"),
            )
            issue = issues.get(pid)
            if issue is None:
                issue = self.create_xissue(
                    None, issue_dict["year"], issue_dict["volume"], issue_dict.get("number")
                )
                issues[pid] = issue

            # Article pid is its index within the issue ("a0", "a1", ...).
            article.pid = "a" + str(len(issue.articles))
            issue.articles.append(article)
        return list(issues.values())

    def parse_article_content(
        self,
        content: str,
        xissue: IssueData,
        xarticle: ArticleData,
        url: str,
        pid: str,
    ):
        """Parse an article's JSON object into ``xarticle``.

        Fills in title, language, authors, abstract, keywords, the article
        landing URL and the PDF link(s).

        Raises:
            ValueError: if the article has no URL or no PDF media entry.
        """
        data = json.loads(content)
        xarticle.pid = pid
        # title
        xarticle.title_tex = data["title"]["_v"]
        # lang ("other" is normalized to the ISO 639-2 undetermined code)
        xarticle.lang = data["language"]["_v"]
        if xarticle.lang == "other":
            xarticle.lang = "und"
        # authors
        for name in data["authors"]["_v"]:
            xarticle.contributors.append(create_contributor(string_name=name, role="author"))
        # abstracts (optional field)
        if "abstract" in data:
            xarticle.abstracts.append(
                create_abstract(
                    lang=xarticle.lang, value_tex=data["abstract"]["_v"], tag="abstract"
                )
            )
        # keywords (optional field)
        if "subject" in data:
            for kwd in data["subject"]["_v"]:
                xarticle.kwds.append(create_subj(lang=xarticle.lang, value=kwd))
        # pdf
        if not xarticle.url:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article doesn't have an url"
            )
        if "uri" in data["identifier"]:
            url = data["identifier"]["uri"]["_v"]
        else:
            # Fall back to a handle.net URL derived from the object alias.
            url = "https://hdl.handle.net/10525/" + data["_alias"]["_v"]
            print(
                f"[{self.source_domain}] {xarticle.pid} : Article URL not found, falling back with handle.net. Please check link validity : {url}"
            )
        xarticle.url = url
        if len(data["media"]) == 0:
            raise ValueError(f"[{self.source_domain}] {self.collection_id} : PDF Not found")
        for media in data["media"]:
            add_pdf_link_to_xarticle(
                xarticle, "https://buldml.math.bas.bg/asset/default/" + media["file"]["_v"]
            )
        return xarticle