Coverage for src / crawler / by_source / kobe_archive_crawler.py: 22%

78 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-06-19 13:33 +0000

1import logging 

2 

3import regex 

4from bs4 import BeautifulSoup 

5from ptf.model_data import create_abstract, create_articledata 

6 

7from crawler.abstract_crawlers.matching_crawler import MatchingCrawler 

8from crawler.utils import regex_to_dict 

9 

10logger = logging.getLogger(__name__) 

11 

12 

class KobeArchiveCrawler(MatchingCrawler):
    """Crawler for the "Kobe University Repository" digital archive.

    The collection page lists volumes as links whose ``onclick`` handler
    carries the issue id; each issue page lists articles as links whose
    ``onclick`` handler carries the article id; article pages expose their
    metadata through citation ``<meta>`` tags plus an abstract table row.
    """

    source_name = "Kobe University Repository"
    source_domain = "KOBE_UNIVERSITY_DIGITAL_ARCHIVE"
    source_website = "https://da.lib.kobe-u.ac.jp/da/kernel/cate_browse/?lang=1&codeno=002&schemaid=30000&catecode=002100"

    # URL templates, filled with ids scraped from the ``onclick`` handlers.
    issue_url = "https://da.lib.kobe-u.ac.jp/da/kernel/search/{issue_id}/?lang=1&cate_schema=30000&mode=0&cflg=1&codeno=002"
    article_url = "https://da.lib.kobe-u.ac.jp/da/kernel/{article_id}/?lang=1&mode=0&opkey=R178100158613792&idx=1&chk_schema=30000&cate_schema=30000&cflg=1&codeno=002&fc_val=&chk_st=0&check=00"
    # Same search replayed with ``disp_cnt=500`` so a paginated issue is
    # listed on a single page.
    article_url_paginate = "https://da.lib.kobe-u.ac.jp/da/kernel/search/simple/?lang=1&mode=0&opkey={opkey}&chk_schema=30000&list_sort=8&disp_cnt=500&con_kywd=&cate_schema=30000&fc_val=&req=dispchg&chk_st=20&check=00000000000000"

    # ``categoryClick('<issue_id>', ...)`` handler on volume links.
    issue_category_click_re = r"categoryClick\('(?P<issue_id>\d+)'[^)]*\)"
    # ``detailClick('kernel', '<article_id>...`` handler on article links.
    article_detail_click_re = r"detailClick\('kernel', '(?P<article_id>[^']+)"
    # Volume label: digits, "巻", then a year between full-width parentheses.
    volume_re = r"(?P<volume>\d+)巻\uff08(?P<year>\d+)\uff09"
    # Fallback for labels holding a year range ("<year>~<year_next>").
    volume_re_error = r"(?P<volume>\d+)巻\uff08(?P<year>\d+)~(?P<year_next>\d+)\uff09"
    # ``opkey`` query parameter of a search-result link.
    opkey_re = r"opkey=(?P<opkey>[^&]+)"

    def parse_collection_content(self, content):
        """Build the list of issues advertised on the collection page.

        Args:
            content: HTML of the collection (volume listing) page.

        Returns:
            The list of objects returned by ``self.create_xissue``, one per
            volume link for which issue creation succeeded.

        Raises:
            ValueError: if a volume label matches neither volume pattern, or
                if a volume link has no usable ``categoryClick`` handler.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")

        volume_tags = soup.select("div.cate_info div.cate_child div.cate_main div.cate_name a")
        for volume_tag in volume_tags:
            # ``Tag.text`` always yields a string, so no None check is needed.
            volume_text = volume_tag.text

            try:
                volume_group = regex_to_dict(
                    self.volume_re, volume_text, error_msg="Couldn't parse volume number & year"
                )
            except ValueError:
                # Retry with the year-range variant of the label.
                volume_group = regex_to_dict(
                    self.volume_re_error,
                    volume_text,
                    error_msg="Couldn't parse volume number & year",
                )

            # Validate the attribute and the match before reading the group,
            # so a malformed link raises ValueError instead of
            # TypeError/AttributeError.
            volume_onclick = volume_tag.get("onclick")
            if not isinstance(volume_onclick, str):
                raise ValueError("Couldn't find issue_id")
            issue_match = regex.search(self.issue_category_click_re, volume_onclick)
            if not issue_match:
                raise ValueError("Couldn't find issue_id")
            issue_id = issue_match.group("issue_id")

            try:
                xissue = self.create_xissue(
                    self.issue_url.format(issue_id=issue_id),
                    volume_group.get("year"),
                    volume_number=volume_group.get("volume"),
                    issue_number="1",
                )
            except TypeError:
                # Best effort: skip this volume but leave a trace in the logs
                # (with traceback) instead of printing to stdout.
                logger.exception("can't create issue for volume %r", volume_text)
            else:
                xissues.append(xissue)

        return xissues

    def parse_issue_content(self, content, xissue):
        """Append one article stub (url + pid) per article listed in an issue.

        When the issue page is paginated, the listing is downloaded once more
        with 500 results per page and that page is parsed instead.

        Args:
            content: HTML of the issue page.
            xissue: issue object; its ``articles`` list is extended in place
                and its ``url`` is used in error messages.

        Raises:
            ValueError: if the ``opkey`` needed to expand a paginated listing
                cannot be found, or if an article link has no usable
                ``detailClick`` handler.
        """
        pager_selector = "div.search-results-pager ul.pagination li"
        soup = BeautifulSoup(content, "html.parser")

        if soup.select_one(pager_selector):
            opkey = self._find_opkey(soup, xissue)
            expanded_content = self.download_file(self.article_url_paginate.format(opkey=opkey))
            soup = BeautifulSoup(expanded_content, "html.parser")
            # Expand at most once: re-entering while a pager is still shown
            # would download the same page forever.
            if soup.select_one(pager_selector):
                logger.warning(
                    "Issue %s is still paginated with 500 results per page; "
                    "only the articles of the first page are parsed",
                    xissue.url,
                )

        article_tags = soup.select("ul li div.search-result-right div.search-result-table p a")
        for article_tag in article_tags:
            xarticle = create_articledata()
            article_onclick = article_tag.get("onclick")
            if not isinstance(article_onclick, str):
                raise ValueError(f"Couldn't parse article url for issue {xissue.url}")

            article_match = regex.search(self.article_detail_click_re, article_onclick)
            if not article_match:
                raise ValueError(f"Couldn't parse article id for issue {xissue.url}")
            article_id = article_match.group("article_id")

            xarticle.url = self.article_url.format(article_id=article_id)
            xarticle.pid = "a_" + article_id
            xissue.articles.append(xarticle)

    def _find_opkey(self, soup, xissue):
        """Return the ``opkey`` query value found in the first result's cover link.

        Raises:
            ValueError: if the link, its ``href`` or the ``opkey`` parameter
                is missing.
        """
        opkey_tag = soup.select_one(
            "div.search-result-left div.result-add-info p.result-book-cover a"
        )
        opkey_href = opkey_tag.get("href") if opkey_tag else None
        opkey_match = (
            regex.search(self.opkey_re, opkey_href) if isinstance(opkey_href, str) else None
        )
        if not opkey_match:
            raise ValueError(f"Couldn't find opkey for issue {xissue.url}")
        return opkey_match.group("opkey")

    def parse_article_content(self, content, xissue, xarticle, url):
        """Fill ``xarticle`` from an article page and return it.

        Metadata is read from the citation ``<meta>`` tags through
        ``get_metadata_using_citation_meta``; the abstract is taken from the
        first ``<td>`` of the row labelled "Abstract (free license)".

        Args:
            content: HTML of the article page.
            xissue: issue the article belongs to.
            xarticle: article object to complete.
            url: unused here; kept for interface compatibility.

        Returns:
            The completed ``xarticle``.

        Raises:
            ValueError: if the abstract row or its cell cannot be found.
        """
        soup = BeautifulSoup(content, "html.parser")

        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "pdf",
                "title",
                "author",
                "page",
                "doi",
                "publisher",
                "lang",
                "reference",
            ],
        )

        # ``string=`` is the current name of the deprecated ``text=`` filter.
        abstract_label = soup.find(string="Abstract (free license)")
        # The label sits two levels below the element that holds the value cell.
        label_cell = abstract_label.parent if abstract_label is not None else None
        abstract_row = label_cell.parent if label_cell is not None else None
        abstract_cell = abstract_row.select_one("td") if abstract_row is not None else None
        if abstract_cell is None:
            raise ValueError(f"Couldn't parse abstracts for article {xarticle.url}")

        xarticle.abstracts.append(
            create_abstract(lang=xarticle.lang, value_tex=abstract_cell.text)
        )
        return xarticle