Coverage for src/crawler/by_source/kobe_archive_crawler.py

1import logging

3import regex

4from bs4 import BeautifulSoup

5from ptf.model_data import create_abstract, create_articledata

7from crawler.abstract_crawlers.matching_crawler import MatchingCrawler

8from crawler.utils import regex_to_dict

10logger = logging.getLogger(__name__)

class KobeArchiveCrawler(MatchingCrawler):
    """Crawler for the Kobe University digital archive (da.lib.kobe-u.ac.jp).

    The three ``parse_*`` methods each receive the HTML of one page level:

    * the collection page, whose volume links carry a ``categoryClick(...)``
      ``onclick`` handler holding the issue id;
    * an issue page, whose article links carry a ``detailClick('kernel', ...)``
      ``onclick`` handler holding the article id;
    * an article page, read through its citation meta tags plus the
      "Abstract (free license)" table row.
    """

    source_name = "Kobe University Repository"
    source_domain = "KOBE_UNIVERSITY_DIGITAL_ARCHIVE"
    source_website = "https://da.lib.kobe-u.ac.jp/da/kernel/cate_browse/?lang=1&codeno=002&schemaid=30000&catecode=002100"

    # URL templates, filled in with str.format().
    issue_url = "https://da.lib.kobe-u.ac.jp/da/kernel/search/{issue_id}/?lang=1&cate_schema=30000&mode=0&cflg=1&codeno=002"
    article_url = "https://da.lib.kobe-u.ac.jp/da/kernel/{article_id}/?lang=1&mode=0&opkey=R178100158613792&idx=1&chk_schema=30000&cate_schema=30000&cflg=1&codeno=002&fc_val=&chk_st=0&check=00"
    # Same search replayed with disp_cnt=500 so that a paginated issue is
    # listed on a single page.
    article_url_paginate = "https://da.lib.kobe-u.ac.jp/da/kernel/search/simple/?lang=1&mode=0&opkey={opkey}&chk_schema=30000&list_sort=8&disp_cnt=500&con_kywd=&cate_schema=30000&fc_val=&req=dispchg&chk_st=20&check=00000000000000"

    # Patterns applied to ``onclick`` handlers, volume labels and hrefs.
    issue_category_click_re = r"categoryClick\('(?P<issue_id>\d+)'[^)]*\)"
    article_detail_click_re = r"detailClick\('kernel', '(?P<article_id>[^']+)"
    # Volume label: "<volume>巻（<year>）" (full-width parentheses).
    volume_re = r"(?P<volume>\d+)巻\uff08(?P<year>\d+)\uff09"
    # Fallback for labels spanning two years: "<volume>巻（<year>～<year_next>）".
    volume_re_error = r"(?P<volume>\d+)巻\uff08(?P<year>\d+)～(?P<year_next>\d+)\uff09"
    opkey_re = r"opkey=(?P<opkey>[^&]+)"

    def parse_collection_content(self, content):
        """Build one issue per volume link found on the collection page.

        Args:
            content: HTML of the collection page.

        Returns:
            The list of objects returned by ``self.create_xissue``, one per
            volume link for which the issue could be created.

        Raises:
            ValueError: if a volume link has no usable ``onclick`` handler.
                A volume label matching neither volume pattern is presumably
                also reported as ``ValueError`` by ``regex_to_dict`` (the
                fallback below relies on that) -- confirm against the helper.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        for volume_tag in soup.select(
            "div.cate_info div.cate_child div.cate_main div.cate_name a"
        ):
            volume_text = volume_tag.text

            # Try the single-year label first, then the two-year variant.
            try:
                volume_group = regex_to_dict(
                    self.volume_re, volume_text, error_msg="Couldn't parse volume number & year"
                )
            except ValueError:
                volume_group = regex_to_dict(
                    self.volume_re_error,
                    volume_text,
                    error_msg="Couldn't parse volume number & year",
                )

            # Validate before searching: a missing attribute or a handler that
            # does not match would otherwise surface as TypeError/AttributeError.
            volume_onclick = volume_tag.get("onclick")
            if not isinstance(volume_onclick, str):
                raise ValueError("Couldn't find issue_id")
            issue_match = regex.search(self.issue_category_click_re, volume_onclick)
            if not issue_match:
                raise ValueError("Couldn't find issue_id")
            issue_id = issue_match.group("issue_id")

            # Best effort: a volume whose issue cannot be created is skipped.
            try:
                xissue = self.create_xissue(
                    self.issue_url.format(issue_id=issue_id),
                    volume_group.get("year"),
                    volume_number=volume_group.get("volume"),
                    issue_number="1",
                )
            except TypeError:
                logger.warning("can't create issue (volume label: %r)", volume_text)
                continue
            xissues.append(xissue)

        return xissues

    def _expanded_issue_url(self, soup, xissue):
        """Return the URL listing up to 500 articles of a paginated issue.

        The ``opkey`` query parameter is read from the first book-cover link
        of the result list and injected into ``article_url_paginate``.

        Raises:
            ValueError: if no ``opkey`` can be extracted from the page.
        """
        opkey_tag = soup.select_one(
            "div.search-result-left div.result-add-info p.result-book-cover a"
        )
        opkey_href = opkey_tag.get("href") if opkey_tag is not None else None
        opkey_match = (
            regex.search(self.opkey_re, opkey_href) if isinstance(opkey_href, str) else None
        )
        if not opkey_match:
            raise ValueError(f"Couldn't find opkey for issue {xissue.url}")
        return self.article_url_paginate.format(opkey=opkey_match.group("opkey"))

    def parse_issue_content(self, content, xissue):
        """Append one article stub (url + pid) per article link to ``xissue``.

        When the page shows a pager, the listing is downloaded again with 500
        results per page and that expanded page is parsed instead.

        Args:
            content: HTML of the issue page.
            xissue: issue object; its ``articles`` list is extended in place
                and its ``url`` is used in error messages.

        Raises:
            ValueError: if the ``opkey`` of a paginated issue, or the
                ``onclick`` handler / id of an article link, cannot be parsed.
        """
        pager_selector = "div.search-results-pager ul.pagination li"
        soup = BeautifulSoup(content, "html.parser")

        # Get 500 articles on single page if issue has pagination
        if soup.select_one(pager_selector):
            content = self.download_file(self._expanded_issue_url(soup, xissue))
            soup = BeautifulSoup(content, "html.parser")
            # Re-fetch only once: looping while a pager is present would never
            # terminate if the expanded page is itself paginated.
            if soup.select_one(pager_selector):
                logger.warning(
                    "Issue %s is still paginated with 500 results per page; "
                    "only the first page is parsed",
                    xissue.url,
                )

        for article_tag in soup.select(
            "ul li div.search-result-right div.search-result-table p a"
        ):
            article_onclick = article_tag.get("onclick")
            if not isinstance(article_onclick, str):
                raise ValueError(f"Couldn't parse article url for issue {xissue.url}")

            article_match = regex.search(self.article_detail_click_re, article_onclick)
            if not article_match:
                raise ValueError(f"Couldn't parse article id for issue {xissue.url}")
            article_id = article_match.group("article_id")

            xarticle = create_articledata()
            xarticle.url = self.article_url.format(article_id=article_id)
            xarticle.pid = "a_" + article_id
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Fill ``xarticle`` from the article page and return it.

        Metadata is delegated to ``get_metadata_using_citation_meta``; the
        abstract is taken from the first ``td`` found under the grandparent of
        the "Abstract (free license)" label (presumably its table row).

        Args:
            content: HTML of the article page.
            xissue: issue the article belongs to.
            xarticle: article object to complete.
            url: unused here; kept for the caller's signature.

        Returns:
            The same ``xarticle``, with metadata and one abstract added.

        Raises:
            ValueError: if the abstract label or its cell is not on the page.
        """
        soup = BeautifulSoup(content, "html.parser")

        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "pdf",
                "title",
                "author",
                "page",
                "doi",
                "publisher",
                "lang",
                "reference",
            ],
        )

        # ``string=`` is the current spelling of the deprecated ``text=`` filter.
        abstract_labels = soup.find_all(string="Abstract (free license)")
        abstract_cell = None
        if abstract_labels:
            label_row = abstract_labels[0].parent.parent
            if label_row is not None:
                abstract_cell = label_row.select_one("td")
        if abstract_cell is None:
            raise ValueError(f"Couldn't parse abstracts for article {xarticle.url}")

        xarticle.abstracts.append(
            create_abstract(lang=xarticle.lang, value_tex=abstract_cell.text)
        )
        return xarticle

Coverage for src / crawler / by_source / kobe_archive_crawler.py: 22%

78 statements