Coverage for src / crawler / by_source / kobe_archive_crawler.py: 22%
78 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-06-19 13:33 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-06-19 13:33 +0000
1import logging
3import regex
4from bs4 import BeautifulSoup
5from ptf.model_data import create_abstract, create_articledata
7from crawler.abstract_crawlers.matching_crawler import MatchingCrawler
8from crawler.utils import regex_to_dict
10logger = logging.getLogger(__name__)
class KobeArchiveCrawler(MatchingCrawler):
    """Crawler for the Kobe University digital archive (da.lib.kobe-u.ac.jp).

    The collection page lists one link per volume; each volume is turned into
    a single issue (``issue_number="1"``). Issue pages list articles as links
    whose ``onclick`` handler carries the article identifier, and article
    pages expose their metadata through citation meta tags plus an
    "Abstract (free license)" table row.
    """

    source_name = "Kobe University Repository"
    source_domain = "KOBE_UNIVERSITY_DIGITAL_ARCHIVE"
    source_website = "https://da.lib.kobe-u.ac.jp/da/kernel/cate_browse/?lang=1&codeno=002&schemaid=30000&catecode=002100"

    # URL templates; the placeholders are filled with values scraped from the pages.
    issue_url = "https://da.lib.kobe-u.ac.jp/da/kernel/search/{issue_id}/?lang=1&cate_schema=30000&mode=0&cflg=1&codeno=002"
    article_url = "https://da.lib.kobe-u.ac.jp/da/kernel/{article_id}/?lang=1&mode=0&opkey=R178100158613792&idx=1&chk_schema=30000&cate_schema=30000&cflg=1&codeno=002&fc_val=&chk_st=0&check=00"
    # Same search as the issue page but with disp_cnt=500, i.e. up to 500 results per page.
    article_url_paginate = "https://da.lib.kobe-u.ac.jp/da/kernel/search/simple/?lang=1&mode=0&opkey={opkey}&chk_schema=30000&list_sort=8&disp_cnt=500&con_kywd=&cate_schema=30000&fc_val=&req=dispchg&chk_st=20&check=00000000000000"

    # Patterns applied to the JavaScript ``onclick`` handlers of volume / article links.
    issue_category_click_re = r"categoryClick\('(?P<issue_id>\d+)'[^)]*\)"
    article_detail_click_re = r"detailClick\('kernel', '(?P<article_id>[^']+)"
    # Volume label of the form "<volume>巻（<year>）".
    volume_re = r"(?P<volume>\d+)巻\uff08(?P<year>\d+)\uff09"
    # Fallback for labels that span two years: "<volume>巻（<year>~<year_next>）".
    volume_re_error = r"(?P<volume>\d+)巻\uff08(?P<year>\d+)~(?P<year_next>\d+)\uff09"
    # Search-session key embedded in result links; needed to request the 500-per-page view.
    opkey_re = r"opkey=(?P<opkey>[^&]+)"

    def parse_collection_content(self, content):
        """Create one issue per volume link found on the collection page.

        Args:
            content: HTML of the collection (category browse) page.

        Returns:
            The list of issues returned by ``self.create_xissue``, in page order.

        Raises:
            ValueError: if a volume link has no ``onclick`` handler matching
                ``issue_category_click_re``, or (propagated from
                ``regex_to_dict``) if its label matches neither volume pattern.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")

        volume_tags = soup.select("div.cate_info div.cate_child div.cate_main div.cate_name a")
        for volume_tag in volume_tags:
            volume_text = volume_tag.text

            # Try the single-year label first, then fall back to the two-year form.
            # Only "year" is used afterwards; "year_next" is ignored.
            try:
                volume_group = regex_to_dict(
                    self.volume_re, volume_text, error_msg="Couldn't parse volume number & year"
                )
            except ValueError:
                volume_group = regex_to_dict(
                    self.volume_re_error,
                    volume_text,
                    error_msg="Couldn't parse volume number & year",
                )

            # Validate before calling .group(): both a missing attribute and a
            # non-matching handler must surface as ValueError, not as
            # TypeError/AttributeError.
            volume_onclick = volume_tag.get("onclick")
            if not isinstance(volume_onclick, str):
                raise ValueError("Couldn't find issue_id")
            issue_match = regex.search(self.issue_category_click_re, volume_onclick)
            if not issue_match:
                raise ValueError("Couldn't find issue_id")
            issue_id = issue_match.group("issue_id")

            # Best effort: a volume that cannot be turned into an issue is
            # skipped, but the failure is logged with its traceback.
            try:
                xissues.append(
                    self.create_xissue(
                        self.issue_url.format(issue_id=issue_id),
                        volume_group.get("year"),
                        volume_number=volume_group.get("volume"),
                        issue_number="1",
                    )
                )
            except TypeError:
                logger.exception("can't create issue (volume label: %r)", volume_text)

        return xissues

    def parse_issue_content(self, content, xissue):
        """Append one article per result link of the issue page to ``xissue.articles``.

        When the page shows a pager, the 500-results-per-page view of the same
        search is downloaded once and parsed instead.

        Args:
            content: HTML of the issue (search results) page.
            xissue: issue object; its ``articles`` list is extended in place and
                its ``url`` is used in error messages.

        Raises:
            ValueError: if the opkey needed for the expanded view cannot be
                found, or if an article link has no parsable ``onclick`` handler.
        """
        pager_selector = "div.search-results-pager ul.pagination li"
        soup = BeautifulSoup(content, "html.parser")

        # Get 500 articles on a single page if the issue has pagination.
        if soup.select_one(pager_selector):
            opkey = self._find_opkey(soup, xissue)
            content = self.download_file(self.article_url_paginate.format(opkey=opkey))
            soup = BeautifulSoup(content, "html.parser")
            # Expand only once: re-entering on a page that still has a pager
            # would never terminate.
            if soup.select_one(pager_selector):
                logger.warning(
                    "Issue %s is still paginated after requesting 500 results per page; "
                    "only the first page is parsed",
                    xissue.url,
                )

        for article_tag in soup.select(
            "ul li div.search-result-right div.search-result-table p a"
        ):
            article_onclick = article_tag.get("onclick")
            if not isinstance(article_onclick, str):
                raise ValueError(f"Couldn't parse article url for issue {xissue.url}")

            article_match = regex.search(self.article_detail_click_re, article_onclick)
            if not article_match:
                raise ValueError(f"Couldn't parse article id for issue {xissue.url}")
            article_id = article_match.group("article_id")

            xarticle = create_articledata()
            xarticle.url = self.article_url.format(article_id=article_id)
            xarticle.pid = "a_" + article_id
            xissue.articles.append(xarticle)

    def _find_opkey(self, soup, xissue):
        """Return the ``opkey`` query value taken from the first result's cover link.

        Raises:
            ValueError: if the link, its ``href`` or the opkey is missing.
        """
        opkey_tag = soup.select_one(
            "div.search-result-left div.result-add-info p.result-book-cover a"
        )
        opkey_href = opkey_tag.get("href") if opkey_tag is not None else None
        if not isinstance(opkey_href, str):
            raise ValueError(f"Couldn't find opkey for issue {xissue.url}")

        opkey_match = regex.search(self.opkey_re, opkey_href)
        if not opkey_match:
            raise ValueError(f"Couldn't find opkey for issue {xissue.url}")
        return opkey_match.group("opkey")

    def parse_article_content(self, content, xissue, xarticle, url):
        """Fill ``xarticle`` from the article page and return it.

        Metadata is delegated to ``self.get_metadata_using_citation_meta``
        (not defined in this class; presumably inherited from
        ``MatchingCrawler``). The abstract is read from the first ``td`` of
        the table row labelled "Abstract (free license)".

        Args:
            content: HTML of the article page.
            xissue: issue the article belongs to.
            xarticle: article object to populate.
            url: unused here; kept for interface compatibility.

        Returns:
            The populated ``xarticle``.

        Raises:
            ValueError: if the abstract label or its value cell is missing.
        """
        soup = BeautifulSoup(content, "html.parser")

        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "pdf",
                "title",
                "author",
                "page",
                "doi",
                "publisher",
                "lang",
                "reference",
            ],
        )

        # ``string=`` replaces the deprecated ``text=`` keyword of find_all().
        abstract_labels = soup(string="Abstract (free license)")
        if not abstract_labels:
            raise ValueError(f"Couldn't parse abstracts for article {xarticle.url}")

        # label string -> enclosing tag -> its parent (the row searched for the value cell)
        label_tag = abstract_labels[0].parent
        row = label_tag.parent if label_tag is not None else None
        abstract_cell = row.select_one("td") if row is not None else None
        if abstract_cell is None:
            raise ValueError(f"Couldn't parse abstracts for article {xarticle.url}")

        xarticle.abstracts.append(
            create_abstract(lang=xarticle.lang, value_tex=abstract_cell.text)
        )
        return xarticle