Coverage for src / crawler / by_source / j_stage_crawler.py: 26%

67 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-06-19 13:33 +0000

1import logging 

2 

3from bs4 import BeautifulSoup 

4from ptf.model_data import create_abstract, create_articledata 

5 

6from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler 

7from crawler.utils import regex_to_dict 

8 

9logger = logging.getLogger(__name__) 

10 

11 

class JStageCrawler(BaseCollectionCrawler):
    """Crawler for the "Funkcialaj Ekvacioj" collection (source domain ``J_STAGE``).

    The class parses three kinds of pages:

    * the collection page, to list the issues (``parse_collection_content``);
    * an issue page, to list its articles (``parse_issue_content``);
    * an article page, to fill in the article metadata and abstract
      (``parse_article_content``).
    """

    source_name = "Funkcialaj Ekvacioj"
    source_domain = "J_STAGE"
    source_website = "https://www.jstage.jst.go.jp/browse/fesi/list/-char/en"

    # NOTE(review): none of the methods of this class reference the three URL
    # templates below, and they target da.lib.kobe-u.ac.jp rather than the
    # J-STAGE website above. They may be used by code outside this class or be
    # leftovers from another crawler -- confirm before removing.
    issue_url = "https://da.lib.kobe-u.ac.jp/da/kernel/search/{issue_id}/?lang=1&cate_schema=30000&mode=0&cflg=1&codeno=002"
    article_url = "https://da.lib.kobe-u.ac.jp/da/kernel/{article_id}/?lang=1&mode=0&opkey=R178100158613792&idx=1&chk_schema=30000&cate_schema=30000&cflg=1&codeno=002&fc_val=&chk_st=0&check=00"
    article_url_paginate = "https://da.lib.kobe-u.ac.jp/da/kernel/search/simple/?lang=1&mode=0&opkey={opkey}&chk_schema=30000&list_sort=8&disp_cnt=500&con_kywd=&cate_schema=30000&fc_val=&req=dispchg&chk_st=20&check=00000000000000"

    # Only ``volume_re`` and ``issue_re`` are used by the methods of this class.
    # NOTE(review): the other patterns are not referenced here -- confirm
    # whether anything outside this class still relies on them.
    issue_category_click_re = r"categoryClick\('(?P<issue_id>\d+)'[^)]*\)"
    article_detail_click_re = r"detailClick\('kernel', '(?P<article_id>[^']+)"
    volume_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)"
    issue_re = r"Issue (?P<issue>\d+) Pages (?P<page>[^-]+)"
    volume_re_error = r"(?P<volume>\d+)巻\uff08(?P<year>\d+)~(?P<year_next>\d+)\uff09"
    opkey_re = r"opkey=(?P<opkey>[^&]+)"

    def parse_collection_content(self, content):
        """Parse the collection page and return the issues it lists.

        Each volume block provides the volume number and year (from its title
        link); each link inside the block provides one issue.

        Args:
            content: HTML of the collection page.

        Returns:
            The list of objects returned by ``self.create_xissue``, one per
            issue link that could be turned into an issue.

        Raises:
            ValueError: if a volume title is empty, if an issue link has no
                ``href`` or no text, or if the issue label does not match
                ``issue_re``. ``regex_to_dict`` is also called on the volume
                title with ``volume_re``; presumably it raises ``ValueError``
                on mismatch as well -- TODO confirm.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        volumes_tag = soup.select(
            "div.facetsearch-content-wrap div.facetsearch-subheader div.facetsearch-content-innerwrap"
        )
        for volume_tag in volumes_tag:
            # The volume title lives next to the issue list, under the same parent.
            volume_title_tag = volume_tag.parent.select_one("span.facetsearch-subheader-link a")
            if not volume_title_tag:
                continue
            volume_text = volume_title_tag.text
            # ``.text`` is a string, so test for emptiness rather than ``None``.
            if not volume_text:
                raise ValueError("Couldn't parse volume tag")
            volume_group = regex_to_dict(
                self.volume_re, volume_text, error_msg="Couldn't parse volume number & year"
            )

            for issue_tag in volume_tag.select("ul li a"):
                issue_url = issue_tag.get("href")
                if not issue_url:
                    raise ValueError("Couldn't find issue url")

                issue_text = issue_tag.text
                if not issue_text:
                    raise ValueError("Couldn't parse issue number")
                try:
                    issue_group = regex_to_dict(
                        self.issue_re, issue_text, error_msg="Couldn't parse issue number"
                    )
                except ValueError as err:
                    # Keep the original failure attached for easier debugging.
                    raise ValueError("Couldn't parse issue number") from err

                try:
                    xissue = self.create_xissue(
                        issue_url,
                        volume_group.get("year"),
                        volume_number=volume_group.get("volume"),
                        issue_number=str(issue_group.get("issue")),
                    )
                except TypeError:
                    # Best effort: skip the issue, but leave a trace in the logs
                    # (with the traceback) instead of printing to stdout.
                    logger.warning("can't create issue for %s", issue_url, exc_info=True)
                    continue
                xissues.append(xissue)
        return xissues

    def parse_issue_content(self, content, xissue):
        """Append one article (with its URL) to ``xissue`` per listed result.

        Args:
            content: HTML of the issue page.
            xissue: issue object being filled; its ``articles`` list is
                extended in place and its ``url`` is used in error messages.

        Raises:
            ValueError: if a result entry has no title link, or if that link
                has no ``href``.
        """
        soup = BeautifulSoup(content, "html.parser")
        # On 12/06/2026 no managing of pagination required
        for article_tag in soup.select("ul.search-resultslisting li"):
            article_url_tag = article_tag.select_one("div.searchlist-title a")
            if not article_url_tag:
                raise ValueError(f"Couldn't parse article url for issue {xissue.url}")
            article_url = article_url_tag.get("href")
            # Same validation as for issue links: never store an empty URL.
            if not article_url:
                raise ValueError(f"Couldn't find article url for issue {xissue.url}")

            xarticle = create_articledata()
            xarticle.url = article_url
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Fill ``xarticle`` from the article page and return it.

        Metadata is extracted by ``self.get_metadata_using_citation_meta``
        (defined outside this class). The abstract, when found, is appended
        to ``xarticle.abstracts``.

        Args:
            content: HTML of the article page.
            xissue: issue the article belongs to.
            xarticle: article object to complete.
            url: URL of the article page (not used by this method).

        Returns:
            The same ``xarticle`` object, updated in place.
        """
        soup = BeautifulSoup(content, "html.parser")

        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "pdf",
                "title",
                "author",
                "doi",
                "publisher",
                "lang",
                "keywords",
                "reference",
            ],
        )

        abstract_labels = soup.find_all(string="Abstract")
        # The second "Abstract" string is the one used to locate the abstract
        # (presumably the first one is a navigation label -- TODO confirm), so
        # at least two matches are required before indexing.
        if len(abstract_labels) > 1:
            label_tag = abstract_labels[1].parent
            section_tag = label_tag.parent if label_tag is not None else None
            abstract_body = (
                section_tag.select_one("div.global-para-14") if section_tag is not None else None
            )
            if abstract_body is None:
                logger.warning("Couldn't find abstract body for article %s", url)
            else:
                xarticle.abstracts.append(
                    create_abstract(lang=xarticle.lang, value_tex=abstract_body.text)
                )
        return xarticle