Coverage for src/crawler/by_source/ipb_crawler.py: 95%

50 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000
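The only misses in this run are the two defensive guards in parse_ipb_article (source lines 51 and 53), whose raise branches were never taken; a hedged test sketch that would exercise them follows the listing.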

import json

from urllib.parse import parse_qs, urlencode, urljoin, urlparse

from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_extlink,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler


class IpbCrawler(BaseCollectionCrawler):
    source_name = "International Press of Boston"
    source_domain = "IPB"
    source_website = "https://link.intlpress.com/"

    def parse_collection_content(self, content):
        """Parse the collection JSON and create one xissue per issue of each volume."""
        collection_urlparsed = urlparse(self.collection_url)
        collection_query = parse_qs(collection_urlparsed.query)

        xissues = []
        data = json.loads(content)
        for volume in data["data"]["volumes"]:
            for issue in volume["issues"]:
                # Build the API URL that lists the articles of this issue.
                issue_query = {"periodicalId": collection_query["id"][0], "issueId": issue["id"]}
                issue_url = (
                    "https://link.intlpress.com/api/bgcloud-front/periodical/issue-content-list?"
                    + urlencode(issue_query)
                )

                xissues.append(
                    self.create_xissue(
                        issue_url,
                        str(volume["fYear"]),
                        str(volume["volumeNum"]),
                        str(issue["issueNum"]),
                    )
                )
        return xissues

    def parse_issue_content(self, content, xissue):
        """Parse the issue JSON and append one article per content entry to xissue."""
        data = json.loads(content)
        for index, article_dict in enumerate(data["data"]["content"]):
            xissue.articles.append(self.parse_ipb_article(article_dict, xissue, index))

    def parse_ipb_article(self, article_dict: dict, xissue: IssueData, index: int):
        """Build an article from one entry of the IPB issue-content JSON."""
        # coverage: branch 51 ↛ 52 never taken (the condition was never true in this run)
        if not xissue.pid:
            raise ValueError("You must set xissue.pid before parsing an article")
        # coverage: branch 53 ↛ 54 never taken (the condition was never true in this run)
        if not xissue.url:
            raise ValueError("You must set xissue.url before parsing an article")

        xarticle = create_articledata()
        xarticle.lang = "en"
        xarticle.pid = xissue.pid + "_a" + str(index)
        # Normalize the DOI: strip the resolver prefix and treat an empty string as no DOI.
        xarticle.doi = article_dict["doi"].removeprefix("https://dx.doi.org/")
        if xarticle.doi == "":
            xarticle.doi = None

        ext_link = create_extlink(
            rel="source",
            location=urljoin("https://link.intlpress.com/JDetail/", article_dict["id"]),
            metadata=self.source_domain,
        )
        xarticle.ext_links.append(ext_link)
        xarticle.title_tex = article_dict["title"]
        xarticle.fpage = article_dict["startPage"] or ""
        xarticle.lpage = article_dict["endPage"] or ""

        # Keywords and MSC codes arrive as comma-separated strings.
        if article_dict["keyword"] != "":
            for kwd in article_dict["keyword"].split(","):
                xarticle.kwds.append(create_subj(type="kwd", lang="en", value=kwd))

        if article_dict["mscValue"] != "":
            for msc in article_dict["mscValue"].split(","):
                xarticle.kwds.append(create_subj(type="msc", lang="en", value=msc))

        if article_dict["fAbstract"] != "":
            xarticle.abstracts.append(
                create_abstract(lang="en", tag="abstract", value_tex=article_dict["fAbstract"])
            )

        for author in article_dict["authors"]:
            xarticle.contributors.append(
                create_contributor(
                    string_name=author["fullName"], role="author", orcid=author["orcid"]
                )
            )
        return xarticle
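
The two partial branches flagged above are the only gaps: every fixture in this run apparently sets both pid and url before parse_ipb_article executes. Below is a minimal sketch of a test that would take both raise branches. It assumes the src layout maps the file to the module crawler.by_source.ipb_crawler, that pytest is the test runner, and that a plain stand-in object is acceptable where the annotation names IssueData; FakeIssue and the pid value are hypothetical and should be adapted to the project's real fixtures.

import pytest

from crawler.by_source.ipb_crawler import IpbCrawler


def test_parse_ipb_article_requires_pid_and_url():
    # Bypass __init__: the constructor's arguments are not shown in this
    # report, so the sketch avoids depending on them.
    crawler = IpbCrawler.__new__(IpbCrawler)

    class FakeIssue:  # hypothetical stand-in for an IssueData instance
        pid = ""
        url = ""

    # Empty pid takes the 51 → 52 branch and raises.
    with pytest.raises(ValueError):
        crawler.parse_ipb_article({}, FakeIssue(), 0)

    # Non-empty pid but empty url takes the 53 → 54 branch and raises.
    issue = FakeIssue()
    issue.pid = "IPB_dummy"  # any non-empty string works; the value is arbitrary
    with pytest.raises(ValueError):
        crawler.parse_ipb_article({}, issue, 0)

If these guards really are the only misses, exercising them should bring the file to full coverage.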