I've tried get Youtube comments using Youtube Data API dan Python, but i still can't catch all the comments, as example a video have 36,000 comments but only catch 20,000 comments.
I tried it two ways, in this first way the comment retrieval process took a long time, would be faster if the ```if "nextPageToken" in response....`` is not use.
def get_comments(youtube, video_id, token): komentar = [] request = youtube.commentThreads().list(part='snippet, replies', videoId= video_id, maxResults = 10) response = request.execute() for idx, cmt in enumerate(response['items']): topLvl = dict( videoId = cmt['snippet']['videoId'], commentId = cmt['snippet']['topLevelComment']['id'], username = cmt['snippet']['topLevelComment']['snippet']['authorDisplayName'], comment = cmt['snippet']['topLevelComment']['snippet']['textDisplay'], likeCount = cmt['snippet']['topLevelComment']['snippet']['likeCount'], publishedAt = cmt['snippet']['topLevelComment']['snippet']['publishedAt']) komentar.append(topLvl) if 'replies' in response['items'][idx]: for v in range(len(response['items'][idx]['replies']['comments'])): replies1 = dict( video_id = cmt['replies']['comments'][v]['snippet']['videoId'], commentId = cmt['replies']['comments'][v]['id'], parentId = cmt['replies']['comments'][v]['snippet']['parentId'], username = cmt['replies']['comments'][v]['snippet']['authorDisplayName'], comment = cmt['replies']['comments'][v]['snippet']['textDisplay'], likeCount = cmt['replies']['comments'][v]['snippet']['likeCount'], publishedAt = cmt['replies']['comments'][v]['snippet']['publishedAt']) # print("rep 1: ", replies1) komentar.append(replies1) totalReplyCount = cmt['snippet']['totalReplyCount'] replies_response=youtube.comments().list(part='snippet',maxResults=100,parentId=cmt['id']).execute() for indx, reply in enumerate(replies_response['items']): # print("pp ", reply['snippet']['authorDisplayName']) replies2 = dict( # video_id = reply['videoId'], commentId = reply['id'], parentId = reply['snippet']['parentId'], username = reply['snippet']['authorDisplayName'], comment = reply['snippet']['textDisplay'], likeCount = reply['snippet']['likeCount'], publishedAt = reply['snippet']['publishedAt']) # print("rep 2: ", replies2) komentar.append(replies2) while "nextPageToken" in replies_response: token_reply = replies_response['nextPageToken'] replies_response=youtube.comments().list(part='snippet',maxResults=100,parentId=cmt['id'],pageToken=token_reply).execute() for indx, reply in enumerate(replies_response['items']): replies3 = dict( commentId = reply['id'], parentId = reply['snippet']['parentId'], username = reply['snippet']['authorDisplayName'], comment = reply['snippet']['textDisplay'], likeCount = reply['snippet']['likeCount'], publishedAt = reply['snippet']['publishedAt']) # print("rep 3: ", replies3) komentar.append(replies3) if "nextPageToken" in response: return get_comments(youtube, video_id, response['nextPageToken']) else: komentar = [x for x in komentar if len(x) > 0] return [] return komentarThis second method has many lines of code, even repetitive. But in this way can get more comment data
def video_comments(youtube, video_ids): all_comments = [] request = youtube.commentThreads().list(part='snippet, replies', videoId= video_id, maxResults = 100) response = request.execute() for i in range(0, len(response['items'])): data = dict( video_id = response['items'][i]['snippet']['videoId'], commentId = response['items'][i]['snippet']['topLevelComment']['id'], username = response['items'][i]['snippet']['topLevelComment']['snippet']['authorDisplayName'], comment = response['items'][i]['snippet']['topLevelComment']['snippet']['textDisplay'], like = response['items'][i]['snippet']['topLevelComment']['snippet']['likeCount'], totalReply = response['items'][i]['snippet']['totalReplyCount'], publishedAt = response['items'][i]['snippet']['topLevelComment']['snippet']['publishedAt']) if 'replies' in response['items'][i]: for v in range(len(response['items'][i]['replies']['comments'])): dataReplies = dict( video_id = response['items'][i]['replies']['comments'][v]['snippet']['videoId'], commentId = response['items'][i]['replies']['comments'][v]['id'], username = response['items'][i]['replies']['comments'][v]['snippet']['authorDisplayName'], comment = response['items'][i]['replies']['comments'][v]['snippet']['textDisplay'], like = response['items'][i]['replies']['comments'][v]['snippet']['likeCount'], publishedAt = response['items'][i]['replies']['comments'][v]['snippet']['publishedAt'] ) all_comments.append(dataReplies) all_comments.append(data) next_page_token = response.get('nextPageToken') more_pages = True while more_pages: if next_page_token is None: more_pages = False else: request = youtube.commentThreads().list(part='snippet, replies', videoId=video_id, maxResults = 100, pageToken = next_page_token) response = request.execute() for i in range(0, len(response['items'])): data = dict( video_id = response['items'][i]['snippet']['videoId'], commentId = response['items'][i]['snippet']['topLevelComment']['id'], username = response['items'][i]['snippet']['topLevelComment']['snippet']['authorDisplayName'], comment = response['items'][i]['snippet']['topLevelComment']['snippet']['textDisplay'], like = response['items'][i]['snippet']['topLevelComment']['snippet']['likeCount'], totalReply = response['items'][i]['snippet']['totalReplyCount'], publishedAt = response['items'][i]['snippet']['topLevelComment']['snippet']['publishedAt']) if 'replies' in response['items'][i]: for v in range(len(response['items'][i]['replies']['comments'])): dataReplies = dict( video_id = response['items'][i]['replies']['comments'][v]['snippet']['videoId'], commentId = response['items'][i]['replies']['comments'][v]['id'], username = response['items'][i]['replies']['comments'][v]['snippet']['authorDisplayName'], comment = response['items'][i]['replies']['comments'][v]['snippet']['textDisplay'], like = response['items'][i]['replies']['comments'][v]['snippet']['likeCount'], publishedAt = response['items'][i]['replies']['comments'][v]['snippet']['publishedAt'] ) all_comments.append(dataReplies) all_comments.append(data) next_page_token = response.get('nextPageToken') return (all_comments)Help me, this is my first time scraping data