获取用户最新视频
# 需求
获取用户最新视频信息(1000位用户)
# 分析
通过用户主页列表可获取相关信息
# 坑
- 用户多, 采用串行方式会比较慢, 所以使用并行方式
- 在使用puppeteer工具的时候, 因为访问控制的原因, 会出现一些莫名的情况, 造成没有数据返回, 导致程序暂停或异常。例如在获取其签名的时候, 没有触发request事件, promise回调一直没有完成, 因此增加了倒计时(超时)机制去解决问题
- 同一个useragent访问频繁时, puppeteer访问页面会出现空白, 什么都没有, 采用更换useragent的方式去兼容解决
- puppeteer内置了很多设备信息,刚好可以拿来用
# 相关代码
发起任务
/**
 * Walks the user list of the hard-coded city page by page, collects the
 * recent videos of every user and reports how many users and videos were seen.
 *
 * Each page returned by `cityUserList` is cut into batches; the users of one
 * batch are handled concurrently through `listByDay`, batches run one after
 * another. Every `listByDay` call receives the next index into `devices`,
 * wrapping back to 0 once the end of the list is passed.
 *
 * Any error thrown while paging is logged and ends the crawl early; the
 * summary of whatever was counted so far is still returned.
 *
 * @param day Passed through unchanged to `listByDay` for every user.
 * @returns A summary string of the form `共<users>人, <videos>条记录`.
 */
export const userLatestVideo = async (day: number) => {
  const city = '池州';
  const firstPage = 0;
  const pageSize = 10;
  const batchSize = 10;
  let userCount = 0;
  let videoCount = 0;
  let deviceIndex = 0; // index into `devices` handed to the next user
  const browserInstance = await getBrowserInstance();

  // Returns the device index for the next user and advances the cursor,
  // restarting from 0 when the cursor has run past the last device.
  const nextDeviceIndex = (): number => {
    if (deviceIndex > devices.length - 1) {
      deviceIndex = 0;
    }
    const current = deviceIndex;
    deviceIndex += 1;
    return current;
  };

  try {
    for (let page = firstPage; ; page += 1) {
      const { rows, count } = await cityUserList(city, Number(page), pageSize);
      userCount = count;

      // Cut the page into batches up front, then process them sequentially.
      const batches = [];
      for (let offset = 0; offset < rows.length; offset += batchSize) {
        batches.push(rows.slice(offset, offset + batchSize));
      }

      for (const batch of batches) {
        const pending: Promise<number>[] = batch.map(
          (user: Record<string, any>) =>
            listByDay(nextDeviceIndex(), browserInstance, user.core_user_id, day)
        );
        const counts: number[] = await Promise.all(pending);
        console.log('--------------------result', counts);
        for (const saved of counts) {
          videoCount += saved;
        }
      }

      console.log('---page', page);
      console.log('---pageSize', pageSize);
      console.log('---count', count);

      // NOTE(review): the next page is requested while page * pageSize < count;
      // whether this fetches one page too many depends on how `cityUserList`
      // numbers its pages — confirm.
      if (!(page * pageSize < count)) {
        break;
      }
    }
  } catch (e) {
    log.error('userLatestVideo-error', e);
  }

  // NOTE(review): the browser instance is deliberately not closed here
  // (`browserInstance.close()` is disabled) — confirm who owns its lifetime.
  return `共${userCount}人, ${videoCount}条记录`;
};
/**
 * Fetches one user's recent videos and stores each of them via
 * `addAuthorVideo`.
 *
 * Steps:
 *  1. pick the device at `deviceIndex` from `devices` (its `name` is used to
 *     obtain the signed request parameters, its `userAgent` for the list calls);
 *  2. obtain request parameters through `getRequestParam`; when none come back
 *     the user is skipped and 0 is returned;
 *  3. page through the video list with `loopList`;
 *  4. keep only items whose `timestamp` is greater than the cut-off
 *     (now minus `day - 1` days) and persist them one by one.
 *
 * Any error is logged and reported as 0 so that one failing user does not
 * reject the caller's `Promise.all`.
 *
 * @param deviceIndex Index into `devices` of the device to emulate.
 * @param browserInstance Browser handle forwarded to `getRequestParam`.
 * @param uid User id whose videos are fetched.
 * @param day Size of the look-back window in days (cut-off is `day - 1` days ago).
 * @returns Number of videos passed to `addAuthorVideo`; 0 on failure.
 */
export const listByDay = async (
  deviceIndex: number,
  browserInstance: any,
  uid: string,
  day: number
): Promise<number> => {
  try {
    const { name, userAgent } = devices[deviceIndex];
    const param = (await getRequestParam(browserInstance, uid, name)) as IList;
    console.log('param', !!param);
    if (!param) return 0;

    param.count = '2';
    param.max_cursor = '0';
    // moment's signature is subtract(amount, unit); the reversed order is
    // deprecated and prints a warning on every call.
    const cutoff = +moment().subtract(day - 1, 'days');
    param.timestamp = cutoff;

    const res = await loopList(param, [], userAgent);
    console.log('res', res);

    // Guard against a non-array response instead of relying on a thrown
    // TypeError further down.
    const videos: Array<Record<string, any>> = Array.isArray(res) ? res : [];
    const data = videos
      .filter(item => Number(item.timestamp) > cutoff)
      .map(item => {
        return {
          core_user_id: uid,
          aweme_id: item.aweme_id,
          aweme_type: item.aweme_type,
          cover: item.video.cover && item.video.cover.url_list[0],
          desc: item.desc || '标题为空',
          duration: item.video.duration,
          dynamicCover:
            item.video.dynamic_cover && item.video.dynamic_cover.url_list[0],
          height: item.video.height,
          statistics: JSON.stringify(item.statistics),
          create_time: formatDate('yyyy-MM-dd hh:mm:ss', item.timestamp),
          vid: item.video.vid,
          video_src: item.video.play_addr.url_list[0],
          width: item.video.width,
          play_count: item.statistics.play_count,
          share_count: item.statistics.share_count,
          comment_count: item.statistics.comment_count,
          digg_count: item.statistics.digg_count,
          forward_count: item.statistics.forward_count,
        };
      });

    // Persist sequentially, in list order.
    for (const item of data) {
      console.log('item', item);
      await addAuthorVideo(item);
    }
    return data.length;
  } catch (e) {
    console.log(e);
    return 0;
  }
};
/**
 * Pages through a user's video list and gathers every item into `result`.
 *
 * Paging ends when
 *  - a cursor other than the initial '0' is present and `param.timestamp`
 *    is greater than it (only the newest records are wanted),
 *  - a page comes back without items, or
 *  - a request throws (the error is logged and the items gathered so far
 *    are returned).
 *
 * On every page the first item is stamped with the response's `min_cursor`
 * and the second item, if any, with its `max_cursor`.
 *
 * @param param API parameters; `max_cursor` is advanced on a copy for each page.
 * @param result Array the items are appended to (mutated in place).
 * @param userAgent User-Agent forwarded to `getList`.
 * @returns The `result` array.
 */
const loopList = async (
  param: IList,
  result: Array<any>,
  userAgent: string
): Promise<any> => {
  let current = param;
  try {
    for (;;) {
      const pastCutoff =
        current.max_cursor !== '0' &&
        current.timestamp &&
        current.timestamp > current.max_cursor;
      if (pastCutoff) {
        break;
      }

      // Small pause between consecutive list requests.
      await sleep(200);
      const res = await getList(current, userAgent);

      const list = res.aweme_list;
      if (!(list && list.length)) {
        break;
      }

      list[0].timestamp = res.min_cursor;
      if (list[1]) {
        list[1].timestamp = res.max_cursor;
      }
      result.push(...list);

      current = { ...current, max_cursor: res.max_cursor };
    }
  } catch (e) {
    log.error('循环作品列表,发生错误:', e);
  }
  return result;
};
获取签名
/**
 * Opens the user's share page in an emulated device and captures the query
 * parameters (including `_signature`) of the first outgoing request whose
 * URL contains `apiUrl`.
 *
 * Requests for images, media and fonts are aborted; every other request is
 * let through. If no matching request is seen within one second, or anything
 * fails along the way, the function resolves with `undefined` instead of
 * rejecting, so callers only need a truthiness check.
 *
 * The page is closed in `finally`, so it is released on the success, timeout
 * and error paths alike rather than only when the page's `load` event fires.
 *
 * @param browser Browser handle used to open a new page.
 * @param uid User id whose share page is opened.
 * @param deviceName Key into `devices` selecting the device to emulate.
 * @returns The captured `{ user_id, sec_uid, count, max_cursor, aid,
 *          _signature, dytk }` object, or `undefined` when nothing was captured.
 */
export const getRequestParam = async (
  browser: any,
  uid: string,
  deviceName: string
): Promise<any> => {
  const blockTypes = new Set(['image', 'media', 'font']);
  let page: any;
  try {
    page = await browser.newPage();
    await page.emulate(devices[`${deviceName}`]);
    await page.setRequestInterception(true);

    return await new Promise<any>(resolve => {
      let settled = false;
      let timer: ReturnType<typeof setTimeout> | undefined;

      // Resolves at most once and cancels the pending timeout.
      const settle = (value?: any): void => {
        if (settled) return;
        settled = true;
        if (timer !== undefined) clearTimeout(timer);
        resolve(value);
      };

      // abort()/continue() reject once the page is gone; that is expected here.
      const ignoreRejection = (pending: any): void => {
        Promise.resolve(pending).catch(() => undefined);
      };

      // Intercept requests to read the signed API parameters.
      page.on('request', (interceptedRequest: any) => {
        try {
          const requestUrl: string = interceptedRequest.url();
          if (requestUrl.indexOf(apiUrl) > -1) {
            const {
              user_id,
              sec_uid,
              count,
              max_cursor,
              aid,
              _signature,
              dytk,
            } = querystring.parse(requestUrl.split('?')[1]);
            settle({
              user_id,
              sec_uid,
              count,
              max_cursor,
              aid,
              _signature,
              dytk,
            });
            return;
          }
          const shouldBlock = blockTypes.has(interceptedRequest.resourceType());
          ignoreRejection(
            shouldBlock
              ? interceptedRequest.abort()
              : interceptedRequest.continue()
          );
        } catch (e) {
          console.log(`---${uid}---getRequestParam:`, e);
          settle();
        }
      });

      // Give up after 1s without a matching request.
      timer = setTimeout(() => settle(), 1000);

      // Navigation is not awaited: the parameters are taken from an
      // intercepted request, not from the finished page. A navigation failure
      // before anything was captured ends the wait immediately.
      page
        .goto(`https://www.iesdouyin.com/share/user/${uid}`)
        .catch((e: unknown) => {
          if (settled) return;
          console.log(`---${uid}---getRequestParam:`, e);
          settle();
        });
    });
  } catch (e) {
    console.log(`---${uid}---getRequestParam:`, e);
    return undefined;
  } finally {
    if (page) {
      try {
        await page.close();
      } catch (closeError) {
        // The page may already be closed or the browser disconnected.
      }
    }
  }
};
# 总结
请求的人多且频繁, 反爬措施做得越来越厉害, 各种意想不到的情况都有可能出现, nodejs需要写很多的异常处理代码及解决方案
上次更新: 2021/12/19, 18:05:42