// Amazon_Books: browser-automation script that walks a set of Amazon book
// category listings, collects title/detail-URL pairs, then visits every detail
// page and appends one CSV line per book to cOutputFileBooks.
program Amazon_Books;
const
// Number of categories actually crawled (indices 1..cMaxCategory of ArrayCategoryURL).
// ArrayCategoryURL is declared with 10 slots, so this must not exceed 10.
cMaxCategory = 5;
// Upper limit on books collected from a single category.
cMaxBooksPerCat = 500;
// Capacity of ArrayBooks. The collecting code does not check this bound itself,
// so keep it >= cMaxCategory * cMaxBooksPerCat.
cMaxBooks = 5000;
// CSV file that receives one line of extracted details per book.
cOutputFileBooks = 'C:\AmazonBooks.csv';
// CSV file that receives "title,detail URL" per book.
// NOTE(review): presumably consumed by a separate review-extraction script - confirm.
cReviewInputFile = 'C:\AmazonTitleURL.csv';
type
// Everything the script stores about one book.
BookRecord = record
sDetailURL : string; // URL of the book's detail page (filled while scanning listings)
sTitle : string; // title, already CSV-escaped by ProcessDblQuote
sListPrice : string;
sPrice : string;
// Up to five titles taken from the detail page by GetSimilarBooks.
sCustomer1 : string;
sCustomer2 : string;
sCustomer3 : string;
sCustomer4 : string;
sCustomer5 : string;
sPublishDate : string; // text found in parentheses after 'Publisher:'
sCustomerReview : string; // rating taken from the stars image name, '-' replaced by '.'
sSalesRank : string; // text between 'Amazon.com Sales Rank:' and 'in Books', '#' removed
end;
var
// nCategoryCount   : index of the category currently being crawled
// nBookCount       : number of entries stored in ArrayBooks so far
// nPage            : listing page number within the current category
// nIndex           : index of the book whose detail page is being processed
// nBooksInCategory : books collected so far from the current category
nCategoryCount, nBookCount, nPage, nIndex, nBooksInCategory : integer;
// Holds all 10 category URLs assigned by InitCategoryURL; only the first
// cMaxCategory of them are visited.
ArrayCategoryURL: array[1..10] of string;
ArrayBooks: array[1..cMaxBooks] of BookRecord;
// Prepares a value for use as a single field of a CSV line.
//
// str    : raw field text.
// result : str unchanged when it contains no special characters; otherwise
//          str with every double quote doubled and the whole value wrapped
//          in double quotes.
//
// A field is quoted when it contains a double quote, a comma, or a line
// break. Both CR (#13) and LF (#10) count as line breaks: a bare LF inside
// an unquoted field would otherwise split the record across two CSV rows.
function ProcessDblQuote(str: string): string;
var
  bNeedsQuotes : boolean;
begin
  bNeedsQuotes := False;

  // Embedded quotes must be doubled and force the field to be quoted.
  if (Pos('"', str) > 0) then
  begin
    str := StringReplace(str, '"', '""', true, true);
    bNeedsQuotes := True;
  end;

  // Separators and line breaks are legal only inside a quoted field.
  if (Pos(#13, str) > 0) or (Pos(#10, str) > 0) or (Pos(',', str) > 0) then
    bNeedsQuotes := True;

  if bNeedsQuotes then
    result := '"' + str + '"'
  else
    result := str;
end;
// Assign the URL of the first search-result page of each of the 10 categories here, so that the script can go directly to it.
// Fills ArrayCategoryURL[1..10] with the first listing page of each book
// category. All ten slots are assigned here; how many of them are actually
// crawled is decided by cMaxCategory.
// NOTE(review): every URL embeds the same '104-6764641-2666339' path segment,
// which looks like a session identifier - confirm the URLs still resolve.
procedure InitCategoryURL;
begin
// 1) Arts & Photography->General
ArrayCategoryURL[1] := 'http://www.amazon.com/General-Art-Arts-Photography-Books/b/ref=amb_link_5/104-6764641-2666339?%5Fencoding=UTF8&node=779552';
// 2) Business & Investing->General
ArrayCategoryURL[2] := 'http://www.amazon.com/General-Business-Investing-Books/b/ref=amb_link_9/104-6764641-2666339?%5Fencoding=UTF8&node=2612';
// 3) Children's Books->Educational
ArrayCategoryURL[3] := 'http://www.amazon.com/Educational-Childrens-Books/b/ref=amb_link_9/104-6764641-2666339?%5Fencoding=UTF8&node=69800';
// 4) Literature & Fiction->General
ArrayCategoryURL[4] := 'http://www.amazon.com/General-Literature-Fiction-Books/b/ref=amb_link_9/104-6764641-2666339?%5Fencoding=UTF8&node=10125';
// 5) Mystery & Thrillers->General
ArrayCategoryURL[5] := 'http://www.amazon.com/General-Mystery-Thrillers-Books/b/ref=amb_link_3/104-6764641-2666339?%5Fencoding=UTF8&node=605116';
// 6) Health, Mind, & Body->General
ArrayCategoryURL[6] := 'http://www.amazon.com/General-Health-Mind-Body-Books/b/ref=amb_link_11/104-6764641-2666339?%5Fencoding=UTF8&node=298657';
// 7) Nonfiction->Philosophy
ArrayCategoryURL[7] := 'http://www.amazon.com/Philosophy-Nonfiction-Books/b/ref=amb_link_11/104-6764641-2666339?%5Fencoding=UTF8&node=11019';
// 8) Outdoors & Nature->Travel
ArrayCategoryURL[8] := 'http://www.amazon.com/Travel-Outdoors-Nature-Books/b/ref=amb_link_15/104-6764641-2666339?%5Fencoding=UTF8&node=290127';
// 9) Reference->General
ArrayCategoryURL[9] := 'http://www.amazon.com/General-Reference-Books/b/ref=amb_link_15/104-6764641-2666339?%5Fencoding=UTF8&node=408268';
// 10) Biography & Memoirs->People, A-Z
ArrayCategoryURL[10] := 'http://www.amazon.com/People-Biographies-Memoirs-Books/b/ref=amb_link_10/104-6764641-2666339?%5Fencoding=UTF8&node=916928';
end;
// Scans a listing-page fragment for book entries and appends each one to
// ArrayBooks (title and detail URL only).
//
// str : text of the listing area; in this script it is the value returned by
//       GetTableCellURLS.
//
// For every occurrence of sMarker the title and URL that follow it are
// extracted. The title is CSV-escaped with ProcessDblQuote and the URL is
// prefixed with 'http://www.amazon.com/'. nBookCount and nBooksInCategory are
// incremented per stored book. Scanning stops as soon as the current category
// has reached cMaxBooksPerCat.
// No check is made against cMaxBooks, the capacity of ArrayBooks.
procedure GetTitleAndURL(str : string);
var
nPos : integer;
sTitle, sUrl : string;
sMarker, sUrlStart, sUrlEnd, sTitleStart, sTitleEnd : string;
begin
// NOTE(review): the marker literals below look damaged - presumably they
// originally contained HTML tags that were lost. As written, the sMarker
// literal spans two lines, sUrlEnd is never assigned, and the sTitleEnd
// assignment has no terminating semicolon. Restore the intended markers from
// the original script before relying on this procedure.
sMarker := '
';
sUrlStart := ' | ';
sTitleStart := '';
sTitleEnd := ''
nPos := Pos(sMarker, str);
while (nPos > 0) do
begin
// Drop everything up to and including the first character of the marker,
// presumably so the next Pos call finds a later occurrence.
Delete(str, 1, nPos);
sTitle := ProcessDblQuote(ExtractSubStr(str, sTitleStart, sTitleEnd));
sUrl := 'http://www.amazon.com/' + ExtractSubStr(str, sUrlStart, sUrlEnd);
// extract only maximum desired
if (nBooksInCategory >= cMaxBooksPerCat) then
Exit;
nBookCount := nBookCount + 1;
nBooksInCategory := nBooksInCategory + 1;
ArrayBooks[nBookCount].sTitle := sTitle;
ArrayBooks[nBookCount].sDetailURL := sUrl;
nPos := Pos(sMarker, str);
end;
end;
// Extracts up to five titles from a book detail page and stores them in
// sCustomer1..sCustomer5 of ArrayBooks[nIndex].
//
// str : HTML body of the detail page.
//
// The section of interest is cut out of str first; titles are then taken one
// at a time from the front of that section. A title containing
// 'Explore similar items' is never stored. In slots 2..5 it also ends the
// scan; in slot 1 the scan simply moves on to the next slot.
// NOTE(review): the delimiter literals ('' / ' ' / '">') may have lost HTML
// tag content - confirm against the original script.
procedure GetSimilarBooks(str: string);
var
  nSlot, nEndPos : integer;
  sSection, sCandidate, sOpen, sClose : string;
begin
  sSection := ExtractSubStr(str, '', ' ');
  sOpen := '">';
  sClose := ' ';

  nSlot := 1;
  while (nSlot <= 5) do
  begin
    // Remember where this entry ends before extracting it.
    nEndPos := Pos(sClose, sSection);
    sCandidate := ProcessDblQuote(ExtractSubStr(sSection, sOpen, sClose));

    if IsPartOf('Explore similar items', sCandidate) then
    begin
      // Only slots after the first abort the scan.
      if (nSlot > 1) then
        Exit;
    end
    else if (nSlot = 1) then
      ArrayBooks[nIndex].sCustomer1 := sCandidate
    else if (nSlot = 2) then
      ArrayBooks[nIndex].sCustomer2 := sCandidate
    else if (nSlot = 3) then
      ArrayBooks[nIndex].sCustomer3 := sCandidate
    else if (nSlot = 4) then
      ArrayBooks[nIndex].sCustomer4 := sCandidate
    else
      ArrayBooks[nIndex].sCustomer5 := sCandidate;

    // Consume the entry just handled so the next pass sees the following one.
    Delete(sSection, 1, nEndPos+4);
    nSlot := nSlot + 1;
  end;
end;
// Extracts the details of the current book (ArrayBooks[nIndex]) from its
// detail page and appends them to the output files.
//
// str : HTML body of the detail page.
//
// Fills sListPrice, sPrice, sCustomer1..5 (via GetSimilarBooks), sPublishDate,
// sCustomerReview and sSalesRank, then writes
//   - one CSV line with all extracted fields to cOutputFileBooks
//   - one "title,detail URL" line to cReviewInputFile
//
// Every value written to a CSV file is passed through ProcessDblQuote so that
// a comma or quote inside a value (for example a price such as "$1,299.00")
// cannot shift the columns. Values without special characters are written
// unchanged.
// NOTE(review): several ExtractSubStr delimiter literals below (' | ', '')
// may have lost HTML tag content - confirm against the original script.
procedure GetBookDetails(str: string);
var
  sTmp, sLine : string;
begin
  ArrayBooks[nIndex].sListPrice := ProcessDblQuote(Trim(ExtractSubStr(str, ' | ', ' | ')));
  ArrayBooks[nIndex].sPrice := ProcessDblQuote(Trim(ExtractSubStr(str, '', '')));

  GetSimilarBooks(str);

  // The publish date is the parenthesised part of the publisher text.
  sTmp := Trim(ExtractSubStr(str, 'Publisher:', ''));
  ArrayBooks[nIndex].sPublishDate := ProcessDblQuote(ExtractSubStr(sTmp, '(', ')'));

  // The rating is encoded in the stars image name, e.g. "4-5" becomes "4.5".
  sTmp := Trim(ExtractSubStr(str, '/customer-reviews/stars-', '.gif"'));
  sTmp := StringReplace(sTmp, '-', '.', True, True);
  ArrayBooks[nIndex].sCustomerReview := ProcessDblQuote(sTmp);

  sTmp := ProcessDblQuote(Trim(ExtractSubStr(str, 'Amazon.com Sales Rank:', 'in Books')));
  ArrayBooks[nIndex].sSalesRank := StringReplace(sTmp, '#', '', True, True);

  // Build the CSV line; all fields are already CSV-escaped at this point.
  sLine := ArrayBooks[nIndex].sTitle + ',' +
           ArrayBooks[nIndex].sListPrice + ',' +
           ArrayBooks[nIndex].sPrice + ',' +
           ArrayBooks[nIndex].sCustomer1 + ',' +
           ArrayBooks[nIndex].sCustomer2 + ',' +
           ArrayBooks[nIndex].sCustomer3 + ',' +
           ArrayBooks[nIndex].sCustomer4 + ',' +
           ArrayBooks[nIndex].sCustomer5 + ',' +
           ArrayBooks[nIndex].sPublishDate + ',' +
           ArrayBooks[nIndex].sCustomerReview + ',' +
           ArrayBooks[nIndex].sSalesRank;

  // save to output file
  WriteToLog(cOutputFileBooks, '', sLine);

  // save to book review input file; the stored sDetailURL is left untouched
  // because OnDocumentComplete compares it with the loaded URL, so escaping is
  // applied only to the written copy.
  WriteToLog(cReviewInputFile, '', ArrayBooks[nIndex].sTitle + ',' + ProcessDblQuote(ArrayBooks[nIndex].sDetailURL));
end;
// Page-loaded handler that drives the whole crawl.
//
// URL : address of the document that has just finished loading.
//
// Three kinds of page are recognised, checked in this order:
//   1. the detail page of the current book (ArrayBooks[nIndex]):
//      extract its details, then load the next book or finish the script;
//   2. a category listing page (first page or a 'page=N' page):
//      collect titles/URLs, then go to the next listing page, the next
//      category, or - when all categories are done - the first detail page;
//   3. any other amazon.com page: jump to the current category listing.
procedure OnDocumentComplete(URL : string);
begin
  // 1. book details
  if (ArrayBooks[nIndex].sDetailURL = URL) then
  begin
    GetBookDetails(GetHTMLBody);

    // last book handled: script end
    if not (nIndex < nBookCount) then
    begin
      NewbieScriptEnd;
      ShowMessage('Data extraction completed. '+cOutputFileBooks);
      Exit;
    end;

    // otherwise continue with the next book detail
    nIndex := nIndex + 1;
    GotoURL(ArrayBooks[nIndex].sDetailURL);
    Exit;
  end;

  // 2. first page or succeeding pages of a category, eg. page=3
  if IsPartOf(ArrayCategoryURL[nCategoryCount], URL) or IsPartOf('page='+IntToStr(nPage), URL) then
  begin
    GetTitleAndURL(GetTableCellURLS(15, 0, 0));

    // more books wanted and a link to the next page exists: follow it
    if (nBooksInCategory < cMaxBooksPerCat) and HyperLinkExists(IntToStr(nPage+1)) then
    begin
      nPage := nPage + 1;
      ClickHyperLink(IntToStr(nPage));
      Exit;
    end;

    // category finished: start the next one from its first page
    if (nCategoryCount < cMaxCategory) then
    begin
      //writetolog('c:\zzz.csv', '', '');
      nPage := 1;
      nBooksInCategory := 0;
      nCategoryCount := nCategoryCount + 1;
      GotoURL(ArrayCategoryURL[nCategoryCount]);
      Exit;
    end;

    // all categories finished: now proceed to get book details
    if (nBookCount > 0) then
      GotoURL(ArrayBooks[nIndex].sDetailURL);
    Exit;
  end;

  // 3. some other Amazon page (e.g. the home page opened at start-up)
  if IsPartOf('http://www.amazon.com', URL) then
    GotoURL(ArrayCategoryURL[nCategoryCount]);
end;
// Script entry point: reset all counters, remove the output of any previous
// run, then open Amazon. Everything else happens in OnDocumentComplete.
begin
  // remove files left over from an earlier run
  DeleteFile(cOutputFileBooks);
  DeleteFile(cReviewInputFile);

  // category URLs must be in place before the first page finishes loading
  InitCategoryURL;

  // crawl position: first category, first listing page
  nCategoryCount := 1;
  nPage := 1;
  nBooksInCategory := 0;

  // book storage: nothing collected yet, detail pass starts at book 1
  nBookCount := 0;
  nIndex := 1;

  Navigate('http://www.amazon.com');
end.