# encoding: utf-8
# ruby: 2.4.2
=begin
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Copyright (c) 2018 Kevin Redon
=end
=begin
Rakefile to add revision control to 3GPP specifications.
Actions performed:
- download (missing) 3GPP specification .zip archives
- extract 3GPP specification .doc documents
- convert .doc documents to .xhtml documents
- extract text from .xhtml documents into .txt files
- put 3GPP .txt specification versions into a git repository
=end
require 'rake'
require 'rake/clean'
require 'zip'
require 'open3'
require 'nokogiri'
# print every processing step while it runs
DEBUG = true

desc "download tike document converter"
task tika: "tika-app.jar"

desc "download tike document converter"
file "tika-app.jar" do |jar|
  # the converter is fetched once; the file task is skipped when the jar already exists
  mirror_url = "http://mirror.netcologne.de/apache.org/tika/tika-app-1.17.jar"
  sh "wget --output-document=\"#{jar.name}\" \"#{mirror_url}\""
end
desc "get latest 3GPP specifications"
task :download do
  # mirror the complete archive tree into ./ftp.3gpp.org/Specs/archive
  archive_url = "ftp://ftp.3gpp.org/Specs/archive/"
  sh "wget --mirror --timeout=10 #{archive_url}"
end

# local directory holding the mirrored specification archives
SPEC_DIR = "ftp.3gpp.org/Specs/archive"
file SPEC_DIR => [:download]
desc "initialize git repository"
task git: ['.git', '.gitignore']

desc "exclude temporary files from git"
file '.gitignore' do |t|
  # one entry per line, written in exactly this order
  ignore_entries = [
    "Rakefile",
    "README",
    "tika-app.jar",
    ".listing",
    "# 3GPP archives",
    "*.zip",
    "*.ZIP",
    "# intermediate files",
    "*.doc",
    "*.xhtml",
    "*.txt",
    "!**/*.*.txt",
    "# unused 3GPP file",
    "*readme*.txt",
    "*README.txt",
    "*.DOC",
    "*.docx",
    "*.OLD",
    "*.MAR",
    "*.pdf",
    "*.xsd",
    "# specification leftovers",
    "ftp.3gpp.org/Specs/archive/23_series/23.140/schema/",
    "ftp.3gpp.org/Specs/archive/26_series/26.234/schema/",
    "ftp.3gpp.org/Specs/archive/26_series/26.444/test_sequences/",
  ]
  File.open(t.name, "w") {|ignore_file| ignore_file.puts ignore_entries}
end
desc "initialize git repository"
file '.git' => '.gitignore' do
  # the final commit also serves as first rebase parent
  setup_commands = [
    'git init',
    'git config user.name "3GPP revision control script"',
    'git config user.email ""',
    'git add --force .gitignore',
    'git commit -m "ignore source 3GPP files"',
  ]
  setup_commands.each {|command| sh command}
end
# known malformed specifications (drafts, wrong numbering/versioning) not to put in the versions list
# paths are relative to the working directory and compared verbatim against the archive path
# the list is frozen since it is only ever read
BAD_ARCHIVE = [
  # wrong versioning
  "ftp.3gpp.org/Specs/archive/30_series/30.802/30802-400error.zip",
  "ftp.3gpp.org/Specs/archive/26_series/26.132/26132d001_rev.zip",
  "ftp.3gpp.org/Specs/archive/26_series/26.115/26115d001.zip",
  "ftp.3gpp.org/Specs/archive/26_series/26.975/26975d110.zip",
  "ftp.3gpp.org/Specs/archive/50_series/50.099/50099-007duff.zip",
  "ftp.3gpp.org/Specs/archive/03_series/03.20ext/0320ext-300'.zip",
  "ftp.3gpp.org/Specs/archive/11_series/11.10-3/1110-3-4k1-.zip",
  "ftp.3gpp.org/Specs/archive/11_series/11.10-3/1110-3-4k0-.zip",
  "ftp.3gpp.org/Specs/archive/25_series/25.914/25914-a20_invalid.zip",
  "ftp.3gpp.org/Specs/archive/25_series/25.331/25331vIntermediate.zip",
  "ftp.3gpp.org/Specs/archive/25_series/25.331/25331d310.zip",
  "ftp.3gpp.org/Specs/archive/22_series/22.935/22935-110_ex_S1-050610.zip",
  "ftp.3gpp.org/Specs/archive/22_series/22.935/22935-110_ex_S1-050431.zip",
  # wrong numbering
  "ftp.3gpp.org/Specs/archive/03_series/03.71/03071-7b0.zip",
  "ftp.3gpp.org/Specs/archive/03_series/03.71/03071-890.zip",
  "ftp.3gpp.org/Specs/archive/22_series/22.960/22945-100.zip",
  "ftp.3gpp.org/Specs/archive/25_series/25.212/25211-390.zip",
  "ftp.3gpp.org/Specs/archive/25_series/25.212/25211-430.zip",
  "ftp.3gpp.org/Specs/archive/25_series/25.832/25831-002.zip",
  "ftp.3gpp.org/Specs/archive/26_series/26.973/26937-010.zip",
  "ftp.3gpp.org/Specs/archive/33_series/33.833/33806-030.zip",
  "ftp.3gpp.org/Specs/archive/25_series/25.925/25925_310.zip",
  # unhandled multi-part specification
  "ftp.3gpp.org/Specs/archive/05_series/05.10/0510-520a.zip",
  "ftp.3gpp.org/Specs/archive/05_series/05.10/0510-520b.zip",
].freeze
# 3GPP specification number (with some extensions)
# the 3GPP specification numbering comment "The first two digits define the series, followed by 2 further digits for the 01 to 13 series or 3 further digits for the 21 to 55 series." does not apply
# named groups: series, mantissa, other (letter suffix), part, subpart
SPECIFICATION_NUMBER_REGEXP = /(?<series>\d{2})\.(?<mantissa>\d{2,3})(?<other>dcs|ext|U)?(-(?<part>\d{1,2})(-(?<subpart>\d{1,2}))?)?/
# 3GPP specification number with version
# same named groups as above plus version (3 to 6 alphanumeric characters, as used in archive file names)
SPECIFICATION_VERSION_REGEXP = /(?<series>\d{2})\.?(?<mantissa>\d{2,3})(?<other>dcs|ext|U|E)?(-(?<part>\d{1,2})(-(?<subpart>\d{1,2}))?)? ?[-_](?<version>[0-9a-z]{3,6}) ?/i
# returns the (static) list of all specifications
# specification format: {
# - dir: path to directory containing all versions of a specification
# - versions: list of .txt document versions of this specification (correspond the specification version archives)
# - txt: TXT file for specification
# }
# the list is built once by scanning SPEC_DIR (series directories, then specification
# directories, then version archives) and cached in @specs; a non-empty cache is reused
# archives listed in BAD_ARCHIVE, archives with an unparsable name and archives whose
# number does not match their directory are skipped (the latter two with a message on stderr)
# raises RuntimeError when a directory in SPEC_DIR is not named like "NN_series"
def specifications()
  return @specs if @specs and !@specs.empty?
  # invoke (unlike execute) also runs the prerequisites, i.e. the :download task;
  # the SPEC_DIR file task itself has no action
  Rake::Task[SPEC_DIR].invoke unless File.directory? SPEC_DIR
  @specs = []
  Dir.foreach(SPEC_DIR) do |series_file|
    next if "."==series_file or ".."==series_file
    series_path = "#{SPEC_DIR}/#{series_file}"
    next unless File.directory? series_path
    raise "unkown series format" unless series_file =~ /^\d{2}_series$/
    Dir.foreach(series_path) do |specification_file|
      next if "."==specification_file or ".."==specification_file
      specification_path = "#{series_path}/#{specification_file}"
      next unless File.directory? specification_path
      number = specification_file.match(/^#{SPECIFICATION_NUMBER_REGEXP}$/)
      unless number then
        $stderr.puts "unkown number format: #{specification_file}"
        next
      end
      versions = []
      Dir.foreach(specification_path) do |archive_file|
        next if "."==archive_file or ".."==archive_file
        archive_path = "#{specification_path}/#{archive_file}"
        next unless File.file? archive_path
        next unless archive_file.end_with?(".zip") or archive_file.end_with?(".ZIP")
        next if BAD_ARCHIVE.include? archive_path
        version = archive_file.match(/^#{SPECIFICATION_VERSION_REGEXP}\.zip$/i)
        unless version then
          $stderr.puts "unknown version format: #{archive_file}"
          next
        end
        # compare numerically: archive names drop the dot and may pad the number differently
        same_number = number[:series].to_i==version[:series].to_i and number[:mantissa].to_i==version[:mantissa].to_i
        unless same_number then
          $stderr.puts "specification numbers do not match: #{archive_path}"
          next
        end
        # every archive is represented by the .txt document generated from it
        versions << archive_path.gsub(/zip$/i,"txt")
      end
      @specs << {dir: specification_path, txt: "#{specification_path}/#{specification_file}.txt", versions: versions}
    end
  end
  return @specs
end
desc "generate latest text specifications"
task default: specifications.map {|spec| spec[:txt]} do |_t|
  # nothing to do: all the work happens in the prerequisites
end
# convert a month string into its 2 digit string number
# the month is recognised by its three letter English abbreviation anywhere in the
# string (case insensitive); abbreviations are tried in calendar order and the first
# hit wins; "00" is returned when none matches
def month2num(month)
  abbreviations = %w[jan feb mar apr may jun jul aug sep oct nov dec]
  position = abbreviations.index {|abbreviation| /#{abbreviation}/i === month}
  position ? format("%02d", position + 1) : "00"
end
# Three part specification version (e.g. 5.0.2).
# Accepted input formats:
# - "a.b.c" with decimal parts (as extracted from a document)
# - three characters, each a base 36 digit (as used in archive file names, e.g. "a1b" is 10.1.11)
# - six decimal digits, two per part (as used in archive file names, e.g. "150300" is 15.3.0)
# Versions are ordered part by part; Comparable provides <, <=, ==, >=, >, between? on top of <=>.
class Version
  include Comparable

  # the three version parts as integers
  attr_reader :parts

  # version: version string in one of the formats listed above
  # raises RuntimeError when the format is not recognised
  def initialize(version)
    text = version.to_s
    @parts =
      if (dotted = text.match(/\A(\d+)\.(\d+)\.(\d+)\z/)) then # extracted from document
        dotted.captures.map {|part| part.to_i}
      elsif text =~ /\A[0-9a-z]{3}\z/i then # extracted from file name
        text.chars.map {|digit| digit.to_i(36)}
      elsif text =~ /\A\d{6}\z/ then # extracted from file name
        text.scan(/\d{2}/).map {|part| part.to_i}
      end
    raise "unknown version format: #{version}" unless @parts
  end

  # returns -1, 0 or 1 (parts are compared from most to least significant),
  # or nil when other does not expose version parts
  def <=>(other)
    return nil unless other.respond_to?(:parts)
    return @parts <=> other.parts
  end

  def to_s
    return "#{@parts[0]}.#{@parts[1]}.#{@parts[2]}"
  end
end
# use the document file name to get specification number and version when we could not extract if from the document itself or the extracted information does not match (the date remains unknown)
# they quite often forget to update the version number in the documents, or do it wrong
USE_FILENAME_VERSION = true
# enforce the version number for the following files (useful when it can't be extracted or the extracted information is wrong)
# key: path of the .txt version document, value: version information used instead of the document content
# only the table itself is frozen (shallow), the entries stay regular hashes
FILENAME_VERSION = {
  "ftp.3gpp.org/Specs/archive/01_series/01.04/0104-502.txt" => {series: "01", mantissa: "04", version: Version.new("5.0.2"), date: "2001-10"},
}.freeze
desc "add version control on text specification"
# Builds the specification text file (NN.NNN.txt) by committing every available version
# document, in version order, on top of each other in the git repository.
# Steps:
# 1. collect the version information of every non-empty version document
#    (from the document content, falling back on / corrected by the file name)
# 2. decide whether only newer versions have to be added or the history has to be rebuilt
# 3. copy each version over the target file, commit and tag it
rule /#{SPECIFICATION_NUMBER_REGEXP}\.txt$/ => ["git", proc{|f| specifications.collect {|spec| f==spec[:txt] ? spec[:versions] : nil}.flatten.compact}] do |t|
  puts "(re-)generating speficiation #{t.name} (including all versions)" if DEBUG
  # get the spec
  spec = specifications.find {|candidate| t.name==candidate[:txt]}
  raise "specification for #{t.name} not found" unless spec
  t.prerequisites.shift # remove git requirement
  # nothing to do when none of the version documents is newer than the target
  next if File.file?(t.name) and t.prerequisites.none? {|txt| File.mtime(txt)>File.mtime(t.name)}

  # collect specification version information
  versions = []
  spec[:versions].each do |txt|
    next unless File.size? txt
    # get version number from file name
    file = File.basename(txt)
    match = file.match(/#{SPECIFICATION_VERSION_REGEXP}\.txt$/i)
    if !match then
      $stderr.puts "malformated specification version number: #{file}"
      next
    end
    file_version = {series: match[:series], mantissa: match[:mantissa], other: match[:other], part: match[:part], subpart: match[:subpart], version: Version.new(match[:version])}
    # get version number from document content
    enforced = FILENAME_VERSION[txt] # get enforced version
    doc_version = enforced ? enforced.dup : nil # work on a copy, the entry is completed below
    text = IO.read(txt) # get document content
    unless doc_version then
      text.each_line do |line|
        # NOTE(review): the ^ anchor only applies to the GSM alternative, the other keywords match anywhere in the line
        if line =~ /^(GSM)|(UMTS)|(3G)|(3GPP)|(TS)/ then
          # current format: aa.bbb Vc.d.e (yyyy-mm)
          match = line.match(/(?<series>\d{2})\.(?<mantissa>\d{2,3})(?<other>dcs|ext|E|U)?(-(?<part>\d{1,2})(-(?<subpart>\d{1,2}))?)?\s*V(?<version>\d+\.\d+\.\d+)\s*\((?<date>\d{4}-\d{1,2})\)/i)
          if match then
            doc_version = {series: match[:series], mantissa: match[:mantissa], other: match[:other], part: match[:part], subpart: match[:subpart], version: Version.new(match[:version]), date: match[:date]}
            break
          end
          # older format: month yyyy (... aa.bb version c.d.e)
          match = line.match(/(?<month>\w+)\W+(?<year>\d{4})\s*\(.*(?<series>\d{2})\.(?<mantissa>\d{2,3})(?<other>dcs|ext|E|U)?(-(?<part>\d{1,2})(-(?<subpart>\d{1,2}))?)?\s*(V|version)\s*(?<version>\d+\.\d+\.\d+)\s*\)/i)
          if match then
            doc_version = {series: match[:series], mantissa: match[:mantissa], other: match[:other], part: match[:part], subpart: match[:subpart], version: Version.new(match[:version]), date: match[:year]+"-"+month2num(match[:month])}
            break
          end
          # older format: aa.bb version c.d.e month yyyy
          match = line.match(/(?<series>\d{2})\.(?<mantissa>\d{2,3})\W*(V|version)\W*(?<version>\d+\.\d+\.\d+)\W*(?<month>\w+)\W+(?<year>\d{4})/i)
          if match then
            doc_version = {series: match[:series], mantissa: match[:mantissa], version: Version.new(match[:version]), date: match[:year]+"-"+month2num(match[:month])}
            break
          end
        end
      end
    end
    # get version from file name if yet unknown and desired
    doc_version = file_version if !doc_version and USE_FILENAME_VERSION
    # make version verifications
    if !doc_version then
      $stderr.puts "could not find specification number in #{txt}"
      next
    end
    puts "#{txt} #{doc_version[:series]}.#{doc_version[:mantissa]} version: #{doc_version[:version]}#{doc_version[:date] ? ' ('+doc_version[:date].to_s+')' : ''}" if DEBUG
    [:version, :series, :mantissa].each do |property|
      if !doc_version[property] then
        $stderr.puts "could not find #{property} in #{txt}"
        next
      end
      if file_version[property] != doc_version[property] then
        $stderr.puts "#{property} #{file_version[property]} from #{txt} file does not match #{property} #{doc_version[property]} in document"
        if USE_FILENAME_VERSION then
          doc_version[property] = file_version[property]
        else
          next
        end
      end
    end
    [:other, :part, :subpart].each do |property|
      doc_version[property] ||= file_version[property]
      if file_version[property] != doc_version[property] then
        $stderr.puts "#{property} #{file_version[property]} from #{txt} file does not match #{property} #{doc_version[property]} in document"
        if USE_FILENAME_VERSION then
          doc_version[property] = file_version[property]
        else
          next
        end
      end
    end
    # save version information
    doc_version[:file] = txt
    versions << doc_version
  end

  # create empty file if no version is present
  if versions.empty? then
    sh "touch #{t.name}"
    next
  end
  # sort specification versions based on version parts
  versions.sort! {|a,b| a[:version]<=>b[:version]}

  # check if we just need to update the specification, or regenerate it from scratch
  regenerate = false
  target_version = nil
  if File.file?(t.name) then
    if 0==File.size(t.name) then
      regenerate = true
    else
      # sort between version older of newer than target specification
      newer, older = versions.partition {|version| File.mtime(version[:file]) > File.mtime(t.name)}
      # find which of the older specification corresponds the current target specification
      target_version = older.reverse.find do |version|
        File.size(version[:file])==File.size(t.name) and IO.read(version[:file])==IO.read(t.name)
      end
      if !target_version then # unknown current target specification version
        regenerate = true
      else
        # the history is only consistent when every older file is an older version
        # and every newer file is a newer version than the current target
        regenerate = true if older.any? {|version| version[:version] > target_version[:version]}
        regenerate = true unless newer.all? {|version| version[:version] > target_version[:version]}
        if !regenerate then
          versions = newer # only commit newer files
        end
      end
    end
  end

  # restart specification regeneration if it's just an update
  if regenerate then
    # remove file commits from git
    # redirect through spawn options: the "&>" shell syntax is bash only, a POSIX sh
    # would run the command in the background and always report success
    tracked = system("git", "ls-files", "--error-unmatch", t.name, out: File::NULL, err: File::NULL)
    if tracked then
      sh "git rm --force #{t.name}"
      sh "git commit --message='remove #{t.name} before regenerating specification'"
      `git log --oneline -- #{t.name}`.each_line do |line|
        hash = line.split(" ")[0]
        sh "git rebase --preserve-merges --onto #{hash}^ #{hash}"
      end
    end
  end

  # put versions in versioning system
  if versions.empty? then
    puts "no new specification version found for #{t.name}" if DEBUG
  end
  # "c.d.e (yyyy-mm)" label used in commit messages, the date is left out when unknown
  label = lambda {|info| "#{info[:version]}#{info[:date] ? ' ('+info[:date]+')' : ''}"}
  versions.each_index do |i|
    version = versions[i]
    cp version[:file], t.name
    # commit version
    sh "git add #{t.name}"
    spec_name = "#{version[:series]}.#{version[:mantissa]}"
    spec_name += version[:other] if version[:other]
    spec_name += "-"+version[:part] if version[:part]
    spec_name += "-"+version[:subpart] if version[:subpart]
    # the first new version follows the version currently in the target file (if known)
    previous = (0==i) ? target_version : versions[i-1]
    if !previous then
      sh "git commit --message='initial #{spec_name} version #{label.call(version)}'"
    else
      # identical consecutive versions still get their own commit
      sh "git commit --allow-empty --message='#{spec_name} version #{label.call(previous)} -> #{label.call(version)}'"
    end
    sh "git tag '#{spec_name}v#{version[:version]}'"
  end
end
desc "convert all .xhtml speficification documents into .txt text"
# depends on the .txt document of every specification version
task text: specifications.flat_map {|spec| spec[:versions]}
desc "convert .xhtml document to .txt text"
# Extracts the plain text of an XHTML document, dropping content which is unreliable or
# useless for comparing versions (title, page numbers in the table of content, images,
# embedded objects). An empty or missing XHTML document results in an empty text file.
rule '.txt' => '.xhtml' do |t|
  source = t.prerequisites[0]
  puts "converting #{source} to #{t.name}" if DEBUG
  File.open(t.name, "w") do |file|
    next unless File.size? source
    # open XHTML (block form so the file handle is closed once the document is parsed)
    doc = File.open(source) {|xhtml| Nokogiri::XML(xhtml)}
    doc.remove_namespaces!
    # remove title since it's often wrong
    doc.xpath("//head/title").each do |title|
      title.remove
    end
    # remove page number from table of content
    doc.xpath("//p[contains(@class,'tOC_')]").each do |toc|
      toc.traverse do |node|
        if node.text? then
          # trailing page number (and the white space before it)
          node.content = node.content.gsub(/\s+(\d+)?$/,'')
          # page number glued in front of the entry text
          if node.content.match(/^\d+[[:alpha:]]/) then
            node.content = node.content.gsub(/^\d+/,'')
          end
        end
      end
    end
    # remove images
    doc.xpath("//img").each do |img|
      if img["src"] and img["src"].end_with?(".wmf") then
        img.parent.remove
      else
        img.remove
      end
    end
    # remove embedded diagrams
    doc.xpath("//div[contains(@class,'embedded')]").each do |div|
      div.remove
    end
    doc.xpath("//div[contains(@class,'package-entry')]").each do |div|
      div.remove
    end
    # convert to text
    text = doc.text
    # remove heading spaces
    text.gsub!(/^[[:blank:]]+/,"")
    # remove tailing spaces
    text.gsub!(/[[:blank:]]+$/,"")
    # remove multiple newlines
    text.gsub!(/(\n){2,}/,"\n\n")
    # remove leading empty lines
    text.sub!(/\A\n+/,"")
    # fix bullet points
    text.gsub!(/^-\n/,"- ")
    text.gsub!(/\n\n- /,"\n- ")
    file.write text
  end
end
desc "convert all .doc speficification documents into .xhtml documents"
# depends on the .xhtml document of every specification version
task convert: specifications.flat_map {|spec| spec[:versions].map {|version| version.ext("xhtml")}}
# doc files not to convert because they are not supported or broken
# an empty .xhtml document is generated for them instead
# the list is frozen since it is only ever read
DOC_NOCONVERT = [
  "ftp.3gpp.org/Specs/archive/02_series/02.22/0222-110.doc",
  "ftp.3gpp.org/Specs/archive/02_series/02.95/0295-010.doc",
  "ftp.3gpp.org/Specs/archive/22_series/22.016/22016-a00.doc",
  "ftp.3gpp.org/Specs/archive/24_series/24.008/24008-530.doc",
  "ftp.3gpp.org/Specs/archive/25_series/25.321/25321-200.doc",
  "ftp.3gpp.org/Specs/archive/31_series/31.122/31122-100.doc",
].freeze
desc "convert .doc specification document to .xhtml document"
# Converts the .doc document using Tika. The .xhtml document is always created; it is left
# empty when the .doc is empty, listed in DOC_NOCONVERT, or when the conversion fails.
rule '.xhtml' => ["tika-app.jar", '.doc'] do |t|
  tika, doc = t.prerequisites
  puts "converting #{doc} to #{t.name}" if DEBUG
  # block form: the document is closed even if the conversion raises
  File.open(t.name, "w") do |xhtml|
    next if 0==File.size(doc) or DOC_NOCONVERT.include?(doc) # doc is empty or can not be converted
    stdout, stderr, status = Open3.capture3("java", "-jar", tika, "--pretty-print", "--xml", doc)
    if status.success? then
      xhtml.write stdout
    elsif stderr.include? "The supplied data appears to be in the old MS Write format" then
      puts "can't convert #{doc} to XHTML: old MS Write format is not supported"
    else
      puts "conversion from #{doc} to #{t.name} failed"
    end
  end
end
desc "extract all .doc speficification documents from .zip archives"
# depends on the .doc document of every specification version
task extract: specifications.flat_map {|spec| spec[:versions].map {|version| version.ext("doc")}}
# the actual file in the archive matching to the expected file
# key: expected .doc file name (derived from the archive name), value: name of the entry in the archive
# only consulted when the document can not be found by its number and version
# the table is frozen since it is only ever read
DOC_FILENAME = {
  "48103-132.doc" => "GP-081959 New AUP TS 48.103 v132.doc",
  "48103-130.doc" => "GP-081880 New AUP TS 48.103 v130.doc",
  "48103-100.doc" => "GP-081422 New AUP TS v003.doc",
  "48103-131.doc" => "GP-081937 New AUP TS 48.103 v131.doc",
  "28601-010.doc" => "28601-010 CN and WLAN Interworking system NRM IRP Requirements_clean.doc",
  "28900-010.doc" => "S5-176535_28.9bc-010.doc",
  "29846-030 .doc" => "29846-030 .doc",
  "21915-000.doc" => "21915-001.doc",
  "0276-801.doc" => "0276_800.doc",
  "45820-210.doc" => "TR 45 820v210_approved.doc",
  "45820-200.doc" => "TR 45 820v200_clean.doc",
  "45820-121.doc" => "TR 45 820 v121_clean.doc",
  "45820-100.doc" => "TR 45 820 v100 clean.doc",
  "45820-140.doc" => "TR 45 820v140_clean.doc",
  "28800-200.doc" => "28800-100.doc",
  "44318-010.doc" => "TS44_318_v_0_1_0.doc",
  "44933-050.doc" => "TR Seamless Support of Streaming v050.doc",
  "44933-060.doc" => "TR Seamless Support of Streaming v060 1.doc",
  "45926-010.doc" => "GP-111399 BTS Energy Savings v010_clean.doc",
  "29228-d30.doc" => "29128-d30.doc",
  "29280-100.doc" => "29xyz-100.doc",
  "29234-150.doc" => "29.234_V1.5_clean.doc",
  "29234-170.doc" => "29.234_V1.7CLEAN.doc",
  "29279-100.doc" => "29.xyz-100.doc",
  "29139-100.doc" => "29139-020.doc",
  "29281-100.doc" => "29xyz-100.doc",
  "29198-13-001.doc" => "120091_13_v001.doc",
  "33401-100.doc" => "33abc-100.doc",
  "33833-040.doc" => "33803-040.doc",
  "26347-031.doc" => "TS26 347v031-clean.doc",
  "26347-021.doc" => "TS26 347v021.doc",
  "26347-020.doc" => "TS26 347v020.doc",
  "26347-010.doc" => "TS26 347v010.doc",
  "32584-100.doc" => "32584-020_clean.doc",
  "34114-100.doc" => "TS34 114_100.doc",
  "25927-100.doc" => "TR25.927.doc",
  "25809-100.doc" => "TR25_809_v100.doc",
  "25809-200.doc" => "TR25_809_v200.doc",
  "25809-052.doc" => "TR25_809_v052.doc",
  "25822-100.doc" => "UMTS700_TR-25_822_100.doc",
  "25990-014.doc" => "25990-004.doc",
  "23722-001.doc" => "23.722-d000.doc",
  "23781-011.doc" => "TR_23.781_011_cl.docx",
  "23781-020.doc" => "23.781_020_cl.docx",
  "23810-0a0.doc" => "23810-0100-clean.doc",
  "23858-010.doc" => "TR23.856v0.1.0_cl.doc",
  "23858-020.doc" => "TR23.856v0.2.0_cl.doc",
  "23882-0a1.doc" => "23882-0100-clean.doc",
  "23917-0a0.doc" => "TR 23.917 v0100 clean.doc",
  "36839-011.doc" => "TR36 839v011.doc",
  "36839-021.doc" => "TR36 839v021.doc",
  "36853-100.doc" => "RP-140857_TR36 853_V100.doc",
  "36853-200.doc" => "RP-141217_TR36 853_V200.doc",
  "36853-220.doc" => "RP-141806_TR36 853_V220.doc",
  "36856-100.doc" => "RP-140856_TR36 856_V100.doc",
  "36889-021.doc" => "TR 36 889 v0 2 1.doc",
  "36889-030.doc" => "TR 36 889 v0 3 0.doc",
  "36942-200.doc" => "RP-080592_TR36_942_v200.doc",
  "38141-1-001.doc" => "R4-1711982 TS 38.141-1 v0.0.1.docx",
  "38141-1-010.doc" => "R4-1803913 TS 38.141-1 v0.1.0(clean).docx",
  "38141-2-001.doc" => "R4-1711983 TS 38.141-2 v0.0.1.docx",
  "38331-040.doc" => "TS38331-040 cl.docx",
  "38816-001.doc" => "TR 38.816 V010_clean.doc",
}.freeze
desc "extract .doc specification document from .zip archive"
# Extracts the specification document from its archive (.zip, or .ZIP when no .zip exists).
# The document is searched in the archive by, in this order:
# 1. its specification number and version (optionally marked as clean, .doc or .docx)
# 2. its specification number followed by the three first version characters in order
# 3. the name listed in DOC_FILENAME
# An empty .doc file is created when the archive does not contain the document.
rule '.doc' => proc{|f| File.file?(f.ext("zip")) ? f.ext("zip") : f.ext("ZIP")} do |t|
  archive = t.prerequisites[0]
  puts "extracting #{t.name} from #{archive}" if DEBUG
  # parse spec number
  doc = File.basename(t.name)
  number = doc.match(/^(?<series>\d{2})\.?(?<mantissa>\d{2,3})(?<other>dcs|ext|E|U)?(-(?<part>\d{1,2})(-(?<subpart>\d{1,2}))?)?( )?[-_](?<version>[0-9a-z]{3,6})( )?\.doc$/i)
  if number then
    # open archive (block form so it is closed again after the extraction)
    Zip::File.open(archive) do |zip|
      # find doc file in archive
      zip_file = zip.entries.select {|file| file.name =~ /#{number[:series]}\.?#{number[:mantissa]}(#{number[:other]})?[-_A-Z]?#{number[:version]}([-_]cl(ean)?)?\.doc(x)?$/i}
      zip_file = zip.entries.select {|file| file.name =~ /#{number[:series]}\.?#{number[:mantissa]}.*#{number[:version][0]}.*#{number[:version][1]}.*#{number[:version][2]}.*\.doc$/i} if zip_file.empty?
      zip_file = zip.entries.select {|file| file.name==DOC_FILENAME[doc]} if zip_file.empty?
      if zip_file.empty? then # doc file not found in archive
        $stderr.puts "#{archive} does not include #{doc}: #{zip.entries.collect{|f| f.name}.to_s}"
        File.open(t.name, "w") {} # create empty file
      else # extract doc file
        zip_file.first.extract(t.name)
      end
    end
  else
    puts "could not find number for #{doc}"
  end
end
desc "remove bogus specifications (e.g. malformated files)"
# Deletes the empty (zero size) files which were generated as placeholders: the
# specification text file and, for every version, its .txt and .doc documents.
task :clean_bogus do
  specifications.each do |spec|
    # the specification text itself, then the documents of every version
    candidates = [spec[:txt]]
    spec[:versions].each {|version| candidates << version << version.ext("doc")}
    candidates.each do |path|
      next unless File.file?(path) and 0==File.size(path)
      puts "removing bogus #{path}"
      File.delete(path)
    end
  end
end
# return the total count and size of zip archives in the directory
# sub-directories are searched recursively and the extension check ignores case
# returns [count, size in bytes], which is [0, 0] when dir is not a directory
def zip_size(dir)
  return [0, 0] unless File.directory? dir
  (Dir.entries(dir) - [".", ".."]).inject([0, 0]) do |(total_count, total_size), entry|
    path = "#{dir}/#{entry}"
    if File.file? path then
      if entry.downcase.end_with?(".zip") then
        [total_count + 1, total_size + File.size(path)]
      else
        [total_count, total_size]
      end
    elsif File.directory? path then
      sub_count, sub_size = zip_size(path)
      [total_count + sub_count, total_size + sub_size]
    else
      [total_count, total_size]
    end
  end
end
desc "counts the number of files and total size"
task :stats do
  # size in GB, truncated to MB precision
  gigabytes = lambda {|bytes| ((bytes.to_f/1000/1000).to_i.to_f)/1000}

  # all archives found in the mirror
  archive_count, archive_bytes = zip_size(SPEC_DIR)
  puts "number of .zip 3GPP archives: #{archive_count}"
  puts "total .zip 3GPP archives size: #{gigabytes.call(archive_bytes)} GB"

  # archives recognised as specification versions
  puts "number of specifications: #{specifications.size}"
  version_count = specifications.inject(0) {|sum, spec| sum + spec[:versions].size}
  puts "number of .zip specification archives: #{version_count}"
  version_bytes = 0
  specifications.flat_map {|spec| spec[:versions]}.each do |version|
    lower_case = version.ext("zip")
    version_bytes += File.size(File.file?(lower_case) ? lower_case : version.ext("ZIP"))
  end
  puts "total .zip specification archives size: #{gigabytes.call(version_bytes)} GB"

  # generated documents, empty ones are counted separately as bogus
  [".doc", ".xhtml", ".txt"].each do |extension|
    documents = specifications.flat_map {|spec| spec[:versions]}.map {|version| version.ext(extension)}
    sizes = documents.select {|path| File.file?(path)}.map {|path| File.size(path)}
    bogus = sizes.count(0)
    puts "number of #{extension} specification documents: #{sizes.size - bogus} (+ #{bogus} bogus)"
    puts "total #{extension} specification documents size: #{gigabytes.call(sizes.inject(0) {|sum, bytes| sum + bytes})} GB"
  end
end
# intermediate documents of every specification version (.txt, .xhtml and .doc files)
["txt", "xhtml", "doc"].each do |extension|
  CLEAN.include(specifications.flat_map {|spec| spec[:versions].map {|version| version.ext(extension)}})
end
# generated results: text specifications and the git repository
CLOBBER.include(specifications.map {|spec| spec[:txt]})
CLOBBER.include(".gitignore")
CLOBBER.include(".git")